In [115]:
import json
import os
from os import path

import matplotlib
import pandas as pd

# 1. Data

In [3]:
# Folder holding the challenge's model-build input files. Set the
# ALMRRC_TRAINING_DATA_DIR environment variable to point the notebook at another
# copy of the data; the original local path is kept as the fallback so existing
# runs behave exactly as before.
training_data_folder = os.environ.get(
    "ALMRRC_TRAINING_DATA_DIR",
    "/Users/gokhanceyhan/work/data/almrrc2021/almrrc2021-data-training/model_build_inputs",
)

In [11]:
# Read the file explicitly as UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open(path.join(training_data_folder, "route_data.json"), encoding="utf-8") as f:
    route_data_tr_json = json.load(f)
# Transpose so that each top-level JSON key becomes a row of the frame.
route_data_tr = pd.DataFrame.from_records(route_data_tr_json).transpose()

In [31]:
# Read the file explicitly as UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open(path.join(training_data_folder, "travel_times.json"), encoding="utf-8") as f:
    travel_times_tr_json = json.load(f)
# Transpose so that each top-level JSON key becomes a row of the frame.
travel_times_tr = pd.DataFrame.from_records(travel_times_tr_json).transpose()

In [93]:
# Read the file explicitly as UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open(path.join(training_data_folder, "actual_sequences.json"), encoding="utf-8") as f:
    actual_sequences_tr_json = json.load(f)
# Transpose so that each top-level JSON key becomes a row of the frame.
actual_sequences_tr = pd.DataFrame.from_records(actual_sequences_tr_json).transpose()

In [92]:
# Read the file explicitly as UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open(path.join(training_data_folder, "package_data.json"), encoding="utf-8") as f:
    package_data_tr_json = json.load(f)
# Transpose so that each top-level JSON key becomes a row of the frame.
package_data_tr = pd.DataFrame.from_records(package_data_tr_json).transpose()

# 2. Feature generation

## 2.1. Instance features

- Stops
    - (Done) Number of stops
    - (Done) Number of unique zones
    - Density of stops (?)
- Travel times
    - (Done) Descriptive stats of inter-stop travel times
- Packages
    - (Done) Number of packages
    - (Done) Number of packages delivered
    - (Done) Number of packages with a time window
    - Descriptive stats of time windows
    - Total/max planned service time
    - Capacity utilization

### Stops

In [64]:
class StopFeatures:
    """Builds stop-related features, one row per entry of ``route_stops``.

    ``route_stops`` is a pandas Series whose values are mappings with one entry
    per stop; each stop entry must provide a ``"zone_id"`` key.
    """

    def __init__(self, route_stops) -> None:
        self._route_stops = route_stops

    def generate(self):
        """Return a DataFrame indexed like ``route_stops`` with the stop features."""
        extractors = {
            "stops.num_stops": StopFeatures._num_stops,
            "stops.num_zones": StopFeatures._num_zones,
        }
        result = pd.DataFrame(data=[], index=self._route_stops.index)
        # Columns are added in declaration order: stop count, then zone count.
        for column, extractor in extractors.items():
            result[column] = self._route_stops.apply(extractor)
        return result

    @classmethod
    def _num_stops(cls, stops_json):
        """Count the entries (stops) of one route's stop mapping."""
        stop_keys = stops_json.keys()
        return len(stop_keys)

    @classmethod
    def _num_zones(cls, stops_json):
        """Count the distinct ``zone_id`` values among one route's stops."""
        distinct_zones = {stop["zone_id"] for stop in stops_json.values()}
        return len(distinct_zones)
        

In [65]:
# Build the stop features from the "stops" column of the route data.
stop_features = StopFeatures(route_stops=route_data_tr["stops"])
stop_features_df = stop_features.generate()

In [66]:
# Preview the first rows of the stop features.
stop_features_df.head()

Unnamed: 0,stops.num_stops,stops.num_zones
RouteID_00143bdd-0a6b-49ec-bb35-36593d303e77,119,31
RouteID_0016bc70-cb8d-48b0-aa55-8ee50bdcdb59,106,12
RouteID_001948e9-4675-486d-9ec5-912fd8e0770f,128,11
RouteID_001b4ee3-c4f2-467f-932b-c85524d1021f,142,18
RouteID_0021a2aa-780f-460d-b09a-f301709e2523,155,27


### Travel times

In [55]:
class TravelTimeFeatures:
    """Builds descriptive statistics of the travel times stored in each row.

    ``travel_times_df`` holds, per row, cells that are either missing or a
    mapping whose values are travel times.
    """

    def __init__(self, travel_times_df) -> None:
        self._travel_times_df = travel_times_df

    def generate(self):
        """Return a DataFrame with the mean, max and std of each row's travel times."""
        # One (mean, max, std) tuple per row of the travel-time table.
        stats = self._travel_times_df.apply(TravelTimeFeatures._avg_travel_time, axis=1)
        features_df = pd.DataFrame(data=[], index=self._travel_times_df.index)
        column_names = ("travel_time.avg", "travel_time.max", "travel_time.std")
        for position, column in enumerate(column_names):
            # Bind the position as a default argument so each column picks its own slot.
            features_df[column] = stats.apply(lambda triple, k=position: triple[k])
        return features_df

    @classmethod
    def _avg_travel_time(cls, route_travel_times):
        """Return ``(mean, max, std)`` over every travel time in one row.

        Missing cells are skipped; the values of all remaining mappings are pooled.
        """
        present = route_travel_times[route_travel_times.notna()]
        pooled = pd.Series([t for times in present for t in times.values()])
        return pooled.mean(), pooled.max(), pooled.std()


In [61]:
# Build the travel-time statistics from the travel-times table.
travel_time_features = TravelTimeFeatures(travel_times_df=travel_times_tr)
travel_time_features_df = travel_time_features.generate()

In [62]:
# Preview the first rows of the travel-time features.
travel_time_features_df.head()

Unnamed: 0,travel_time.avg,travel_time.max,travel_time.std
RouteID_00143bdd-0a6b-49ec-bb35-36593d303e77,234.724815,2011.9,221.381663
RouteID_0016bc70-cb8d-48b0-aa55-8ee50bdcdb59,250.400169,2520.7,310.63552
RouteID_001948e9-4675-486d-9ec5-912fd8e0770f,336.257318,2882.6,330.339727
RouteID_001b4ee3-c4f2-467f-932b-c85524d1021f,300.221752,2382.3,258.838491
RouteID_0021a2aa-780f-460d-b09a-f301709e2523,425.770027,2095.7,226.664698


### Packages

In [73]:
class PackageFeatures:
    """Builds package counts for each row of ``packages_df``.

    Each cell of ``packages_df`` is either missing or a mapping with one entry
    per package; package entries provide ``"scan_status"`` and ``"time_window"``.
    """

    def __init__(self, packages_df) -> None:
        self._packages_df = packages_df

    def generate(self):
        """Return a DataFrame with total, delivered and time-windowed package counts."""
        counters = (
            ("packages.total", PackageFeatures._num_packages),
            ("packages.delivered", PackageFeatures._num_delivered),
            ("packages.time_window", PackageFeatures._num_time_window),
        )
        features_df = pd.DataFrame(data=[], index=self._packages_df.index)
        for column, counter in counters:
            features_df[column] = self._packages_df.apply(counter, axis=1)
        return features_df

    @classmethod
    def _num_packages(cls, route_stops):
        """Count all packages across the non-missing cells of one row."""
        present = route_stops[route_stops.notna()]
        return sum(len(packages.keys()) for packages in present)

    @classmethod
    def _num_delivered(cls, route_stops):
        """Count the packages whose ``scan_status`` equals ``"DELIVERED"``."""
        present = route_stops[route_stops.notna()]
        delivered = 0
        for packages in present:
            for package in packages.values():
                if package["scan_status"] == "DELIVERED":
                    delivered += 1
        return delivered

    @classmethod
    def _num_time_window(cls, route_stops):
        """Count the packages whose time window has string start and end times."""
        present = route_stops[route_stops.notna()]
        with_window = 0
        for packages in present:
            for package in packages.values():
                window = package["time_window"]
                # A bound counts as set only when it is a string.
                if isinstance(window["start_time_utc"], str) and isinstance(window["end_time_utc"], str):
                    with_window += 1
        return with_window
    

In [74]:
# Build the package-count features from the package data table.
package_features = PackageFeatures(package_data_tr)
package_features_df = package_features.generate()

In [75]:
# Preview the first rows of the package features.
package_features_df.head()

Unnamed: 0,packages.total,packages.delivered,packages.time_window
RouteID_00143bdd-0a6b-49ec-bb35-36593d303e77,276,274,55
RouteID_0016bc70-cb8d-48b0-aa55-8ee50bdcdb59,199,196,17
RouteID_001948e9-4675-486d-9ec5-912fd8e0770f,212,212,25
RouteID_001b4ee3-c4f2-467f-932b-c85524d1021f,241,241,21
RouteID_0021a2aa-780f-460d-b09a-f301709e2523,219,218,1


## 2.2. Performance features

- (Done) Total travel time
- (Done) Max travel time
- Total delivery delay
- Max delivery delay
- Number of zone changes
- Number of revisits (?)
- Total route duration (?)

In [120]:
class PerformanceFeatures:
    """Builds travel-time criteria of the stop sequence recorded for each route.

    Parameters
    ----------
    sequences : pandas.Series
        One value per route: a mapping from stop to its position in the sequence.
        The Series must be named ``"actual"`` and have an unnamed index, because
        the code looks up the ``"actual"`` and ``"index"`` columns produced by
        ``reset_index``.
    travel_times_df : pandas.DataFrame
        Indexed like ``sequences``; ``travel_times_df.loc[route][from_stop][to_stop]``
        must yield the travel time between two stops of that route.
    """

    def __init__(self, sequences, travel_times_df) -> None:
        # Keep the row label both as the index and as an "index" column so the
        # row-wise apply in generate() can read it from each row.
        self._sequences = sequences.reset_index(drop=False).set_index("index", drop=False)
        self._travel_times_df = travel_times_df

    def generate(self):
        """Return a DataFrame with the sum and max of consecutive-stop travel times."""
        features_df = pd.DataFrame(data=[], index=self._sequences.index)
        # One (sum, max) tuple per route.
        leg_stats = self._sequences.apply(self._total_travel_time, axis=1)
        features_df["criterion.travel_time.sum"] = leg_stats.apply(lambda t: t[0])
        features_df["criterion.travel_time.max"] = leg_stats.apply(lambda t: t[1])
        return features_df

    def _total_travel_time(self, sequence):
        """Return ``(sum, max)`` of travel times between consecutive stops.

        A sequence with fewer than two stops has no legs; it yields ``(0, 0)``
        instead of raising from ``max()`` on an empty list.
        """
        route_travel_times = self._travel_times_df.loc[sequence["index"]]
        positions = sequence["actual"]
        # Order the stops by their position in the sequence.
        ordered_stops = sorted(positions, key=positions.get)
        legs = zip(ordered_stops, ordered_stops[1:])
        travel_times = [route_travel_times[origin][destination] for origin, destination in legs]
        return sum(travel_times), max(travel_times, default=0)

        

In [121]:
# Build the performance criteria from the "actual" sequences and the travel times.
performance_measures = PerformanceFeatures(actual_sequences_tr["actual"], travel_times_tr)
performance_measures_df = performance_measures.generate()

In [122]:
# Preview the first rows of the performance criteria.
performance_measures_df.head()

Unnamed: 0_level_0,criterion.travel_time.sum,criterion.travel_time.max
index,Unnamed: 1_level_1,Unnamed: 2_level_1
RouteID_00143bdd-0a6b-49ec-bb35-36593d303e77,7754.1,1509.7
RouteID_0016bc70-cb8d-48b0-aa55-8ee50bdcdb59,6647.8,2397.3
RouteID_001948e9-4675-486d-9ec5-912fd8e0770f,9897.4,2802.3
RouteID_001b4ee3-c4f2-467f-932b-c85524d1021f,9491.5,2045.2
RouteID_0021a2aa-780f-460d-b09a-f301709e2523,13523.7,1803.5


### Features Data Frame

In [111]:
# Combine all feature tables on their index, starting from the stop features.
# merge() defaults to an inner join, so only rows present in every table are kept.
features_df = stop_features_df
for feature_block_df in (travel_time_features_df, package_features_df, performance_measures_df):
    features_df = features_df.merge(feature_block_df, left_index=True, right_index=True)

In [112]:
# Preview the first rows of the combined feature table.
features_df.head()

Unnamed: 0,stops.num_stops,stops.num_zones,travel_time.avg,travel_time.max,travel_time.std,packages.total,packages.delivered,packages.time_window,criterion.total_travel_time
RouteID_00143bdd-0a6b-49ec-bb35-36593d303e77,119,31,234.724815,2011.9,221.381663,276,274,55,7754.1
RouteID_0016bc70-cb8d-48b0-aa55-8ee50bdcdb59,106,12,250.400169,2520.7,310.63552,199,196,17,6647.8
RouteID_001948e9-4675-486d-9ec5-912fd8e0770f,128,11,336.257318,2882.6,330.339727,212,212,25,9897.4
RouteID_001b4ee3-c4f2-467f-932b-c85524d1021f,142,18,300.221752,2382.3,258.838491,241,241,21,9491.5
RouteID_0021a2aa-780f-460d-b09a-f301709e2523,155,27,425.770027,2095.7,226.664698,219,218,1,13523.7


In [113]:
# Summary statistics of every column of the combined feature table.
features_df.describe()

Unnamed: 0,stops.num_stops,stops.num_zones,travel_time.avg,travel_time.max,travel_time.std,packages.total,packages.delivered,packages.time_window,criterion.total_travel_time
count,6112.0,6112.0,6112.0,6112.0,6112.0,6112.0,6112.0,6112.0,6112.0
mean,147.991983,21.039758,361.220321,2187.091361,252.843044,238.41214,236.604876,18.650687,10784.274427
std,31.033653,5.513901,96.744614,647.570266,67.825991,30.979955,31.381523,15.260114,2651.629121
min,33.0,5.0,155.101369,621.7,104.063827,150.0,128.0,1.0,2650.7
25%,129.0,18.0,295.034598,1704.525,202.44058,217.0,215.0,7.0,9138.4
50%,151.0,21.0,345.917123,2169.05,245.786486,239.0,238.0,15.0,11027.9
75%,170.0,24.0,408.896993,2581.25,294.611789,262.0,261.0,27.0,12623.325
max,238.0,48.0,1045.083064,5751.2,705.973971,304.0,299.0,102.0,20112.2
