In [None]:
import pandas as pd

In [None]:
import sys
sys.path.insert(0,"../python/")
import modeling_utils as mu

In [None]:
import operator
from datawand.parametrization import ParamHelper
# Build the parameter helper from the EcmlTaxi pipeline config file and the
# process arguments; every tunable below is read through `ph.get(...)`.
# NOTE(review): presumably command-line arguments override values from the
# JSON file — confirm against the datawand ParamHelper documentation.
ph = ParamHelper("../pipelines/EcmlTaxi.json",sys.argv)

In [None]:
# Read the data location, the training part identifier and the lat/lng digit
# settings (all of them appear in the input/output file names below).
data_folder, part, lat_digits, lng_digits = [
    ph.get(key) for key in ("data_folder", "part", "lat_digits", "lng_digits")
]

# Load data with generated features

In [None]:
# Training set with generated features; the file name encodes the part and
# the lng/lat digit settings.
train_csv_path = "%s/gen_train%s_lng%i_lat%i.csv" % (data_folder, part, lng_digits, lat_digits)
taxi_trips = pd.read_csv(train_csv_path)

In [None]:
# Test set with generated features; unlike the training file, its name does
# not include the part identifier.
test_csv_path = "%s/gen_test_lng%i_lat%i.csv" % (data_folder, lng_digits, lat_digits)
taxi_trips_TEST = pd.read_csv(test_csv_path)

### Exclude some features

   * the complete route information (leaving only aggregation-based features)
   * precise time information, which is not needed either

In [None]:
def remove_features_from_data(df):
    """Drop the raw route, identifier and date/time columns from ``df`` in place.

    Columns that are not present in ``df`` are skipped silently. The frame is
    modified directly and nothing is returned.
    """
    unwanted_columns = (
        "POLYLINE",
        "TRIP_LAT",
        "TRIP_LNG",
        "TRIP_ID",
        "DATE",
        "TIME",
        "TIMESTAMP",
    )
    for column in unwanted_columns:
        if column in df.columns:
            del df[column]

In [None]:
# Strip the excluded columns from both the training and the test frame
# (the helper modifies each frame in place).
for frame in (taxi_trips, taxi_trips_TEST):
    remove_features_from_data(frame)

### Replace missing data with mean

In [None]:
def fillna_with_mean(df):
    """Return a copy of ``df`` with missing numeric values replaced by the column mean.

    Means are computed over numeric columns only. Without ``numeric_only=True``
    recent pandas versions raise ``TypeError`` as soon as the frame contains a
    string column, while older versions silently skipped such columns; being
    explicit gives the same result on both. Non-numeric columns are left
    untouched, and a column that is entirely missing stays missing because its
    mean is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the numeric gaps filled.
    """
    col_means = df.mean(numeric_only=True)
    # A Series value is matched against column names, so each column is
    # filled with its own mean.
    return df.fillna(col_means,axis=0)

In [None]:
# Impute both frames; each one is filled with the means of its own columns.
taxi_trips, taxi_trips_TEST = (
    fillna_with_mean(taxi_trips),
    fillna_with_mean(taxi_trips_TEST),
)

In [None]:
# Display the columns that remain after feature removal and imputation;
# the feature lists below are derived from these names.
taxi_trips.columns

# Select features for models

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Features shared by the latitude and the longitude model.
# Wider candidate list kept for reference (currently disabled):
#base_features = ['CALL_TYPE','ORIGIN_CALL','ORIGIN_STAND','TAXI_ID','DAY_TYPE','DAY_OF_WEEK','TIME_OF_DAY','TRIP_SIZE']
base_features = ['DAY_OF_WEEK','TIME_OF_DAY','TRIP_SIZE']

In [None]:
# Build one feature list per coordinate model:
#   * base features go into both lists;
#   * any other column whose name contains "LAT" feeds the latitude model,
#     any other column whose name contains "LNG" feeds the longitude model;
#   * the destination columns are the regression targets and are never used
#     as inputs.
lat_features, lng_features = [], []

for feat in taxi_trips.columns:
    if feat in base_features:
        lat_features.append(feat)
        lng_features.append(feat)
    elif "LAT" in feat and feat not in ("DESTINATION_LAT", "DESTINATION_LAT_FULL"):
        lat_features.append(feat)
    elif "LNG" in feat and feat not in ("DESTINATION_LNG", "DESTINATION_LNG_FULL"):
        lng_features.append(feat)

# Call-form print: same output under Python 2, and valid syntax under Python 3.
print(lat_features)
print(lng_features)

In [None]:
# Exclude the DEPARTURE_*_FULL columns from the model inputs, if they were
# picked up by the name-based selection. The lists are edited in place.
for feature_list, excluded in (
    (lat_features, "DEPARTURE_LAT_FULL"),
    (lng_features, "DEPARTURE_LNG_FULL"),
):
    if excluded in feature_list:
        feature_list.remove(excluded)

In [None]:
# Gradient boosting hyper-parameters shared by both coordinate models.
num_trees, depth = ph.get("num_trees"), ph.get("depth")

## Train latitude learner

In [None]:
# Fit the latitude regressor. The pipeline flag chooses between the
# DESTINATION_LAT_FULL and the DESTINATION_LAT target column.
lat_clf = GradientBoostingRegressor(n_estimators=num_trees,max_depth=depth)
lat_target = "DESTINATION_LAT_FULL" if ph.get("gbt_use_exact_lat") else "DESTINATION_LAT"
lat_clf.fit(taxi_trips[lat_features],taxi_trips[lat_target])

In [None]:
# Map every latitude feature to its importance in the fitted model, then
# show the features ranked from most to least important.
lat_importances = dict(zip(lat_features,lat_clf.feature_importances_))
lat_ranking = sorted(lat_importances.items(), key=lambda item: item[1], reverse=True)
pd.DataFrame(lat_ranking,columns=["name","importance"])

## Train longitude learner

In [None]:
# Fit the longitude regressor. The pipeline flag chooses between the
# DESTINATION_LNG_FULL and the DESTINATION_LNG target column.
lng_clf = GradientBoostingRegressor(n_estimators=num_trees,max_depth=depth)
lng_target = "DESTINATION_LNG_FULL" if ph.get("gbt_use_exact_lng") else "DESTINATION_LNG"
lng_clf.fit(taxi_trips[lng_features],taxi_trips[lng_target])

In [None]:
# Map every longitude feature to its importance in the fitted model, then
# show the features ranked from most to least important.
lng_importances = dict(zip(lng_features,lng_clf.feature_importances_))
lng_ranking = sorted(lng_importances.items(), key=lambda item: item[1], reverse=True)
pd.DataFrame(lng_ranking,columns=["name","importance"])

## Saving feature importances

In [None]:
# Combine the importances of both models into one table.
# `list(...)` materialises the (name, weight) pairs: on Python 3 `.items()`
# is a view object, which not every pandas version accepts as constructor
# input, while a plain list of tuples works everywhere.
lat_imp_df = pd.DataFrame(list(lat_importances.items()),columns=["name","weight"])
lng_imp_df = pd.DataFrame(list(lng_importances.items()),columns=["name","weight"])
# Features used by both models (the base features) get the average of their
# two weights; features used by a single model keep that model's weight.
imp_df = pd.concat([lat_imp_df,lng_imp_df]).groupby("name").mean()
imp_df = pd.DataFrame(imp_df).reset_index()
# Display only: sort_values returns a new frame, so `imp_df` itself keeps the
# name ordering produced by the groupby.
imp_df.sort_values("weight",ascending=False)

In [None]:
# Persist the merged feature importances next to the input data; the file
# name carries the part and the lng/lat digit settings.
importances_csv_path = "%s/gbt_feature_importances%s_lng%i_lat%i.csv"  % (data_folder, part, lng_digits, lat_digits)
imp_df.to_csv(importances_csv_path,index=False)

## Evaluation

In [None]:
from haversine import haversine

#### Using original destination

In [None]:
# Evaluate both fitted models on the training and the test frame with the
# helper's default target setting; each call returns an error value and a
# frame of predictions.
train_haversine, train_pred_df = mu.eval_gbt_models(taxi_trips,lat_clf,lng_clf,lat_features,lng_features)
test_haversine, test_pred_df = mu.eval_gbt_models(taxi_trips_TEST,lat_clf,lng_clf,lat_features,lng_features)
# Call-form print: same output under Python 2, and valid syntax under Python 3.
print("test: %f, train: %f (Haversine distance)" % (test_haversine, train_haversine))

In [None]:
# Optionally write the train/test predictions to CSV files in the data folder.
if ph.get("export_prediction"):
    name_args = (data_folder, part, lng_digits, lat_digits)
    train_pred_path = "%s/gbt_model_pred_train%s_lng%i_lat%i.csv"  % name_args
    test_pred_path = "%s/gbt_model_pred_test%s_lng%i_lat%i.csv"  % name_args
    train_pred_df.to_csv(train_pred_path,index=False)
    test_pred_df.to_csv(test_pred_path,index=False)
    print("Predictions were exported")

#### Using rounded destination

# Same evaluation as above, but with `use_original_trg=False`; the returned
# prediction frames are not needed here and are discarded.
train_haversine, _ = mu.eval_gbt_models(taxi_trips,lat_clf,lng_clf,lat_features,lng_features,use_original_trg=False)
test_haversine, _ = mu.eval_gbt_models(taxi_trips_TEST,lat_clf,lng_clf,lat_features,lng_features,use_original_trg=False)
# Call-form print: same output under Python 2, and valid syntax under Python 3.
print("test: %f, train: %f (Haversine distance)" % (test_haversine, train_haversine))