In [1]:
import pandas as pd

In [2]:
import sys
sys.path.insert(0,"../python/")
import modeling_utils as mu

In [3]:
import operator
from datawand.parametrization import ParamHelper
ph = ParamHelper("../pipelines/EcmlTaxi.json",sys.argv)

Default parameters:
{u'data_folder': u'/home/fberes/DATA/taxi-challange/', u'gbt_use_exact_lat': False, u'lng_digits': 2, u'lat_digits': 3, u'is_test': False, u'part': u'_50000', u'gbt_use_exact_lng': False}
Custom parameters:
{u'depth': 5, u'export_prediction': True, u'num_trees': 40}


In [4]:
# Read the run configuration in one pass: the data directory, the training
# subset suffix, and the lat/lng digit counts that appear in the file names.
data_folder, part, lat_digits, lng_digits = [
    ph.get(key) for key in ("data_folder", "part", "lat_digits", "lng_digits")
]

Using default parameter: data_folder=/home/fberes/DATA/taxi-challange/
Using default parameter: part=_50000
Using default parameter: lat_digits=3
Using default parameter: lng_digits=2


# Load data with generated features

In [5]:
# Training frame: the file name encodes the subset suffix and both digit counts.
train_csv_path = "%s/gen_train%s_lng%i_lat%i.csv" % (data_folder, part, lng_digits, lat_digits)
taxi_trips = pd.read_csv(train_csv_path)

In [6]:
# Test frame: unlike the training file, its name carries no subset suffix.
test_csv_path = "%s/gen_test_lng%i_lat%i.csv" % (data_folder, lng_digits, lat_digits)
taxi_trips_TEST = pd.read_csv(test_csv_path)

### Exclude some features

   * the complete route information (leaving only aggregation-based features)
   * precise time information, which is not needed either

In [7]:
def remove_features_from_data(df):
    """Drop raw route, identifier and timestamp columns from ``df`` in place.

    Columns that are not present are skipped silently. The frame is mutated
    and nothing is returned.
    """
    unwanted_columns = (
        "POLYLINE",
        "TRIP_LAT",
        "TRIP_LNG",
        "TRIP_ID",
        "DATE",
        "TIME",
        "TIMESTAMP",
    )
    for column_name in unwanted_columns:
        if column_name in df.columns:
            del df[column_name]

In [8]:
# Strip the excluded columns from both frames (each is modified in place).
for frame in (taxi_trips, taxi_trips_TEST):
    remove_features_from_data(frame)

### Replace missing values with the column mean

In [9]:
def fillna_with_mean(df):
    """Return a copy of ``df`` with missing numeric values replaced by the column mean.

    Means are computed over numeric columns only. Non-numeric columns are
    returned unchanged, because they have no mean to fill with. The input
    frame is not modified.
    """
    # numeric_only=True: recent pandas raises TypeError when asked for the
    # mean of a frame that contains string columns, instead of skipping them.
    col_means = df.mean(numeric_only=True)
    return df.fillna(col_means)

In [10]:
# Impute missing values in both frames; each name is rebound to the filled copy.
# NOTE(review): fillna_with_mean computes the means from the frame it is given,
# so the test frame is filled with its own column means rather than with the
# training means - confirm this is intended.
taxi_trips = fillna_with_mean(taxi_trips)
taxi_trips_TEST = fillna_with_mean(taxi_trips_TEST)

In [11]:
# Show which columns remain in the training frame after removal and imputation.
taxi_trips.columns

Index([u'CALL_TYPE', u'ORIGIN_CALL', u'ORIGIN_STAND', u'TAXI_ID', u'DAY_TYPE',
       u'MISSING_DATA', u'DESTINATION_LAT_FULL', u'DESTINATION_LNG_FULL',
       u'DESTINATION_LAT', u'DESTINATION_LNG', u'DEPARTURE_LAT_FULL',
       u'DEPARTURE_LNG_FULL', u'DEPARTURE_LAT', u'DEPARTURE_LNG', u'TRIP_SIZE',
       u'TRIP_LAT_UNIQUE', u'TRIP_LAT_UNIQUE_RATIO', u'TRIP_LNG_UNIQUE',
       u'TRIP_LNG_UNIQUE_RATIO', u'TRIP_LAT_MIN', u'TRIP_LAT_MAX',
       u'TRIP_LAT_MEAN', u'TRIP_LAT_MEDIAN', u'TRIP_LAT_STD', u'TRIP_LNG_MIN',
       u'TRIP_LNG_MAX', u'TRIP_LNG_MEAN', u'TRIP_LNG_MEDIAN', u'TRIP_LNG_STD',
       u'TRIP_LAT_MIN_DIFF', u'TRIP_LNG_MIN_DIFF', u'TRIP_LAT_MAX_DIFF',
       u'TRIP_LNG_MAX_DIFF', u'TRIP_LAT_MEAN_DIFF', u'TRIP_LNG_MEAN_DIFF',
       u'TRIP_LAT_MEDIAN_DIFF', u'TRIP_LNG_MEDIAN_DIFF', u'DAY_OF_WEEK',
       u'TIME_OF_DAY', u'CALL_TYPE_onehot_dispatch',
       u'CALL_TYPE_onehot_driver', u'CALL_TYPE_onehot_other'],
      dtype='object')

# Select features for models

In [12]:
from sklearn.ensemble import GradientBoostingRegressor

In [13]:
# Features fed to BOTH the latitude and the longitude model.
# A wider candidate list is kept below for reference; it is currently disabled.
#base_features = ['CALL_TYPE','ORIGIN_CALL','ORIGIN_STAND','TAXI_ID','DAY_TYPE','DAY_OF_WEEK','TIME_OF_DAY','TRIP_SIZE']
base_features = ['DAY_OF_WEEK','TIME_OF_DAY','TRIP_SIZE']

In [14]:
# Split the columns into per-model feature lists:
#   * base features go to both models,
#   * columns whose name contains "LAT" / "LNG" go to the matching model,
#   * the destination columns are the prediction targets and are never inputs.
lat_targets = ("DESTINATION_LAT", "DESTINATION_LAT_FULL")
lng_targets = ("DESTINATION_LNG", "DESTINATION_LNG_FULL")

lat_features, lng_features = [], []

for feat in taxi_trips.columns:
    if feat in base_features:
        lat_features.append(feat)
        lng_features.append(feat)
    elif "LAT" in feat and feat not in lat_targets:
        lat_features.append(feat)
    elif "LNG" in feat and feat not in lng_targets:
        lng_features.append(feat)

# Parenthesised single-argument print behaves the same as a Python 2 statement
# and as a Python 3 function call.
print(lat_features)
print(lng_features)

['DEPARTURE_LAT_FULL', 'DEPARTURE_LAT', 'TRIP_SIZE', 'TRIP_LAT_UNIQUE', 'TRIP_LAT_UNIQUE_RATIO', 'TRIP_LAT_MIN', 'TRIP_LAT_MAX', 'TRIP_LAT_MEAN', 'TRIP_LAT_MEDIAN', 'TRIP_LAT_STD', 'TRIP_LAT_MIN_DIFF', 'TRIP_LAT_MAX_DIFF', 'TRIP_LAT_MEAN_DIFF', 'TRIP_LAT_MEDIAN_DIFF', 'DAY_OF_WEEK', 'TIME_OF_DAY']
['DEPARTURE_LNG_FULL', 'DEPARTURE_LNG', 'TRIP_SIZE', 'TRIP_LNG_UNIQUE', 'TRIP_LNG_UNIQUE_RATIO', 'TRIP_LNG_MIN', 'TRIP_LNG_MAX', 'TRIP_LNG_MEAN', 'TRIP_LNG_MEDIAN', 'TRIP_LNG_STD', 'TRIP_LNG_MIN_DIFF', 'TRIP_LNG_MAX_DIFF', 'TRIP_LNG_MEAN_DIFF', 'TRIP_LNG_MEDIAN_DIFF', 'DAY_OF_WEEK', 'TIME_OF_DAY']


In [15]:
# Remove the *_FULL departure coordinate from each model's inputs, if present.
for feature_list, exact_column in (
    (lat_features, "DEPARTURE_LAT_FULL"),
    (lng_features, "DEPARTURE_LNG_FULL"),
):
    if exact_column in feature_list:
        feature_list.remove(exact_column)

In [16]:
# Ensemble size and per-tree depth shared by the latitude and longitude models.
num_trees, depth = ph.get("num_trees"), ph.get("depth")

## Train latitude learner

In [17]:
# Latitude regressor. The "gbt_use_exact_lat" flag selects the target column:
# DESTINATION_LAT_FULL when set, DESTINATION_LAT otherwise.
lat_clf = GradientBoostingRegressor(n_estimators=num_trees, max_depth=depth)
lat_target = "DESTINATION_LAT_FULL" if ph.get("gbt_use_exact_lat") else "DESTINATION_LAT"
lat_clf.fit(taxi_trips[lat_features], taxi_trips[lat_target])

Using default parameter: gbt_use_exact_lat=False


In [18]:
# Map each latitude feature to its importance, then display the ranking
# from most to least important.
lat_importances = dict(zip(lat_features, lat_clf.feature_importances_))
lat_ranking = sorted(lat_importances.items(), key=operator.itemgetter(1), reverse=True)
pd.DataFrame(lat_ranking, columns=["name", "importance"])

Unnamed: 0,name,importance
0,TRIP_LAT_MIN,0.197597
1,TRIP_LAT_MEAN,0.159804
2,TRIP_LAT_MAX,0.146109
3,TRIP_LAT_MAX_DIFF,0.121821
4,DEPARTURE_LAT,0.08949
5,TRIP_LAT_MIN_DIFF,0.075511
6,TRIP_LAT_STD,0.045191
7,TRIP_LAT_MEDIAN,0.037422
8,TRIP_LAT_UNIQUE,0.033655
9,TRIP_LAT_UNIQUE_RATIO,0.033586


## Train longitude learner

In [19]:
# Longitude regressor. The "gbt_use_exact_lng" flag selects the target column:
# DESTINATION_LNG_FULL when set, DESTINATION_LNG otherwise.
lng_clf = GradientBoostingRegressor(n_estimators=num_trees, max_depth=depth)
lng_target = "DESTINATION_LNG_FULL" if ph.get("gbt_use_exact_lng") else "DESTINATION_LNG"
lng_clf.fit(taxi_trips[lng_features], taxi_trips[lng_target])

Using default parameter: gbt_use_exact_lng=False


In [20]:
# Map each longitude feature to its importance, then display the ranking
# from most to least important.
lng_importances = dict(zip(lng_features, lng_clf.feature_importances_))
lng_ranking = sorted(lng_importances.items(), key=operator.itemgetter(1), reverse=True)
pd.DataFrame(lng_ranking, columns=["name", "importance"])

Unnamed: 0,name,importance
0,TRIP_LNG_MEAN,0.352347
1,TRIP_LNG_MAX,0.113811
2,TRIP_LNG_MIN_DIFF,0.098697
3,TRIP_LNG_MAX_DIFF,0.091751
4,TRIP_LNG_MIN,0.075198
5,DEPARTURE_LNG,0.05301
6,TRIP_LNG_UNIQUE_RATIO,0.052352
7,TRIP_LNG_UNIQUE,0.04718
8,TRIP_SIZE,0.04444
9,TRIP_LNG_STD,0.024067


## Saving feature importances

In [21]:
# One (name, weight) table per model. list() materialises the pairs so the
# constructor always receives a concrete list, including on Python 3 where
# dict.items() is a view.
lat_imp_df = pd.DataFrame(list(lat_importances.items()), columns=["name", "weight"])
lng_imp_df = pd.DataFrame(list(lng_importances.items()), columns=["name", "weight"])

# Features used by both models appear once in each table; average their weights.
imp_df = pd.concat([lat_imp_df, lng_imp_df]).groupby("name").mean().reset_index()

# Display only: the sorted view is not assigned, so imp_df keeps the groupby order.
imp_df.sort_values("weight", ascending=False)

Unnamed: 0,name,weight
17,TRIP_LNG_MEAN,0.352347
10,TRIP_LAT_MIN,0.197597
6,TRIP_LAT_MEAN,0.159804
4,TRIP_LAT_MAX,0.146109
5,TRIP_LAT_MAX_DIFF,0.121821
15,TRIP_LNG_MAX,0.113811
22,TRIP_LNG_MIN_DIFF,0.098697
16,TRIP_LNG_MAX_DIFF,0.091751
1,DEPARTURE_LAT,0.08949
11,TRIP_LAT_MIN_DIFF,0.075511


In [22]:
# Write the combined importance table next to the input data.
importance_csv_path = "%s/gbt_feature_importances%s_lng%i_lat%i.csv" % (data_folder, part, lng_digits, lat_digits)
imp_df.to_csv(importance_csv_path, index=False)

## Evaluation

In [23]:
from haversine import haversine

#### Using original destination

In [24]:
# Score both frames with the fitted models. Each call yields a distance value
# and a prediction frame.
train_haversine, train_pred_df = mu.eval_gbt_models(taxi_trips, lat_clf, lng_clf, lat_features, lng_features)
test_haversine, test_pred_df = mu.eval_gbt_models(taxi_trips_TEST, lat_clf, lng_clf, lat_features, lng_features)
# Parenthesised print is valid both as a Python 2 statement and as a Python 3 call.
print("test: %f, train: %f (Haversine distance)" % (test_haversine, train_haversine))

test: 0.727116, train: 0.675459 (Haversine distance)


In [25]:
# Optionally persist the train and test predictions; both file names share
# the same subset / digit-count components.
if ph.get("export_prediction"):
    name_parts = (data_folder, part, lng_digits, lat_digits)
    train_pred_df.to_csv("%s/gbt_model_pred_train%s_lng%i_lat%i.csv" % name_parts, index=False)
    test_pred_df.to_csv("%s/gbt_model_pred_test%s_lng%i_lat%i.csv" % name_parts, index=False)
    print("Predictions were exported")

Predictions were exported


#### Using rounded destination

# Same evaluation with use_original_trg=False; only the distance values are
# kept, the prediction frames are discarded.
train_haversine, _ = mu.eval_gbt_models(taxi_trips, lat_clf, lng_clf, lat_features, lng_features, use_original_trg=False)
test_haversine, _ = mu.eval_gbt_models(taxi_trips_TEST, lat_clf, lng_clf, lat_features, lng_features, use_original_trg=False)
# Parenthesised print is valid both as a Python 2 statement and as a Python 3 call.
print("test: %f, train: %f (Haversine distance)" % (test_haversine, train_haversine))