In [None]:
import sys
sys.path.insert(0,"../python/")
import modeling_utils as mu

In [None]:
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline

In [None]:
import sys
from datawand.parametrization import ParamHelper
ph = ParamHelper("../pipelines/EcmlTaxi.json",sys.argv)

In [None]:
# Pipeline parameters read from the ParamHelper configuration, in a fixed order.
data_folder, part, lat_digits, lng_digits = map(
    ph.get, ("data_folder", "part", "lat_digits", "lng_digits")
)

# 1. Load data with generated features

In [None]:
# Keep only the full-precision and the rounded destination coordinates.
destination_columns = ['DESTINATION_LAT_FULL','DESTINATION_LNG_FULL','DESTINATION_LAT','DESTINATION_LNG']
train_path = "%s/gen_train%s_lng%i_lat%i.csv" % (data_folder, part, lng_digits, lat_digits)
taxi_trips = pd.read_csv(train_path)[destination_columns]

In [None]:
taxi_trips.columns

In [None]:
taxi_trips.head()

In [None]:
# Distinct rounded destination latitudes, in ascending order.
lat_splits = list(taxi_trips["DESTINATION_LAT"].unique())
lat_splits.sort()
print('Unique latitude splits: %i' % len(lat_splits))

In [None]:
# Distinct rounded destination longitudes, in ascending order.
lng_splits = list(taxi_trips["DESTINATION_LNG"].unique())
lng_splits.sort()
print('Unique longitude splits: %i' % len(lng_splits))

# 2. Drop invalid data

In [None]:
# Report the row count before filtering and after each coordinate filter.
print(len(taxi_trips))
for coord_col in ("DESTINATION_LAT_FULL", "DESTINATION_LNG_FULL"):
    # Keep only rows whose coordinate is finite (drops NaN and +/-inf).
    taxi_trips = taxi_trips[np.isfinite(taxi_trips[coord_col])]
    print(len(taxi_trips))

# 3. Generate location cell keys

## a.) For the training data

In [None]:
def generate_bins(df,col,digits,loc_bounds=None):
    """Assign every value of ``df[col]`` to an equal-width bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to bin.
    col : str
        Name of the numeric column to bin.
    digits : int
        The bin width is ``10 ** -digits``.
    loc_bounds : tuple (loc_min, loc_max) or None
        Bounds used to lay out the bin edges.  When None they are taken from
        the minimum and maximum of ``df[col]``.  Pass the bounds returned by an
        earlier call to bin another frame on the same grid.

    Returns
    -------
    tuple
        ``(bins, (loc_min, loc_max))`` where ``bins`` is the result of
        ``pandas.cut`` labelled with consecutive integers starting at 0.
        Values outside the edge range are NaN, as ``pandas.cut`` leaves them.
    """
    prec = np.power(10.0,-1.0 * digits)
    print(prec)
    # Identity test: `== None` would be evaluated element-wise if the bounds
    # were ever passed as a numpy array.
    if loc_bounds is None:
        loc_min, loc_max = df[col].min(), df[col].max()
    else:
        loc_min, loc_max = loc_bounds
    # Edges start one bin width below the minimum and run past the maximum, so
    # both extremes of the bounding range land inside a bin.
    loc_splits = np.arange(loc_min-prec,loc_max+2*prec,prec)
    # Single-argument print form: same text under Python 2, valid under Python 3.
    print("%s %s %s" % (loc_min, loc_max, len(loc_splits)))
    bin_labels = list(range(len(loc_splits)-1))
    return pd.cut(df[col],loc_splits,include_lowest=True,labels=bin_labels), (loc_min, loc_max)

def generate_loc_key(df,lat_col,lng_col,lat_bounds=None,lng_bounds=None,drop_tmp_keys=True):
    """Add a ``LOC_KEY`` column identifying the latitude/longitude cell of each row.

    The latitude and longitude columns are binned with ``generate_bins`` using
    the notebook-level ``lat_digits`` / ``lng_digits`` parameters, and the two
    bin labels are joined as ``"<lat bin>_<lng bin>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.  It is not modified; a copy is returned.
    lat_col, lng_col : str
        Names of the latitude and longitude columns to bin.
    lat_bounds, lng_bounds : tuple (min, max) or None
        Bounds forwarded to ``generate_bins``.  None derives them from ``df``.
    drop_tmp_keys : bool
        When True the intermediate ``LAT_KEY`` / ``LNG_KEY`` columns are dropped.

    Returns
    -------
    tuple
        ``(frame_with_LOC_KEY, lat_bounds, lng_bounds)``.
    """
    # Work on a copy: the original added columns to the caller's frame, which
    # raises SettingWithCopyWarning when the frame is a filtered slice.
    df = df.copy()
    df["LAT_KEY"], lat_bounds = generate_bins(df,lat_col,digits=lat_digits,loc_bounds=lat_bounds)
    df["LNG_KEY"], lng_bounds = generate_bins(df,lng_col,digits=lng_digits,loc_bounds=lng_bounds)
    # Rows whose coordinate fell outside the bin range carry a NaN label and
    # therefore get the text "nan" in their key.
    df["LOC_KEY"] = df.apply(lambda x: "%s_%s" % (str(x["LAT_KEY"]),str(x["LNG_KEY"])), axis=1)
    if drop_tmp_keys:
        df = df.drop(["LAT_KEY","LNG_KEY"],axis=1)
    # Number of distinct cells in this frame.
    print(len(df["LOC_KEY"].unique()))
    return df, lat_bounds, lng_bounds

In [None]:
# Bin the training destinations; the returned bounds are reused for the GBT frames.
taxi_trips, LAT_BOUNDS, LNG_BOUNDS = generate_loc_key(
    taxi_trips,
    lat_col="DESTINATION_LAT_FULL",
    lng_col="DESTINATION_LNG_FULL",
)

In [None]:
taxi_trips.head(2)

## b.) Apply these cells to the GBT model predictions

In [None]:
# Load the GBT predictions for the training split and drop the rounded destination columns.
gbt_train_path = "%s/gbt_model_pred_train%s_lng%i_lat%i.csv"  % (data_folder, part, lng_digits, lat_digits)
gbt_train_prediction = pd.read_csv(gbt_train_path).drop(["DESTINATION_LAT","DESTINATION_LNG"],axis=1)
print(len(gbt_train_prediction))

In [None]:
# Load the GBT predictions for the test split and drop the rounded destination columns.
gbt_test_path = "%s/gbt_model_pred_test%s_lng%i_lat%i.csv"  % (data_folder, part, lng_digits, lat_digits)
gbt_test_prediction = pd.read_csv(gbt_test_path).drop(["DESTINATION_LAT","DESTINATION_LNG"],axis=1)
print(len(gbt_test_prediction))

In [None]:
gbt_train_prediction.head()

In [None]:
# Key the training predictions by label location, on the grid fixed by the training bounds.
gbt_train_prediction, _, _ = generate_loc_key(
    gbt_train_prediction,
    lat_col="LABEL_LAT",
    lng_col="LABEL_LNG",
    lat_bounds=LAT_BOUNDS,
    lng_bounds=LNG_BOUNDS,
)

In [None]:
# Key the test predictions by label location, on the grid fixed by the training bounds.
gbt_test_prediction, _, _ = generate_loc_key(
    gbt_test_prediction,
    lat_col="LABEL_LAT",
    lng_col="LABEL_LNG",
    lat_bounds=LAT_BOUNDS,
    lng_bounds=LNG_BOUNDS,
)

In [None]:
# Number of location cells that occur in both the train and the test predictions.
train_cells = set(gbt_train_prediction["LOC_KEY"])
test_cells = set(gbt_test_prediction["LOC_KEY"])
len(train_cells & test_cells)

# 4. Define POI based on GPS locations

In [None]:
from haversine import haversine

#### Round locations to ~10 meter precision to define spots

In [None]:
# Distance covered by a 0.0001 degree step in latitude near the data's location.
# haversine() takes (latitude, longitude) tuples; the original call passed them
# as (longitude, latitude), so it measured a longitude step at latitude -8.6 instead.
haversine((41.1414, -8.618643), (41.1415, -8.618643))

In [None]:
# Number of decimal digits kept when coarsening the destination coordinates.
PRECISION = 4
for rounded_col, full_col in (
    ("DESTINATION_LAT", "DESTINATION_LAT_FULL"),
    ("DESTINATION_LNG", "DESTINATION_LNG_FULL"),
):
    # Overwrite the rounded column with the full-precision value rounded to PRECISION digits.
    taxi_trips[rounded_col] = taxi_trips[full_col].apply(lambda value: round(value,PRECISION))

#### Mean distance after decreasing the accuracy of GPS locations: ~3 meters

In [None]:
# Mean haversine distance between the rounded and the full-precision destinations.
rounded_cols = ("DESTINATION_LAT","DESTINATION_LNG")
full_cols = ("DESTINATION_LAT_FULL","DESTINATION_LNG_FULL")
mean_haversine, _ = mu.compute_mean_haversine(taxi_trips,rounded_cols,full_cols)
print(mean_haversine)

In [None]:
# A POI is the (lat, lng) pair of the rounded destination.  Zipping the two
# columns builds the same tuples without a Python-level call per row, and also
# works when the frame is empty.
taxi_trips["POI"] = list(zip(taxi_trips["DESTINATION_LAT"], taxi_trips["DESTINATION_LNG"]))

In [None]:
taxi_trips.head(2)

### Location popularity histogram

In [None]:
# Count the POI rows per location cell and plot the distribution of those counts (50 bins).
taxi_trips[["LOC_KEY","POI"]].groupby("LOC_KEY").count().hist(bins=50)

# 5. Specifying destination predictions

## a.) Predict the mean of spots within a cell

In [None]:
# Per-cell average of the rounded destination coordinates, with LOC_KEY as a regular column.
cell_means = (
    taxi_trips[["LOC_KEY","DESTINATION_LAT","DESTINATION_LNG"]]
    .groupby("LOC_KEY")
    .mean()
    .reset_index()
)

In [None]:
cell_means.head()

In [None]:
def set_mean_destination(df,detailed_locs):
    """Attach the per-cell mean destination to every row of ``df``.

    ``df`` is left-joined with ``detailed_locs`` on ``LOC_KEY``.  The joined
    ``DESTINATION_LAT`` / ``DESTINATION_LNG`` values become ``MEAN_PRED_LAT`` /
    ``MEAN_PRED_LNG``.  Rows whose cell has no entry in ``detailed_locs`` fall
    back to ``PRED_LAT`` / ``PRED_LNG`` when ``df`` provides those columns;
    otherwise they stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``LOC_KEY`` column and, optionally, ``PRED_LAT`` /
        ``PRED_LNG`` fallback columns.
    detailed_locs : pandas.DataFrame
        Frame with ``LOC_KEY``, ``DESTINATION_LAT`` and ``DESTINATION_LNG``.

    Returns
    -------
    pandas.DataFrame
        The joined frame with the two ``MEAN_PRED_*`` columns added.
    """
    joined = df.merge(detailed_locs,on="LOC_KEY",how='left')
    for axis_name in ("LAT", "LNG"):
        cell_mean = joined["DESTINATION_%s" % axis_name]
        fallback_col = "PRED_%s" % axis_name
        # The original tested `value == np.nan`, which is always False because
        # NaN never compares equal, so the fallback was never applied.
        # fillna() performs the intended "use the prediction when missing".
        if fallback_col in joined.columns:
            cell_mean = cell_mean.fillna(joined[fallback_col])
        joined["MEAN_PRED_%s" % axis_name] = cell_mean
    return joined

In [None]:
# Attach the per-cell mean destination to the training predictions.
gbt_train_prediction = set_mean_destination(
    df=gbt_train_prediction,
    detailed_locs=cell_means,
)

In [None]:
# Attach the per-cell mean destination to the test predictions.
gbt_test_prediction = set_mean_destination(
    df=gbt_test_prediction,
    detailed_locs=cell_means,
)

#### There are 6 test routes whose cell is not covered, because only a small fraction of the training data was used

In [None]:
# Test rows whose MEAN_PRED_LNG is missing after the merge with the cell means.
missing_mean_pred = gbt_test_prediction["MEAN_PRED_LNG"].isnull()
gbt_test_prediction[missing_mean_pred]

### Performance results of the mean predictor model

In [None]:
# Mean haversine distance of the cell-mean predictor against the labels, per split.
mean_pred_cols = ("MEAN_PRED_LAT","MEAN_PRED_LNG")
label_cols = ("LABEL_LAT","LABEL_LNG")
mean_pred_train_haversine, _ = mu.compute_mean_haversine(gbt_train_prediction,mean_pred_cols,label_cols)
mean_pred_test_haversine, _ = mu.compute_mean_haversine(gbt_test_prediction,mean_pred_cols,label_cols)
print("Test: %f, Train: %f (Haversine)" % (mean_pred_test_haversine,mean_pred_train_haversine))

In [None]:
# Recompute the test-split error of the cell-mean predictor and print it on its own.
mean_pred_test_haversine, _ = mu.compute_mean_haversine(
    gbt_test_prediction,
    ("MEAN_PRED_LAT","MEAN_PRED_LNG"),
    ("LABEL_LAT","LABEL_LNG"),
)
print(mean_pred_test_haversine)

In [None]:
# Row counts of the train and test prediction frames, separated by a space.
print("%s %s" % (len(gbt_train_prediction), len(gbt_test_prediction)))

## b.) Combine locations using k-closest point

#### Trip length based features had to be excluded, because trips in the test set are shorter

In [None]:
# Feature columns handed to the k-NN helpers: latitude features, longitude
# features, then the full-precision destination coordinates.
# NOTE(review): DESTINATION_*_FULL look like target coordinates; confirm how
# mu.init_knn_models uses them.
#feat_order = list(gbt_importances["name"])
lat_features = ["DEPARTURE_LAT","TRIP_LAT_MEAN","TRIP_LAT_MIN","TRIP_LAT_MAX","TRIP_LAT_MEDIAN"]
lng_features = ["DEPARTURE_LNG","TRIP_LNG_MEAN","TRIP_LNG_MIN","TRIP_LNG_MAX","TRIP_LNG_MEDIAN"]
feat_order = lat_features + lng_features + ['DESTINATION_LAT_FULL', 'DESTINATION_LNG_FULL']
print(feat_order)

In [None]:
%%time
# Build the k-NN models (k=2) from the training predictions and the features in feat_order.
knn_models, cell_locations = mu.init_knn_models(gbt_train_prediction,feat_order,k=2)

In [None]:
%%time
# The return value is not stored; presumably the helper adds KNN_PRED_LAT / KNN_PRED_LNG
# to gbt_test_prediction in place (those columns are read further down) - TODO confirm.
mu.predict_with_knn(gbt_test_prediction,knn_models,cell_locations,feat_order)

%%time
predict_with_knn(gbt_train_prediction,knn_models,cell_locations,feat_order)

In [None]:
gbt_test_prediction[feat_order].head()

### Performance results of the k-NN based model

In [None]:
# Mean haversine distance of the k-NN predictions against the labels on the test split.
knn_pred_test_haversine, _ = mu.compute_mean_haversine(
    gbt_test_prediction,
    ("KNN_PRED_LAT","KNN_PRED_LNG"),
    ("LABEL_LAT","LABEL_LNG"),
)
print(knn_pred_test_haversine)

In [None]:
len(gbt_test_prediction)

In [None]:
# Number of test rows where the k-NN prediction differs from the cell-mean prediction
# in at least one coordinate.
lat_differs = gbt_test_prediction["MEAN_PRED_LAT"] != gbt_test_prediction["KNN_PRED_LAT"]
lng_differs = gbt_test_prediction["MEAN_PRED_LNG"] != gbt_test_prediction["KNN_PRED_LNG"]
len(gbt_test_prediction[lat_differs | lng_differs])

# Visualization of the predictions

In [None]:
from bokeh.plotting import figure, show, output_file, output_notebook
output_notebook()

def visualize_predition(df, pred_prefix, mean_haversine, show_mean=False):
    """Plot labelled versus predicted destinations with Bokeh.

    Labels are drawn as blue circles and the ``<pred_prefix>_PRED_*``
    predictions as red circles, with a black segment joining each label to its
    prediction.  The figure is written to
    ``<pred_prefix>_destination_prediction_lng<lng_digits>_lat<lat_digits>.html``
    (``lng_digits`` / ``lat_digits`` are notebook-level parameters) and shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``LABEL_LAT``, ``LABEL_LNG`` and the two
        ``<pred_prefix>_PRED_LAT`` / ``<pred_prefix>_PRED_LNG`` columns.
        ``MEAN_PRED_LAT`` / ``MEAN_PRED_LNG`` are needed only if ``show_mean``.
    pred_prefix : str
        Prefix selecting the prediction columns, e.g. ``"MEAN"`` or ``"KNN"``.
    mean_haversine : float
        Error value shown in the plot title.
    show_mean : bool
        When True the ``MEAN_PRED_*`` locations are also drawn, in green.
    """
    label_lat = df["LABEL_LAT"]
    label_lng = df["LABEL_LNG"]
    pred_lat = df["%s_PRED_LAT" % pred_prefix]
    pred_lng = df["%s_PRED_LNG" % pred_prefix]
    # init plot
    p = figure(title = "Predicted Destinations with %s. Mean Haversine distance: %f" % (pred_prefix,mean_haversine), width=900, height=600)
    p.xaxis.axis_label = 'LNG'
    p.yaxis.axis_label = 'LAT'
    # errors
    p.segment(x0=label_lng, y0=label_lat, x1=pred_lng, y1=pred_lat, color="black", line_width=1)
    # locations
    if show_mean:
        # Read the MEAN_PRED_* columns only when they are drawn, so frames
        # without them can still be plotted with show_mean=False.
        p.circle(df["MEAN_PRED_LNG"], df["MEAN_PRED_LAT"], color='green', fill_alpha=0.2, size=10)
    p.circle(pred_lng, pred_lat, color='red', fill_alpha=0.2, size=10)
    p.circle(label_lng, label_lat, color='blue', fill_alpha=0.2, size=10)
    #output
    output_file("%s_destination_prediction_lng%i_lat%i.html" % (pred_prefix, lng_digits, lat_digits), title="Predicted Destinations")
    show(p, notebook_handle=True)

### Prediction results of the mean predictor

In [None]:
# Plot the cell-mean predictions against the labels for the test split.
visualize_predition(
    df=gbt_test_prediction,
    pred_prefix="MEAN",
    mean_haversine=mean_pred_test_haversine,
)

### Prediction results of the k-NN predictor

In [None]:
# Plot the k-NN predictions against the labels for the test split, without the cell means.
visualize_predition(
    df=gbt_test_prediction,
    pred_prefix="KNN",
    mean_haversine=knn_pred_test_haversine,
    show_mean=False,
)