In [None]:
import ast

import numpy as np
import pandas as pd

In [None]:
%matplotlib inline

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Build the parameter helper from the EcmlTaxi pipeline description; sys.argv is
# handed over as well (presumably so command-line values can override the JSON
# -- TODO confirm against datawand's ParamHelper).
ph = ParamHelper("../pipelines/EcmlTaxi.json",sys.argv)

In [None]:
# Folder that holds the input CSV files; the generated CSV is written there too.
data_folder = ph.get("data_folder")

In [None]:
# Truthy -> process test.csv, falsy -> process one part of the training data.
is_test = ph.get("is_test")

In [None]:
# Suffix of the training file to load (train<part>.csv); unused for the test set.
part = ph.get("part")
# Number of decimal digits kept when rounding latitudes / longitudes.
lat_digits = ph.get("lat_digits")
lng_digits = ph.get("lng_digits")

In [None]:
# Load the test set, or the requested part of the training set, from data_folder.
taxi_trips = pd.read_csv(
    ("%s/test.csv" % (data_folder)) if is_test
    else ("%s/train%s.csv" % (data_folder, part))
)

# Parsing GPS coordinates

In [None]:
def round_gps(coord,digit0=None,digit1=None):
    """Round the two components of a coordinate pair independently.

    Parameters
    ----------
    coord : sequence of two numbers
        The pair to round; it is unpacked as ``(x, y)``.
    digit0 : int or None
        Decimal digits to keep for the first component. ``None`` leaves the
        component untouched (``0`` is a valid value and does round).
    digit1 : int or None
        Same as ``digit0`` but for the second component.

    Returns
    -------
    tuple
        ``(x, y)`` after the requested rounding. The result is always a
        tuple, whatever sequence type ``coord`` was.
    """
    x, y = coord
    # Identity test against None: 0 digits must still trigger rounding.
    if digit0 is not None:
        x = round(x, digit0)
    if digit1 is not None:
        y = round(y, digit1)
    return (x, y)

def round_gps_list(coord_list,digit0=None,digit1=None):
    """Apply ``round_gps`` to every coordinate pair of ``coord_list``.

    ``digit0`` / ``digit1`` are forwarded unchanged to ``round_gps``.
    Returns a new list with one rounded pair per input pair, in input order.
    """
    rounded = []
    for coord in coord_list:
        rounded.append(round_gps(coord, digit0, digit1))
    return rounded

### Converting string locations to GPS lists

In [None]:
# Keep an unrounded copy of every polyline (as a list of (lng, lat) tuples).
# The POLYLINE text comes straight from the CSV, so it is parsed with
# ast.literal_eval, which only accepts Python literals, instead of eval,
# which would execute arbitrary expressions found in the file.
taxi_trips["POLYLINE_ORIGI"] = taxi_trips["POLYLINE"].apply(lambda x: round_gps_list(ast.literal_eval(x)))

In [None]:
# Replace the POLYLINE text by the parsed, rounded list of (lng, lat) tuples.
# ast.literal_eval is used instead of eval because the text comes from the CSV
# file and must not be executed as code.
taxi_trips["POLYLINE"] = taxi_trips["POLYLINE"].apply(
    lambda x: round_gps_list(ast.literal_eval(x), digit0=lng_digits, digit1=lat_digits)
)
# Interior of the trip only: the first and the last point are left out.
taxi_trips["TRIP_LAT"] = taxi_trips["POLYLINE"].apply(lambda x: [loc[1] for loc in x[1:-1]])
taxi_trips["TRIP_LNG"] = taxi_trips["POLYLINE"].apply(lambda x: [loc[0] for loc in x[1:-1]])

In [None]:
# Preview the first two rows after parsing the polylines.
taxi_trips.head(2)

### Departures and destinations

In [None]:
# Unrounded destination = last point of the original polyline (index 1 is the
# latitude, index 0 the longitude); NaN when the trip has no GPS points.
# Only one column is needed, so the lambda is applied to that column directly
# rather than to every row of the frame.
taxi_trips["DESTINATION_LAT_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda pts: pts[-1][1] if len(pts)>0 else np.nan)
taxi_trips["DESTINATION_LNG_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda pts: pts[-1][0] if len(pts)>0 else np.nan)

In [None]:
# Rounded destination = last point of the rounded polyline; NaN when the trip
# has no GPS points. The lambda is applied to the POLYLINE column directly
# rather than to every row of the frame.
taxi_trips["DESTINATION_LAT"] = taxi_trips["POLYLINE"].apply(lambda pts: pts[-1][1] if len(pts)>0 else np.nan)
taxi_trips["DESTINATION_LNG"] = taxi_trips["POLYLINE"].apply(lambda pts: pts[-1][0] if len(pts)>0 else np.nan)

In [None]:
# Unrounded departure = first point of the original polyline; NaN when the
# trip has no GPS points. The lambda is applied to the column directly rather
# than to every row of the frame.
taxi_trips["DEPARTURE_LAT_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda pts: pts[0][1] if len(pts)>0 else np.nan)
taxi_trips["DEPARTURE_LNG_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda pts: pts[0][0] if len(pts)>0 else np.nan)

In [None]:
# Rounded departure = first point of the rounded polyline; NaN when the trip
# has no GPS points. The lambda is applied to the POLYLINE column directly
# rather than to every row of the frame.
taxi_trips["DEPARTURE_LAT"] = taxi_trips["POLYLINE"].apply(lambda pts: pts[0][1] if len(pts)>0 else np.nan)
taxi_trips["DEPARTURE_LNG"] = taxi_trips["POLYLINE"].apply(lambda pts: pts[0][0] if len(pts)>0 else np.nan)

In [None]:
# The unrounded polylines were only needed for the *_FULL columns; drop them.
# NOTE: re-running this cell on its own raises KeyError once the column is gone.
del taxi_trips["POLYLINE_ORIGI"]

In [None]:
# Number of distinct rounded destination latitudes / longitudes, compared by
# their string form. print(...) with a single argument behaves the same on
# Python 2 and Python 3.
print(len(taxi_trips["DESTINATION_LAT"].astype("str").unique()))
print(len(taxi_trips["DESTINATION_LNG"].astype("str").unique()))

In [None]:
# Number of distinct rounded departure latitudes / longitudes, compared by
# their string form. print(...) with a single argument behaves the same on
# Python 2 and Python 3.
print(len(taxi_trips["DEPARTURE_LAT"].astype("str").unique()))
print(len(taxi_trips["DEPARTURE_LNG"].astype("str").unique()))

#### Unique locations

In [None]:
# For every rounded GPS location, count the trips that pass through it at
# least once; set() removes repeated visits within a single trip.
# The POLYLINE column is iterated directly: no positional row lookup is needed,
# which avoids the removed `.ix` indexer and the Python-2-only `xrange`.
gps_locs = {}
for polyline in taxi_trips["POLYLINE"]:
    for loc in set(polyline):
        gps_locs[loc] = gps_locs.get(loc, 0) + 1

In [None]:
# Number of distinct rounded GPS locations seen over all trips.
len(gps_locs)

### Route statistics

#### Features: trip length

In [None]:
# Number of interior GPS points per trip (TRIP_LAT leaves out the first and last point).
taxi_trips["TRIP_SIZE"] = taxi_trips["TRIP_LAT"].apply(len)

In [None]:
for col in ("TRIP_LAT", "TRIP_LNG"):
    # Count of distinct coordinate values along the trip (0 for an empty trip).
    taxi_trips["%s_UNIQUE" % col] = taxi_trips[col].apply(
        lambda values: len(np.unique(values)) if len(values) != 0 else 0
    )
    # Distinct values divided by the trip size; 0.0 when the trip size is not positive.
    taxi_trips["%s_UNIQUE_RATIO" % col] = taxi_trips[["%s_UNIQUE" % col, "TRIP_SIZE"]].apply(
        lambda row: 0.0 if not (row["TRIP_SIZE"] > 0) else float(row["%s_UNIQUE" % col]) / row["TRIP_SIZE"],
        axis=1
    )

# Distribution of the longitude uniqueness ratio over all trips.
taxi_trips["TRIP_LNG_UNIQUE_RATIO"].hist(bins=50)

#### Features based on per-trip coordinate statistics

In [None]:
# Summary statistics of the interior latitudes / longitudes of each trip.
# Trips without interior points get NaN for every statistic.
# The list (not a dict) keeps the column creation order: MIN, MAX, MEAN, MEDIAN, STD.
trip_stat_funcs = [
    ("MIN", np.min),
    ("MAX", np.max),
    ("MEAN", np.mean),
    ("MEDIAN", np.median),
    ("STD", np.std),
]
for col in ("TRIP_LAT", "TRIP_LNG"):
    for stat_name, stat_func in trip_stat_funcs:
        # The lambda is consumed inside this iteration, so it sees the current stat_func.
        taxi_trips["%s_%s" % (col, stat_name)] = taxi_trips[col].apply(
            lambda x: stat_func(x) if len(x) != 0 else np.nan
        )

In [None]:
# Preview the first row with the statistic columns added.
taxi_trips.head(1)

#### Features based on the distance between the departure point and the trip statistics

In [None]:
# Absolute gap between the rounded departure coordinate and each per-trip
# statistic of the same axis. NaN on either side propagates through the
# subtraction, so trips without statistics get NaN. A guard of the form
# `value == np.nan` is not used because NaN never compares equal to anything,
# itself included, so such a check can never fire.
for feat_type in ["MIN","MAX","MEAN","MEDIAN"]:
    for axis in ["LAT","LNG"]:
        stat_feat = "TRIP_%s_%s" % (axis,feat_type)
        depart_feat = "DEPARTURE_%s" % axis
        taxi_trips["TRIP_%s_%s_DIFF" % (axis,feat_type)] = abs(taxi_trips[depart_feat] - taxi_trips[stat_feat])

# Parsing Time information

In [None]:
import datetime

def get_time_info(timestamp,info_type):
    """Extract one calendar or clock component from a POSIX timestamp.

    The timestamp is converted with ``datetime.datetime.fromtimestamp``, so
    the result is expressed in the local timezone of the machine running it.

    ``info_type`` selects what is returned:

    * ``"date"``        -- ``datetime.datetime`` at midnight of that day
    * ``"time"``        -- ``datetime.time`` built from hour, minute, second
    * ``"time_of_day"`` -- ``hour // 6``, i.e. the six-hour block of the day
    * ``"day_of_week"`` -- ``weekday()`` of the date (Monday is 0)

    Raises ``RuntimeError`` for any other ``info_type``.
    """
    moment = datetime.datetime.fromtimestamp(timestamp)

    if info_type == "date":
        return datetime.datetime(moment.year, moment.month, moment.day)
    if info_type == "time":
        return datetime.time(moment.hour, moment.minute, moment.second)
    if info_type == "time_of_day":
        return moment.hour // 6
    if info_type == "day_of_week":
        return moment.weekday()

    raise RuntimeError("Bad argument for 'info_type': %s" % info_type)

In [None]:
# Calendar day (at midnight) on which each trip's TIMESTAMP falls.
taxi_trips["DATE"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("date",))

In [None]:
# Clock time (hour, minute, second) of each trip's TIMESTAMP.
taxi_trips["TIME"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("time",))

In [None]:
# Weekday index of each trip's TIMESTAMP, as returned by datetime.weekday().
taxi_trips["DAY_OF_WEEK"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("day_of_week",))

In [None]:
# Six-hour block of the day (hour // 6) in which each trip's TIMESTAMP falls.
taxi_trips["TIME_OF_DAY"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("time_of_day",))

In [None]:
# Preview the first two rows with the time columns added.
taxi_trips.head(2)

In [None]:
# List every column produced so far.
taxi_trips.columns

# One-hot encoding

In [None]:
def one_hot_one_column(dataframe, feature):
    """Append one-hot indicator columns for a single non-numeric column.

    The indicator columns come from ``pd.get_dummies`` and are named
    ``<feature>_onehot_<value>``. The source column is kept, and the result is
    a new frame (``dataframe`` joined with the indicators on the index).

    Raises ``RuntimeError`` when the column's dtype name contains ``"float"``
    or ``"int"``, i.e. when the column is already numeric.
    """
    dtype_name = str(dataframe[feature].dtype)
    if any(kind in dtype_name for kind in ("float", "int")):
        raise RuntimeError("feature is already numeric")

    indicator_columns = pd.get_dummies(dataframe[feature], prefix=feature+"_onehot")
    # The source column is deliberately kept; only the indicators are added.
    return dataframe.join(indicator_columns)

def one_hot(dataframe, features):
    """One-hot encode one or several columns of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the columns to encode; it is not modified.
    features : str or iterable of str
        A single column name, or the column names to encode in order.

    Returns
    -------
    pandas.DataFrame
        A frame with the indicator columns of every feature appended, as
        produced by ``one_hot_one_column``.

    Each feature name is printed before it is processed. Errors raised by
    ``one_hot_one_column`` (e.g. for an already numeric column) propagate.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    if isinstance(features, str):
        features = [features]
    for feature in features:
        # Progress trace; the call form of print works on Python 2 and 3.
        print(feature)
        dataframe = one_hot_one_column(dataframe, feature)
    return dataframe

In [None]:
# Columns to one-hot encode; the indicator columns are appended to taxi_trips.
# NOTE(review): re-running this cell would try to join the same indicator
# columns a second time, which DataFrame.join is expected to reject -- confirm.
categorical_columns = ['CALL_TYPE']
taxi_trips = one_hot(taxi_trips, categorical_columns)

In [None]:
# Preview the first two rows with the one-hot columns added.
taxi_trips.head(2)

# Export data file

In [None]:
# Write the generated features next to the input data. The file name records
# the rounding precision and, for training data, the processed part.
taxi_trips.to_csv(
    ("%s/gen_test_lng%i_lat%i.csv" % (data_folder, lng_digits, lat_digits)) if is_test
    else ("%s/gen_train%s_lng%i_lat%i.csv" % (data_folder, part, lng_digits, lat_digits)),
    index=False
)