In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
%matplotlib inline

In [3]:
import sys
from datawand.parametrization import ParamHelper
# Load the pipeline parameters from the JSON configuration. sys.argv is handed over too,
# presumably so that command-line values can override the defaults (confirm in ParamHelper).
ph = ParamHelper("../pipelines/EcmlTaxi.json",sys.argv)

Default parameters:
{u'data_folder': u'/home/fberes/DATA/taxi-challange/', u'gbt_use_exact_lat': False, u'lng_digits': 2, u'lat_digits': 3, u'is_test': False, u'part': u'_50000', u'gbt_use_exact_lng': False}
Custom parameters:
{u'is_test': False}


In [4]:
# Folder the input CSV is read from and the generated feature file is written to.
data_folder = ph.get("data_folder")

Using default parameter: data_folder=/home/fberes/DATA/taxi-challange/


In [5]:
# When true, test.csv is processed instead of the training sample.
is_test = ph.get("is_test")

In [6]:
# part: suffix of the training file name; lat_digits / lng_digits: number of decimals
# kept when the latitude / longitude values are rounded.
# The parameters are fetched in the same order as before (part, lat_digits, lng_digits).
part, lat_digits, lng_digits = map(ph.get, ("part", "lat_digits", "lng_digits"))

Using default parameter: part=_50000
Using default parameter: lat_digits=3
Using default parameter: lng_digits=2


In [7]:
# Pick the input file (test set or the selected training sample) and read it once.
input_file_name = "test.csv" if is_test else "train%s.csv" % part
taxi_trips = pd.read_csv("%s/%s" % (data_folder, input_file_name))

# Parsing GPS coordinates

In [8]:
def round_gps(coord, digit0=None, digit1=None):
    """Round the two components of a coordinate pair independently.

    Parameters
    ----------
    coord : sequence of two numbers
        The pair to round; it is unpacked as ``(x, y)``.
    digit0 : int or None
        Number of decimals kept for the first component. ``None`` leaves the
        component untouched.
    digit1 : int or None
        Number of decimals kept for the second component. ``None`` leaves the
        component untouched.

    Returns
    -------
    tuple
        ``(x, y)`` after the requested rounding.
    """
    x, y = coord
    # Identity test instead of `!= None`: only the None sentinel disables rounding,
    # so 0 ("round to whole numbers") stays a valid digit count.
    if digit0 is not None:
        x = round(x, digit0)
    if digit1 is not None:
        y = round(y, digit1)
    return (x, y)

def round_gps_list(coord_list, digit0=None, digit1=None):
    """Round every coordinate pair of ``coord_list`` with ``round_gps``.

    ``digit0`` and ``digit1`` are forwarded unchanged. A new list is returned,
    in the same order as the input.
    """
    rounded_points = []
    for point in coord_list:
        rounded_points.append(round_gps(point, digit0, digit1))
    return rounded_points

### Converting string locations to GPS lists

In [9]:
# Keep an unrounded copy of every trajectory; the *_FULL departure/destination columns use it.
# SECURITY: the POLYLINE text comes from an external CSV file. It used to be passed to eval(),
# which executes any expression found in the data; ast.literal_eval only accepts literals.
taxi_trips["POLYLINE_ORIGI"] = taxi_trips["POLYLINE"].apply(lambda raw: round_gps_list(ast.literal_eval(raw)))

In [10]:
# Replace the raw POLYLINE text by a list of rounded (lng, lat) tuples.
# SECURITY: the text comes from an external CSV file, so it is parsed with ast.literal_eval
# (literals only) instead of eval(), which would execute arbitrary expressions.
taxi_trips["POLYLINE"] = taxi_trips["POLYLINE"].apply(lambda raw: round_gps_list(ast.literal_eval(raw), digit0=lng_digits, digit1=lat_digits))
# The [1:-1] slice leaves out the first and the last point of each trajectory
# (they are stored separately as DEPARTURE_* / DESTINATION_*).
taxi_trips["TRIP_LAT"] = taxi_trips["POLYLINE"].apply(lambda points: [loc[1] for loc in points][1:-1])
taxi_trips["TRIP_LNG"] = taxi_trips["POLYLINE"].apply(lambda points: [loc[0] for loc in points][1:-1])

In [11]:
# Preview the first two trips with the parsed and rounded trajectories.
taxi_trips.head(2)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,POLYLINE_ORIGI,TRIP_LAT,TRIP_LNG
0,1372636858620000589,other,,,20000589,1372636858,normal,False,"[(-8.62, 41.141), (-8.62, 41.141), (-8.62, 41....","[(-8.618643, 41.141412), (-8.618499, 41.141376...","[41.141, 41.143, 41.144, 41.144, 41.145, 41.14...","[-8.62, -8.62, -8.62, -8.62, -8.63, -8.63, -8...."
1,1372637303620000596,driver,,7.0,20000596,1372637303,normal,False,"[(-8.64, 41.16), (-8.64, 41.16), (-8.64, 41.16...","[(-8.639847, 41.159826), (-8.640351, 41.159871...","[41.16, 41.16, 41.16, 41.161, 41.161, 41.162, ...","[-8.64, -8.64, -8.64, -8.65, -8.65, -8.65, -8...."


### Departures and destinations

In [12]:
# Last point of the unrounded trajectory; trips without any GPS point get NaN.
taxi_trips["DESTINATION_LAT_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda points: points[-1][1] if len(points) > 0 else np.nan)
taxi_trips["DESTINATION_LNG_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda points: points[-1][0] if len(points) > 0 else np.nan)

In [13]:
# Last point of the rounded trajectory; trips without any GPS point get NaN.
taxi_trips["DESTINATION_LAT"] = taxi_trips["POLYLINE"].apply(lambda points: points[-1][1] if len(points) > 0 else np.nan)
taxi_trips["DESTINATION_LNG"] = taxi_trips["POLYLINE"].apply(lambda points: points[-1][0] if len(points) > 0 else np.nan)

In [14]:
# First point of the unrounded trajectory; trips without any GPS point get NaN.
taxi_trips["DEPARTURE_LAT_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda points: points[0][1] if len(points) > 0 else np.nan)
taxi_trips["DEPARTURE_LNG_FULL"] = taxi_trips["POLYLINE_ORIGI"].apply(lambda points: points[0][0] if len(points) > 0 else np.nan)

In [15]:
# First point of the rounded trajectory; trips without any GPS point get NaN.
taxi_trips["DEPARTURE_LAT"] = taxi_trips["POLYLINE"].apply(lambda points: points[0][1] if len(points) > 0 else np.nan)
taxi_trips["DEPARTURE_LNG"] = taxi_trips["POLYLINE"].apply(lambda points: points[0][0] if len(points) > 0 else np.nan)

In [16]:
# The unrounded trajectories were only needed for the *_FULL columns above; drop them in place.
del taxi_trips["POLYLINE_ORIGI"]

In [17]:
# Number of distinct rounded destination latitudes / longitudes (compared as strings).
print(taxi_trips["DESTINATION_LAT"].astype("str").nunique())
print(taxi_trips["DESTINATION_LNG"].astype("str").nunique())

405
77


In [18]:
# Number of distinct rounded departure latitudes / longitudes (compared as strings).
print(taxi_trips["DEPARTURE_LAT"].astype("str").nunique())
print(taxi_trips["DEPARTURE_LNG"].astype("str").nunique())

192
40


#### Unique locations

In [19]:
# For every rounded GPS location, count the number of trips that pass through it.
# set() makes a location count once per trip, however often the trip visits it.
# The column is iterated directly: this avoids xrange (Python 2 only) and .ix
# (deprecated, later removed from pandas), and no row Series is built per trip.
gps_locs = {}
for polyline in taxi_trips["POLYLINE"]:
    for loc in set(polyline):
        gps_locs[loc] = gps_locs.get(loc, 0) + 1

In [20]:
# Number of distinct rounded GPS locations seen over all trips.
len(gps_locs)

13465

### Route statistics

#### Trip length features

In [21]:
# Number of intermediate points per trip (TRIP_LAT excludes the first and last point).
taxi_trips["TRIP_SIZE"] = taxi_trips["TRIP_LAT"].apply(len)

In [22]:
# Per trip: number of distinct rounded values and its share of the trip length.
for col in ["TRIP_LAT","TRIP_LNG"]:
    unique_col = "%s_UNIQUE" % col
    ratio_col = "%s_UNIQUE_RATIO" % col
    taxi_trips[unique_col] = taxi_trips[col].apply(lambda values: len(np.unique(values)) if len(values) != 0 else 0)
    # Trips without intermediate points get a ratio of 0.0 rather than a division by zero.
    taxi_trips[ratio_col] = [
        float(n_unique) / trip_size if trip_size > 0 else 0.0
        for n_unique, trip_size in zip(taxi_trips[unique_col], taxi_trips["TRIP_SIZE"])
    ]

taxi_trips["TRIP_LNG_UNIQUE_RATIO"].hist(bins=50)

#### Features based on summary statistics

In [23]:
# Summary statistics of the intermediate trip points; empty trips yield NaN.
# The list order fixes the order in which the columns are created.
summary_stats = [("MIN", np.min), ("MAX", np.max), ("MEAN", np.mean), ("MEDIAN", np.median), ("STD", np.std)]
for col in ["TRIP_LAT","TRIP_LNG"]:
    for stat_name, stat_func in summary_stats:
        # stat_func is bound as a default argument so that each lambda keeps its own function.
        taxi_trips["%s_%s" % (col, stat_name)] = taxi_trips[col].apply(
            lambda values, func=stat_func: func(values) if len(values) > 0 else np.nan
        )

In [24]:
# Preview the first trip with the statistics columns added.
taxi_trips.head(1)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TRIP_LAT,...,TRIP_LAT_MIN,TRIP_LAT_MAX,TRIP_LAT_MEAN,TRIP_LAT_MEDIAN,TRIP_LAT_STD,TRIP_LNG_MIN,TRIP_LNG_MAX,TRIP_LNG_MEAN,TRIP_LNG_MEDIAN,TRIP_LNG_STD
0,1372636858620000589,other,,,20000589,1372636858,normal,False,"[(-8.62, 41.141), (-8.62, 41.141), (-8.62, 41....","[41.141, 41.143, 41.144, 41.144, 41.145, 41.14...",...,41.141,41.155,41.149286,41.151,0.004355,-8.63,-8.62,-8.628095,-8.63,0.003927


#### Features based on the distance between the departure point and the trip statistics

In [25]:
# Absolute difference between the departure coordinate and each trip statistic.
# The previous guard `x[stat_feat]==np.nan` was always False (NaN never compares equal),
# so missing statistics were only handled by accident. The column arithmetic below
# propagates NaN explicitly and gives the same values without a row-wise apply.
for feat_type in ["MIN","MAX","MEAN","MEDIAN"]:
    for axis in ["LAT","LNG"]:
        stat_feat = "TRIP_%s_%s" % (axis,feat_type)
        depart_feat = "DEPARTURE_%s" % axis
        taxi_trips["TRIP_%s_%s_DIFF" % (axis,feat_type)] = (taxi_trips[depart_feat] - taxi_trips[stat_feat]).abs()

# Parsing Time information

In [26]:
import datetime

def get_time_info(timestamp, info_type, tz=None):
    """Extract one piece of calendar information from a POSIX timestamp.

    Parameters
    ----------
    timestamp : int or float
        Seconds since the epoch.
    info_type : str
        One of:
        ``"date"``        -> ``datetime.datetime`` at midnight of that day,
        ``"time"``        -> ``datetime.time`` (hour, minute, second),
        ``"time_of_day"`` -> ``hour // 6``, i.e. the six-hour block 0..3,
        ``"day_of_week"`` -> ``weekday()``, Monday is 0.
    tz : datetime.tzinfo or None
        Time zone used for the conversion. With the default ``None`` the
        local time zone of the machine is used (the original behaviour), so
        the features depend on where the notebook runs; pass an explicit
        time zone to make them reproducible.

    Returns
    -------
    datetime.datetime, datetime.time or int, depending on ``info_type``.

    Raises
    ------
    RuntimeError
        If ``info_type`` is not one of the supported values.
    """
    dt = datetime.datetime.fromtimestamp(timestamp, tz)
    if info_type == "date":
        return datetime.datetime(dt.year, dt.month, dt.day)
    elif info_type == "time":
        return datetime.time(dt.hour, dt.minute, dt.second)
    elif info_type == "time_of_day":
        return dt.hour // 6
    elif info_type == "day_of_week":
        return dt.weekday()
    else:
        raise RuntimeError("Bad argument for 'info_type': %s" % info_type)

In [27]:
# Calendar day of the trip start (midnight datetime).
taxi_trips["DATE"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("date",))

In [28]:
# Clock time (hour, minute, second) of the trip start.
taxi_trips["TIME"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("time",))

In [29]:
# Weekday index of the trip start (Monday is 0).
taxi_trips["DAY_OF_WEEK"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("day_of_week",))

In [30]:
# Six-hour block of the day in which the trip started (hour // 6).
taxi_trips["TIME_OF_DAY"] = taxi_trips["TIMESTAMP"].apply(get_time_info, args=("time_of_day",))

In [31]:
# Preview the first two trips with the time features added.
taxi_trips.head(2)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TRIP_LAT,...,TRIP_LAT_MAX_DIFF,TRIP_LNG_MAX_DIFF,TRIP_LAT_MEAN_DIFF,TRIP_LNG_MEAN_DIFF,TRIP_LAT_MEDIAN_DIFF,TRIP_LNG_MEDIAN_DIFF,DATE,TIME,DAY_OF_WEEK,TIME_OF_DAY
0,1372636858620000589,other,,,20000589,1372636858,normal,False,"[(-8.62, 41.141), (-8.62, 41.141), (-8.62, 41....","[41.141, 41.143, 41.144, 41.144, 41.145, 41.14...",...,0.014,0.0,0.008286,0.008095,0.01,0.01,2013-07-01,00:00:58,0,0
1,1372637303620000596,driver,,7.0,20000596,1372637303,normal,False,"[(-8.64, 41.16), (-8.64, 41.16), (-8.64, 41.16...","[41.16, 41.16, 41.16, 41.161, 41.161, 41.162, ...",...,0.011,0.0,0.004294,0.019412,0.004,0.02,2013-07-01,00:08:23,0,0


In [32]:
# List every column generated so far.
taxi_trips.columns

Index([u'TRIP_ID', u'CALL_TYPE', u'ORIGIN_CALL', u'ORIGIN_STAND', u'TAXI_ID',
       u'TIMESTAMP', u'DAY_TYPE', u'MISSING_DATA', u'POLYLINE', u'TRIP_LAT',
       u'TRIP_LNG', u'DESTINATION_LAT_FULL', u'DESTINATION_LNG_FULL',
       u'DESTINATION_LAT', u'DESTINATION_LNG', u'DEPARTURE_LAT_FULL',
       u'DEPARTURE_LNG_FULL', u'DEPARTURE_LAT', u'DEPARTURE_LNG', u'TRIP_SIZE',
       u'TRIP_LAT_UNIQUE', u'TRIP_LAT_UNIQUE_RATIO', u'TRIP_LNG_UNIQUE',
       u'TRIP_LNG_UNIQUE_RATIO', u'TRIP_LAT_MIN', u'TRIP_LAT_MAX',
       u'TRIP_LAT_MEAN', u'TRIP_LAT_MEDIAN', u'TRIP_LAT_STD', u'TRIP_LNG_MIN',
       u'TRIP_LNG_MAX', u'TRIP_LNG_MEAN', u'TRIP_LNG_MEDIAN', u'TRIP_LNG_STD',
       u'TRIP_LAT_MIN_DIFF', u'TRIP_LNG_MIN_DIFF', u'TRIP_LAT_MAX_DIFF',
       u'TRIP_LNG_MAX_DIFF', u'TRIP_LAT_MEAN_DIFF', u'TRIP_LNG_MEAN_DIFF',
       u'TRIP_LAT_MEDIAN_DIFF', u'TRIP_LNG_MEDIAN_DIFF', u'DATE', u'TIME',
       u'DAY_OF_WEEK', u'TIME_OF_DAY'],
      dtype='object')

# One-hot encoding

In [33]:
def one_hot_one_column(dataframe, feature):
    """Append one-hot indicator columns for a single categorical column.

    The indicator columns are named ``<feature>_onehot_<value>``. The original
    column is kept, and the input frame is not modified: a joined copy is
    returned.

    Raises
    ------
    RuntimeError
        If the dtype name of the column contains "float" or "int".
    """
    dtype_name = str(dataframe[feature].dtype)
    for numeric_marker in ("float", "int"):
        if numeric_marker in dtype_name:
            raise RuntimeError("feature is already numeric")
    indicator_columns = pd.get_dummies(dataframe[feature], prefix=feature+"_onehot")
    return dataframe.join(indicator_columns)

def one_hot(dataframe, features):
    """One-hot encode one or several categorical columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that holds the columns to encode; it is not modified.
    features : str or iterable of str
        A single column name or a collection of column names.

    Returns
    -------
    pandas.DataFrame
        The frame with the indicator columns of every feature appended.

    Each column name is printed before it is processed.
    """
    # A bare name must be wrapped, otherwise it would be iterated character by character.
    # `type(features) is str` missed Python 2 unicode names (and str subclasses).
    try:
        string_types = basestring  # Python 2: covers str and unicode
    except NameError:
        string_types = str  # Python 3
    if isinstance(features, string_types):
        features = [features]
    for feature in features:
        # Parenthesised form prints the same text under Python 2 and Python 3.
        print(feature)
        dataframe = one_hot_one_column(dataframe, feature)
    return dataframe

In [34]:
# Only CALL_TYPE is one-hot encoded; the original column stays next to its indicator columns.
categorical_columns = ['CALL_TYPE']
taxi_trips = one_hot(taxi_trips, categorical_columns)

CALL_TYPE


In [35]:
# Preview the first two trips with the one-hot columns added.
taxi_trips.head(2)

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE,TRIP_LAT,...,TRIP_LNG_MEAN_DIFF,TRIP_LAT_MEDIAN_DIFF,TRIP_LNG_MEDIAN_DIFF,DATE,TIME,DAY_OF_WEEK,TIME_OF_DAY,CALL_TYPE_onehot_dispatch,CALL_TYPE_onehot_driver,CALL_TYPE_onehot_other
0,1372636858620000589,other,,,20000589,1372636858,normal,False,"[(-8.62, 41.141), (-8.62, 41.141), (-8.62, 41....","[41.141, 41.143, 41.144, 41.144, 41.145, 41.14...",...,0.008095,0.01,0.01,2013-07-01,00:00:58,0,0,0.0,0.0,1.0
1,1372637303620000596,driver,,7.0,20000596,1372637303,normal,False,"[(-8.64, 41.16), (-8.64, 41.16), (-8.64, 41.16...","[41.16, 41.16, 41.16, 41.161, 41.161, 41.162, ...",...,0.019412,0.004,0.02,2013-07-01,00:08:23,0,0,0.0,1.0,0.0


# Export data file

In [36]:
# The output name records the data set (test or training sample) and the rounding precision.
output_prefix = "gen_test" if is_test else "gen_train%s" % part
output_path = "%s/%s_lng%i_lat%i.csv" % (data_folder, output_prefix, lng_digits, lat_digits)
taxi_trips.to_csv(output_path, index=False)