In [1]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
pd.options.mode.chained_assignment = None


#read csv files
df = pd.read_csv("data_train.csv")
df2 = pd.read_csv("data_test.csv")

In [2]:
def df_transform(df2):
    #time - stratify into categories, e.g. hours
    df = df2.copy()
    df[["hour_entry", "minute_entry", "second_entry"]] = df["time_entry"].str.extract(r'(\d+):(\d+):(\d+)').astype(int)
    df[["hour_exit", "minute_exit", "second_exit"]] = df["time_exit"].str.extract(r'(\d+):(\d+):(\d+)').astype(int)
    df["totalsecs_entry"] = (3600 * df["hour_entry"] + 60 * df["minute_entry"] + df["second_entry"]) 
    df["totalsecs_exit"] = (3600 * df["hour_exit"] + 60 * df["minute_exit"] + df["second_exit"])
    
    [x_lower, x_upper, y_lower, y_upper] = [3750901.5068,3770901.5068, -19268905.6133,-19208905.6133]
    df["incenter_entry"] =  ((df["x_entry"] < x_lower) & (df["x_entry"] < x_upper) & \
                             (df["y_entry"] < y_lower) &  (df["y_entry"] < y_upper)) * 1
    df["incenter_exit"] =  ((df["x_exit"] < x_lower) & (df["x_exit"] < x_upper) & \
                             (df["y_exit"] < y_lower) &  (df["y_exit"] < y_upper)) * 1
    
    
    #unknowns = ['x_exit','y_exit','dx','dy','vx','vy','dist','speed','n2','incenter_exit']
    targets = df[df['hour_exit'] >= 15][['trajectory_id','x_exit','y_exit','incenter_exit']]
    df.loc[df['hour_exit'] >= 15,['x_exit','y_exit','incenter_exit']] = np.nan

    df["dt"] = df["totalsecs_exit"] - df["totalsecs_entry"]
    df['dx'] = df["x_exit"] - df["x_entry"]
    df['dy'] = df["y_exit"] - df["y_entry"]
    
    tn0 = df[df["dt"] > 0]
    df.loc[df['dt'] > 0,'vx'] = tn0["dx"] / tn0["dt"]
    df.loc[df['dt'] > 0,'vy'] = tn0["dy"] / tn0["dt"]
    df.loc[df['dt'] > 0,'dist'] = np.sqrt(tn0['dx'] * tn0['dx'] + tn0['dy'] * tn0['dy'])
    df.loc[df['dt'] > 0,'speed'] = df.loc[df['dt'] > 0,'dist'] / tn0["dt"]

    df['n'] = df.groupby(['hash']).cumcount()
    df['n2'] = df.groupby(['hash']).cumcount(ascending=False)
    
    unwanted = ['Unnamed: 0', 'Unnamed: 0.1','vmean','vmin','vmax','time_entry','time_exit']
    return df.drop(unwanted,axis=1).rename(index=str,columns={"trajectory_id":"id"}), targets.rename(index=str,columns={"trajectory_id":"id"}).set_index('id')

#process data
df_copy, targets = df_transform(df)

In [None]:
#up to 20 entries for each vehicle, some entries are missing
nth = df.groupby(['hash']).cumcount()
df[df["trajectory_id"].str.extract(r'(\d+)$').astype(int) == df.groupby(['hash']).cumcount()]
nth.hist(bins=40)
(nth + 1).value_counts() / df_copy['cumcount'].shape[0] #16 % have 1 entry only

In [None]:
#Distribution of X Coordinates
df_copy["x_entry"].plot.hist(bins=400,rot=90,figsize=(20,10))
df_copy["x_exit"].plot.hist(bins=400,alpha=0.5,rot=90,figsize=(20,10))

#quite similar for x_entry, x_exit - on average shifted ot the right, became closer together
print(df_copy['x_entry'].describe())
print(df_copy['x_exit'].describe())

In [None]:
#Y Coordinates
df_copy["y_entry"].plot.hist(bins=400,rot=90,figsize=(20,10))
df_copy["y_exit"].plot.hist(bins=400,alpha=0.5,rot=90,figsize=(20,10))

#quite similar for x_entry, x_exit - on average shifted down, became more closer together
print(df_copy['y_entry'].describe())
print(df_copy['y_exit'].describe())

In [None]:
df_copy.groupby(['incenter_entry','incenter_exit'])[['x_entry','y_entry']].describe()
plt.scatter(df_copy['x_entry'],df_copy['y_entry'],alpha=0.005)
plt.scatter(df_copy['x_exit'],df_copy['y_exit'],alpha=0.005)


In [None]:
X_in = df_copy[df_copy['incenter_entry'] == True]
X_out = df_copy[df_copy['incenter_entry'] == False]
X_in_in = X_in[X_in['incenter_exit'] == True]
X_in_out =  X_in[X_in['incenter_exit'] == False]
X_out_in = X_out[X_out['incenter_exit'] == True]
X_out_out =  X_out[X_out['incenter_exit'] == False]


In [None]:
plt.scatter(X_out_out['x_entry'],X_out_out['y_entry'],alpha=0.005,color='yellow')
plt.scatter(X_in_in['x_entry'],X_in_in['y_entry'],alpha=0.005,color='yellow')
plt.scatter(X_out_in['x_entry'],X_out_in['y_entry'],alpha=0.01,color='b')
plt.scatter(X_in_out['x_entry'],X_in_out['y_entry'],alpha=0.01,color='b')



In [None]:
plt.scatter(X_in_in['x_entry'],X_in_in['y_entry'],alpha=0.005)
plt.scatter(X_in_out['x_entry'],X_in_out['y_entry'],alpha=0.01)


In [None]:
plt.scatter(X_out_out['x_entry'],X_out_out['y_entry'],alpha=0.005)
plt.scatter(X_out_in['x_entry'],X_out_in['y_entry'],alpha=0.01)


In [None]:
X_out_out['dx'].hist(bins=200,xrot=90,figsize=(20,10),log=True,alpha=0.5,color='yellow')
X_out_in['dx'].hist(bins=200,xrot=90,figsize=(20,10),log=True,alpha=0.5,color='r')
X_in_out['dx'].hist(bins=200,xrot=90,figsize=(20,10),log=True,alpha=0.5,color='b')
X_in_in['dx'].hist(bins=200,xrot=90,figsize=(20,10),log=True,alpha=0.5,color='green')
df_copy.groupby(['incenter_entry','incenter_exit']).describe()

In [None]:
X_out_out['dy'].hist(bins=200,xrot=90,figsize=(10,5),log=True,alpha=0.5,color='yellow')
X_out_in['dy'].hist(bins=200,xrot=90,figsize=(10,5),log=True,alpha=0.5,color='r')
X_in_out['dy'].hist(bins=200,xrot=90,figsize=(10,5),log=True,alpha=0.5,color='b')
X_in_in['dy'].hist(bins=200,xrot=90,figsize=(10,5),log=True,alpha=0.5,color='green')

In [None]:
#Overall time
df_copy['totalsecs_entry'].hist(bins=960,figsize=(20,10))
df_copy['totalsecs_exit'].hist(bins=960,figsize=(20,10),alpha=0.5)

In [None]:
#quantiles = np.array(df_copy['totalsecs_entry'].quantile([0,.25, .5, .75,1]))
#df_copy['totalsecs_entry'].hist(bins=quantiles)
#quantiles = np.array(df_copy['totalsecs_exit'].quantile([0,.25, .5, .75,1]))
#df_copy['totalsecs_exit'].hist(bins=quantiles)



In [None]:
#Hours
(df_copy['totalsecs_entry'] / 3600).hist(bins=16,figsize=(20,10))
(df_copy['totalsecs_exit'] / 3600).hist(bins=16,figsize=(20,10),alpha=0.5)

In [None]:
#seasonality in minutes
(df_copy['minute_entry']).value_counts().sort_index().plot(figsize=(20,10))
(df_copy['minute_exit']).value_counts().sort_index().plot(figsize=(20,10))

In [None]:
quantile = np.array(df_copy['minuteentry'].quantile([0,0.25,0.5,0.75]))
quantile2 = np.array(df_copy['minute_exit'].quantile([0,0.25,0.5,0.75]))

In [None]:
quantile = np.array(df_copy['second_entry'].quantile([0,0.25,0.5,0.75]))
quantile2 = np.array(df_copy['second_exit'].quantile([0,0.25,0.5,0.75]))

In [None]:
#seasonality in seconds
(df_copy['second_entry']).value_counts().sort_index().plot(figsize=(20,10))
(df_copy['second_exit']).value_counts().sort_index().plot(figsize=(20,10))

In [None]:
pd.crosstab(df_copy["incenter_exit"],df_copy["incenter_entry"])

In [None]:
t0 = df_copy[df_copy["dt"] == 0]
tn0 = df_copy[df_copy["dt"] > 0]
tn0['vx'] = tn0["dx"] / tn0["dt"]
tn0['vy'] = tn0["dy"] / tn0["dt"]
tn0['dist'] = np.sqrt(tn0['dx'] * tn0['dx'] + tn0['dy'] * tn0['dy'])
tn0['speed'] = tn0['dist'] / tn0['dt']
mu_dx = tn0['dx'].mean()
std_dx = tn0['dx'].std()
mu_dy = tn0['dy'].mean()
std_dy = tn0['dy'].std()
tn0['norm_dx'] = (tn0['dx'] - mu_dx) / std_dx
tn0['norm_dy'] = (tn0['dy'] - mu_dy) / std_dy
tn0['chisq'] = tn0['norm_dx'] * tn0['norm_dx'] + tn0['norm_dy'] * tn0['norm_dy']

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['dx'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['dx'].describe()

#those that are outside and enter the city tend to move west
#those that are inside and exit the city tend to move east

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['dy'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['dy'].describe()

#those that are outside and enter the city tend to move south a lot more
#those that are inside and exit the city tend to move north a lot more

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['vx'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['vx'].describe()

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['vy'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['vy'].describe()

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['dist'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['dist'].describe()

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['speed'].hist(bins=200,xrot=90,figsize=(20,10),alpha=0.5,log=True)
tn0.groupby(['incenter_entry','incenter_exit'])['speed'].describe()

In [None]:
tn0.groupby(['incenter_entry','incenter_exit'])['chisq_entry'].hist(bins=100)
tn0.groupby(['incenter_entry','incenter_exit'])['chisq_entry'].describe()

In [None]:
tn0.columns
wanted = ['x_entry','x_exit','y_entry','y_exit',
          'norm_dx','norm_dy','chisq',
          'totalsecs_entry','totalsecs_exit','totalsecs_mid',
          'hour_entry','hour_exit','minute_entry','minute_exit','second_entry','second_exit',
         'dt','dx','dy','vx','vy','dist','speed',
          'incenter_entry','incenter_exit']


features = ['x_entry','y_entry',
          'totalsecs_entry','totalsecs_exit',
          'hour_entry','hour_exit','second_entry','second_exit',
           'dt','incenter_entry',
           'x_exit','y_exit','incenter_exit']

In [None]:
tn0[wanted].corr()['incenter_exit'].abs().sort_values(ascending=False)
#most correlated with x_exit - x_entry, y_entry, (totlasecs_exit), hour_exit, (totalsecs_entry) hour_entry', dt, incenter_entry, 
#most correlated with y_exit - y_entry, x_entry, incenter_entry, dt, (totalsecs_exit), hour_exit, (totalsecs_entry), hour_entry
#most correlated with incenter_exit - incenter_entry, y_entry, hour_entry, (totalsecs_exit), (totalsecs_entry), hour_exit, second_exit, x_entry

In [None]:
#normalise dt, time
(np.log(tn0['dt']) - np.log(tn0['dt']).mean()).hist(bins=100)
(np.log(tn0['dt']) - np.log(tn0['dt']).mean()).describe()
((2 * ((tn0['dt'] - tn0['dt'].min()) / (tn0['dt'].max() - tn0['dt'].min()))) - 1).hist(log=True)

In [None]:
#totalsecs_entry and totalsecs_exit are very different for tn0/df_copy vs final, otherwis they are similar


In [105]:
#it is more likely to be moving from the city to out compared to tn0, first_tn0
#more likely to stay where originally
#less likely to move from outside city to in
print('final\n',pd.crosstab(final_tn0['incenter_exit'],final_tn0['incenter_entry']) / final_tn0.shape[0], '\n')
print('first\n',pd.crosstab(first_tn0['incenter_exit'],first_tn0['incenter_entry']) / first_tn0.shape[0],'\n')
print('tn0\n',pd.crosstab(tn0['incenter_exit'],tn0['incenter_entry']) / tn0.shape[0])

KeyError: 'incenter_exit'

In [78]:
def aggregate(df,df3):
    df2 = df3.copy()
    
    #no need to aggregate for groups with just 1 entry
    hashgroup = df.groupby(['hash']).filter(lambda x: len(x) > 1).groupby(['hash'])
    features = {}
    
    features['ratio_in'] = hashgroup[['incenter_entry','incenter_exit']].apply(np.nanmean)
    features['avg_x'] = hashgroup[['x_entry','x_exit']].apply(np.nanmean)
    features['avg_y'] = hashgroup[['y_entry','y_exit']].apply(np.nanmean)
    features['total_dist'] = hashgroup['dist'].sum()
    df2['avg_speed'] = df2['total_dist'].divide(df2['total_dt'])
    
    for f in features:
        df2 = df2.merge(features[f].rename(f),left_on="hash",right_index=True,how='outer')
        
        
   #mean, median, max, min of x, y positions

    #position at origin
    f = ['hash','x_entry','y_entry','totalsecs_entry']
    fs = df.loc[df['n'] == 0][f]
    fs.columns = ['hash','x_0','y_0','t_0']
    df2 = df2.merge(fs,left_on="hash",right_on="hash",how='outer')
    df2["total_dt"] = df2["totalsecs_entry"] - df2["t_0"]  
    df2['net_dy'] = df2["y_entry"] - df2["y_0"]  
    df2['net_dx'] = df2["x_entry"] - df2["x_0"]
    
    
    #prior 2 positions
    f = ['hash',"x_exit","y_exit","x_entry","y_entry","totalsecs_exit",'totalsecs_entry',
         'dx','dy','dist','dt','vx','vy','speed']
    fs = df_copy.loc[df_copy['n2'] == 1][f]
    fs.columns = ['hash',"x_nm1","y_nm1","x_nm2","y_nm2","t_nm1",'t_nm2',
         'dx_nm2','dy_nm2','dist_nm2','dt_nm2','vx_nm2','vy_nm2','speed_nm2']
    
    df2 = df2.merge(fs,left_on="hash",right_on="hash",how='outer')
    df2['dt_nm1'] = df2['totalsecs_entry'] - df2['t_nm1']
    df2['dx_nm1'] = df2['x_entry'] - df2['x_nm1']
    df2['dy_nm1'] = df2['y_entry'] - df2['y_nm1']
    df2['dist_nm1'] = np.sqrt(df2['dx_nm1'] * df2['dx_nm1'] + df2['dy_nm1'] * df2['dy_nm1'])
    df2['vx_nm1'] = df2['dx_nm1'].divide(df2['dt_nm1'])
    df2['vy_nm1'] = df2['dy_nm1'].divide(df2['dt_nm1'])
    df2['speed_nm1'] = df2['dist_nm1'].divide(df2['dt_nm1'])

    return df2


final = df_copy[df_copy['hour_exit'] >= 15]
unknowns = ['x_exit','y_exit','dx','dy','vx','vy','dist','speed','n2','incenter_exit']
final = final.drop(unknowns,axis=1)
final = aggregate(df_copy,final)
final = final.set_index('id')

KeyError: 'id'

In [99]:
features = ['x_entry','y_entry',
          'totalsecs_entry','totalsecs_exit',
          'hour_entry','hour_exit','second_entry','second_exit',
           'dt','incenter_entry']
final_tn0 = final[final['dt'] > 0 ]
pd.merge(targets,final_tn0[features],left_index=True,right_index=True,how='outer').corr()['incenter_exit'].abs().sort_values(ascending=False)
#most correalted to incenter_exit - y_entry, dt, x_entry, second_exit, (totalsecs_exit), (totalsecs_entry), second_entry
#most correlated to x_exit - x_entry, y_entry, (totalsecs_exit), (totalsecs_entry), hour_entry, icnenter_entry, second_exit
#most correlated to y_exit - y_entry, x_entry, incenter_entry, (totalsecs_exit), dt, hour_exit, second_entry

incenter_exit      1.000000
incenter_entry     0.715186
x_exit             0.597513
x_entry            0.492121
y_exit             0.411266
y_entry            0.333567
totalsecs_exit     0.043013
totalsecs_entry    0.029132
hour_entry         0.014753
dt                 0.009748
hour_exit          0.003725
second_entry       0.003593
second_exit        0.003160
Name: incenter_exit, dtype: float64

In [97]:
final_tn0 = final[final['dt'] > 0 ]
origin_features = ['ratio_in','total_dist','x_0','y_0','t_0','total_dt','net_dy','net_dx','avg_speed','avg_x','avg_y']
#final.merge(pd.DataFrame(df_copy.loc[df_copy['n'] == 0,"x_entry"].rename('x_0')),how='outer')['x_0']

#f = ['hash',"x_exit","y_exit","x_entry","y_entry","totalsecs_exit",'totalsecs_entry','vx','vy','speed']
pd.merge(targets,final_tn0[origin_features],left_index=True,right_index=True,how='outer').corr()['incenter_exit'].abs().sort_values(ascending=False)

incenter_exit    1.000000
ratio_in         0.669356
x_exit           0.597513
avg_x            0.452412
y_exit           0.411266
x_0              0.344539
avg_y            0.313389
y_0              0.235726
net_dx           0.125480
total_dt         0.072345
t_0              0.070367
net_dy           0.055329
avg_speed        0.021460
total_dist       0.013163
Name: incenter_exit, dtype: float64

In [93]:
priors = ['x_nm1', 'y_nm1', 'x_nm2', 'y_nm2',
       't_nm1', 't_nm2', 'dx_nm2', 'dy_nm2', 'dist_nm2', 'dt_nm2', 'vx_nm2',
       'vy_nm2', 'speed_nm2', 'dt_nm1', 'dx_nm1', 'dy_nm1', 'dist_nm1',
       'avg_speed', 'vx_nm1', 'vy_nm1', 'speed_nm1']
pd.merge(targets,final_tn0[priors],left_index=True,right_index=True,how='outer').corr()['incenter_exit'].abs().sort_values(ascending=False)

incenter_exit    1.000000
x_exit           0.597513
x_nm1            0.449537
x_nm2            0.424332
y_exit           0.411266
y_nm1            0.300542
y_nm2            0.282230
dx_nm1           0.094703
dy_nm1           0.069142
dx_nm2           0.060394
dy_nm2           0.036272
dt_nm1           0.024212
avg_speed        0.021460
t_nm1            0.020889
t_nm2            0.020446
dist_nm1         0.013198
vx_nm2           0.012688
speed_nm2        0.005002
dt_nm2           0.004741
vy_nm2           0.003890
dist_nm2         0.003730
vx_nm1           0.003120
speed_nm1        0.003034
vy_nm1           0.001982
Name: incenter_exit, dtype: float64

In [None]:
#x_exit -  x_entry 0.783121, x_nm1 - 0.720306, x_nm2 - 0.686521, x_0  0.625893
#        ,  incenter_entry - 0.489912, ratio_in 0.479
#       -  , y_entry 0.263798 , y_nm1 - 0.240767, y_nm2 - 0.224722, 
#.      -  y_0 0.181784, net_dx - 0.113624, totalsecs_exit - 0.038642

#y_exit -  y_entry 0.734019, y_nm1 - 0.660640, y_nm2 - 0.635616,  y_0 - 0.613635, 
#       - incenter_entry - 0.321814, ratio_in - 0.315254
#       -  x_entry - 0.251824, x_nm1 - 0.228930, x_nm2 - 0.219261,  x_0 - 0.190885, net_dx - 0.048044, totalsecs_exit - 0.020937, dt - 0.018662

#incenter_exit - incenter_entry - 0.715186, ratio_in - 0.669356,  x_entry - 0.492121, 
#.             - x_nm1 -0.449537, x_nm2 - 0.424332, , x0 - 0.344539,  
#.             - y_entry - 0. 333567, y_nm1 -  0.300542, y_nm2 -  0.282230,  y0 - 0.235726,  
#              - total_dt - 0.072345, t0 - 0.070367, 
#              - totalsecs_exit - 0.043013, totalsecs_entry - 0.029132

bestfeatures = ['x_entry','y_entry','incenter_entry','x_nm1','x_nm2','y_nm1','y_nm2','total_dt','x_0','y_0']

In [None]:
#Kaggle Taxi Trajectory Prediction
#https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/discussion/14994#latest-83247
#http://blog.kaggle.com/2015/07/27/taxi-trajectory-winners-interview-1st-place-team-%F0%9F%9A%95/
#https://www.kaggle.com/c/pkdd-15-predict-taxi-service-trajectory-i/discussion/15020#latest-89300

#use KNN to compare points to similar trips
#kernel regression
#aggregate last leg of journey
#MLP
#Gaussian Process
#Kalman Filter
#Hidden Markov Model https://datascience.stackexchange.com/questions/27709/how-to-classify-movement-data-time-series-in-real-time

#https://datascience.stackexchange.com/questions/19392/using-time-series-data-from-a-sensor-for-ml/19394#19394
#https://datascience.stackexchange.com/questions/18481/how-to-train-model-to-predict-events-30-minutes-prior-from-multi-dimensionnal-t/18483#18483

#https://warwick.ac.uk/fac/cross_fac/complexity/study/emmcs/outcomes/studentprojects/gandhi.pdf
#https://arxiv.org/pdf/1508.00021.pdf