In [1]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
pd.options.mode.chained_assignment = None


# Load the training and test trajectory tables from the working directory.
df, df2 = (pd.read_csv(csv_name) for csv_name in ("data_train.csv", "data_test.csv"))

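The task is to predict whether each car will be inside the city center at its final exit, based on its starting x,y coordinates, time_entry and time_exit.
There are two ways to approach this:
1. predict a binary categorical variable (in the city center or not), or
2. predict the exit x,y coordinates and check whether they fall inside the city-center bounding box.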

Candidate stratification variables: time, hash_id, and whether the car starts inside the city center.


#Baseline for Final Entries 0.866589156610405

#submission 1 - Baseline, (instationary_start = instationary_end)  = Logistic Regression 
#submission 2 - K-Nearest Neighbours, trained on in group, out group respectively
#submission 3 - XGBoost on overall set
#submission 4 - XGBoost, improved by adding origin

Data exploration shows that the prior x,y coordinates are the best predictors, with their relevance decaying the further back in time the entry lies.

In [2]:
def _clock_to_parts(clock):
    """Split a Series of 'H:M:S' strings into integer columns 0 (hour), 1 (minute), 2 (second)."""
    return clock.str.extract(r'(\d+):(\d+):(\d+)').astype(int)


def df_transform(df2):
    """Derive time, city-centre and ordering features and hide the late-exit targets.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Raw trajectory table. Must contain ``hash``, ``trajectory_id``,
        ``time_entry``, ``time_exit``, ``x_entry``, ``y_entry``, ``x_exit``
        and ``y_exit``. The frame is copied, never modified.

    Returns
    -------
    (features, targets) : tuple of pandas.DataFrame
        ``features`` has ``trajectory_id`` renamed to ``id`` and gains
        ``t_nm1`` / ``t_n`` (entry / exit time in seconds since midnight),
        ``in_entry`` / ``in_exit`` (1 if the point lies in the city-centre box),
        ``dt`` (exit minus entry time), ``n`` (0-based position of the row
        within its ``hash`` group, in file order) and ``n2`` (same, counted
        from the end). For rows whose exit hour is >= 15 the columns
        ``x_exit``, ``y_exit`` and ``in_exit`` are set to NaN.
        ``targets`` holds ``id``, ``x_exit``, ``y_exit`` and ``in_exit`` of
        exactly those rows, captured before they are blanked.
        Both frames have their index labels converted to strings.
    """
    df = df2.copy()

    entry = _clock_to_parts(df["time_entry"])
    exit_ = _clock_to_parts(df["time_exit"])
    df["t_nm1"] = 3600 * entry[0] + 60 * entry[1] + entry[2]
    df["t_n"] = 3600 * exit_[0] + 60 * exit_[1] + exit_[2]

    # City-centre bounding box; both bounds are inclusive.
    [x_lower, x_upper, y_lower, y_upper] = [3750901.5068,3770901.5068, -19268905.6133,-19208905.6133]
    df["in_entry"] = (df["x_entry"].between(x_lower, x_upper)
                      & df["y_entry"].between(y_lower, y_upper)) * 1
    df["in_exit"] = (df["x_exit"].between(x_lower, x_upper)
                     & df["y_exit"].between(y_lower, y_upper)) * 1

    # Rows exiting at 15:00 or later are the prediction targets: keep their
    # true exit values aside, then blank them so no feature can leak them.
    late = exit_[0] >= 15
    hidden = ['x_exit', 'y_exit', 'in_exit']
    targets = df.loc[late, ['trajectory_id'] + hidden]
    # Cast first: writing NaN into an integer column relies on an implicit
    # upcast that newer pandas versions deprecate.
    df[hidden] = df[hidden].astype(float)
    df.loc[late, hidden] = np.nan

    df["dt"] = df["t_n"] - df["t_nm1"]

    by_hash = df.groupby(['hash'])
    df['n'] = by_hash.cumcount()
    df['n2'] = by_hash.cumcount(ascending=False)

    # errors='ignore': the 'Unnamed: *' columns only exist when the CSV was
    # saved with its index, so their absence must not abort the transform.
    unwanted = ['Unnamed: 0', 'Unnamed: 0.1', 'vmean', 'vmin', 'vmax', 'time_entry', 'time_exit']
    id_rename = {"trajectory_id": "id"}
    features = df.drop(columns=unwanted, errors='ignore').rename(index=str, columns=id_rename)
    return features, targets.rename(index=str, columns=id_rename)

#process data
# Training set: engineered features plus the held-out exit values of the
# late (exit hour >= 15) trajectories.
df_copy, targets = df_transform(df)

# Test set: only the features are kept; the second return value is discarded.
# NOTE(review): binding `_` shadows IPython's last-output variable in this kernel.
df2_copy, _ = df_transform(df2)

In [None]:
def aggregate(df,df3):
    """Attach per-device history features to the rows of ``df3``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trajectory rows to draw history from. Reads the columns ``hash``,
        ``n``, ``n2``, ``x_entry``, ``y_entry``, ``x_exit``, ``y_exit``,
        ``t_nm1`` and ``t_n``.
    df3 : pandas.DataFrame
        Rows to enrich (copied, not modified). Must contain ``hash``,
        ``x_entry``, ``y_entry`` and ``t_nm1``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df3`` (with a fresh integer index) plus:

        * ``x_0, y_0, t_0`` - entry position/time of the hash's row with ``n == 0``;
        * ``x_1, y_1`` - exit position of that row, falling back to
          ``x_0, y_0`` when the exit is NaN;
        * ``sum_dt_nm1`` - ``t_nm1 - t_0``;
        * ``x_nm2, y_nm2, x_nm3, y_nm3, t_nm3, t_nm2`` - exit position, entry
          position, entry time and exit time of the hash's row with
          ``n2 == 1``; NaN when the hash has no such row;
        * ``dx_nm1, dy_nm1, dx_nm2, dy_nm2`` - coordinate differences between
          consecutive known positions.
    """
    rows = df3.copy()

    # Position at origin: the first row (n == 0) of every hash.
    origin = df.loc[df['n'] == 0, ['hash', 'x_entry', 'y_entry', 't_nm1', 'x_exit', 'y_exit']].rename(
        columns={'x_entry': 'x_0', 'y_entry': 'y_0', 't_nm1': 't_0',
                 'x_exit': 'x_1', 'y_exit': 'y_1'})
    rows = rows.merge(origin, on='hash', how='inner')
    rows['sum_dt_nm1'] = rows['t_nm1'] - rows['t_0']
    # When the first row's exit is unknown, reuse its entry point.
    rows['x_1'] = rows['x_1'].fillna(rows['x_0'])
    rows['y_1'] = rows['y_1'].fillna(rows['y_0'])

    # Second-to-last row (n2 == 1) of every hash.
    prior = df.loc[df['n2'] == 1, ['hash', 'x_exit', 'y_exit', 'x_entry', 'y_entry', 't_nm1', 't_n']].rename(
        columns={'x_exit': 'x_nm2', 'y_exit': 'y_nm2',
                 'x_entry': 'x_nm3', 'y_entry': 'y_nm3',
                 't_nm1': 't_nm3', 't_n': 't_nm2'})
    # Left join: keep exactly the rows of df3. An outer join would append a
    # row of NaNs (including a NaN id) for every hash present in `prior` but
    # absent from df3.
    rows = rows.merge(prior, on='hash', how='left')

    # Differences stay NaN for hashes without a second-to-last row.
    rows['dx_nm1'] = rows['x_entry'] - rows['x_nm2']
    rows['dy_nm1'] = rows['y_entry'] - rows['y_nm2']
    rows['dx_nm2'] = rows['x_nm2'] - rows['x_nm3']
    rows['dy_nm2'] = rows['y_nm2'] - rows['y_nm3']

    return rows


# Exit time (seconds since midnight) from which a trajectory is a prediction row.
FINAL_START = 15*60*60
unknowns = ['x_exit','y_exit','n2','in_exit','dx','dy','vx','vy','dist','speed']


def build_final(frame):
    """Return the late-exit rows of ``frame`` enriched by ``aggregate`` and indexed by ``id``.

    Columns listed in ``unknowns`` are removed first. ``errors='ignore'`` is
    required because ``dx, dy, vx, vy, dist, speed`` are not produced by
    ``df_transform``, so a plain ``drop`` raises ``KeyError`` when they are absent.
    """
    late = frame[frame['t_n'] >= FINAL_START].drop(columns=unknowns, errors='ignore')
    return aggregate(frame, late).set_index('id')


# Training rows; *_tn0 keeps only trajectories with a positive duration.
final = build_final(df_copy)
final_tn0 = final[final['dt'] > 0]

# Test rows, built the same way.
final2 = build_final(df2_copy)
final2_tn0 = final2[final2['dt'] > 0]

In [None]:
# Baseline overall - f1: 0.8876970163859333
#f1_score(df_copy["incenter_entry"],df_copy["incenter_exit"])
#pd.crosstab(df_copy["incenter_entry"],df_copy["incenter_exit"])
#classes imbalanced, out : in = 2.348 : 1 
#challenge is to predict those that move from inside to outside




# Baseline for final  - 0.866589156610405
#baseline for final_tn0 - 0.7211841921519341, 0.7424115867620404 with XGBoost
#f1_score(final["incenter_entry"],final["incenter_exit"])
#pd.crosstab(final["incenter_entry"],final["incenter_exit"])
#Class imbalance rises to 2.7 : 1


#Using XGBOOST
#features = ['x_entry', 'y_entry','dt','totalsecs_entry', 'incenter_entry'] Best initial features on overall set
#improvement to 0.8732285592178957 by training on 

In [None]:
#Feature Selection; XGboost
import xgboost as xgb

bestfeatures = ['x_entry','y_entry','x_nm2','x_nm3','y_nm2','y_nm3',
                'x_1','y_1','x_0','y_0',
                'dt','sum_dt_nm1','in_entry',
                ] #+ ['dx_nm1','dy_nm1','t_nm1','dx_nm2','dy_nm2']

# `targets` keeps the stringified row numbers as its index while `final_tn0`
# is indexed by trajectory id, so the two must be joined on the id.
data = pd.merge(targets.set_index('id'),final_tn0,left_index=True,right_index=True,how='inner')

# scale_pos_weight (singular) is the XGBoost parameter name; the misspelt
# `scale_pos_weights` never reached the booster as a class weight.
# early_stopping_rounds is omitted: it needs an eval_set at fit time, which
# neither cross_val_score nor the plain fit() below provides.
xgbc=xgb.XGBClassifier(n_estimators=100,
                       max_depth=8,
                       learning_rate=0.2,
                       gamma=0.3,
                       scale_pos_weight=2.5)
print(cross_val_score(xgbc,data[bestfeatures],data["in_exit"],cv=2,scoring="f1").mean())

# Refit on all labelled rows; the confusion table below is therefore in-sample.
xgbc.fit(data[bestfeatures],data["in_exit"])
xgb.plot_importance(xgbc)
pd.crosstab(data['in_exit'],xgbc.predict(data[bestfeatures]))


In [None]:
def normalise(df2, df3):
    """Return a copy of ``df2`` with standardised ``norm_*`` feature columns added.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Frame to normalise. Reads ``dt``, ``in_entry``, the x columns
        (``x_entry, x_nm2, x_nm3, x_0, x_1``), the matching y columns, and
        ``dx_nm1, dy_nm1, dx_nm2, dy_nm2``.
    df3 : pandas.DataFrame
        Unused; kept so existing calls keep working. The previous version
        derived population means/stds from it but never applied them.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df2`` plus:

        * ``norm_dt`` - z-score of ``dt``; ``minmax_dt`` - ``dt`` rescaled to [-1, 1];
        * ``norm_in_entry`` - ``in_entry`` mapped from {0, 1} to {-1, 1};
        * ``norm_<col>`` for every x / y column, centred and scaled with the
          mean and standard deviation of ``x_entry`` / ``y_entry``;
        * ``norm_dx_*`` / ``norm_dy_*`` - displacements divided by the same
          standard deviations (differences need no centring).
    """
    df = df2.copy()

    dt = df["dt"]
    df["norm_dt"] = (dt - dt.mean()) / dt.std()
    df["minmax_dt"] = 2 * (dt - dt.min()) / (dt.max() - dt.min()) - 1
    df['norm_in_entry'] = 2 * df['in_entry'] - 1

    # One shared scale per axis, so every position on that axis stays comparable.
    x_mean, x_std = df['x_entry'].mean(), df['x_entry'].std()
    y_mean, y_std = df['y_entry'].mean(), df['y_entry'].std()

    for col in ['x_entry','x_nm2','x_nm3','x_0','x_1']:
        df['norm_' + col] = (df[col] - x_mean) / x_std
    for col in ['y_entry','y_nm2','y_nm3','y_0','y_1']:
        df['norm_' + col] = (df[col] - y_mean) / y_std

    # Each normalised displacement comes from its own raw column
    # (previously dy_nm1 read dy_nm2 and dx_nm2 read dx_nm1).
    df['norm_dx_nm1'] = df['dx_nm1'] / x_std
    df['norm_dy_nm1'] = df['dy_nm1'] / y_std
    df['norm_dx_nm2'] = df['dx_nm2'] / x_std
    df['norm_dy_nm2'] = df['dy_nm2'] / y_std

    return df

# Normalised copy of the positive-duration training rows (final_tn0).
test = normalise(final_tn0,df_copy)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier ,RadiusNeighborsClassifier

#cross val #0.8752232230385206
test = normalise(final_tn0,df_copy)

#bestfeatures = ['x_entry','y_entry','x_nm2','x_nm3','y_nm2','y_nm3',
#                'x_1','y_1','x_0','y_0',
#                'dt','in_entry',
#                ]

#bestfeatures = ['x_entry','y_entry','in_entry','dt','x_nm2','y_nm2','x_nm3','y_nm3','x_1','y_1','x_0','y_0']

# Raw feature names and the names of their normalised counterparts
# (every entry has a matching 'norm_<name>' column produced by normalise()).
bestfeatures = ['x_entry','y_entry','x_nm2','y_nm2','x_0', 'y_0','x_1','y_1','dt','in_entry']
norm_features = ['norm_' + x for x in bestfeatures]

#

#data = pd.merge(test,targets,left_index=True,right_index=True,how='inner')
#print(cross_val_score(mlpc,data[bestfeatures],data["in_exit"],cv=2,scoring="f1").mean())



# `test` is indexed by trajectory id, whereas `targets` keeps stringified row
# numbers as its index and the id as a column, so join on the id.
data = pd.merge(test,targets.set_index('id'),left_index=True,right_index=True,how='inner')
# Split by whether the trajectory starts inside the city centre.
data_in =  data[data['in_entry'] == 1]
data_out =  data[data['in_entry'] == 0]
# NOTE(review): norm_x_nm2 / norm_y_nm2 are NaN for devices without a
# second-to-last trajectory and MLPClassifier rejects NaN input - confirm
# whether such rows need imputing or dropping first.
mlpc = MLPClassifier(warm_start=True,solver='adam',alpha=0.5,early_stopping = True, activation='relu',max_iter=200)
print(cross_val_score(mlpc,data[norm_features],data["in_exit"],cv=2,scoring="f1").mean())
#knn = KNeighborsClassifier(p=1,n_neighbors=50,weights='uniform')
#knn = RadiusNeighborsClassifier()
#print(cross_val_score(knn,data_in[norm_features],data_in["in_exit"],cv=2,scoring="f1").mean())


#for traj>1
#0.8846013995893918 n-neighbors = 8
#Best features: Best params - p = 1 (Manhattan distance), weights = distance
#norm_features = ["norm_xentry","norm_yentry",'norm_timedelta','incenter_start','norm_xmean','norm_ymean',
#                 'norm_xorigin','norm_yorigin','norm_xmedian','norm_ymedian']



#Best features for those with 1 traj, 
#norm_features = ["norm_xentry","norm_yentry",'norm_timedelta','incenter_start']
#params - n-neighbors = 8, p=1, weights= distance





#print(cross_val_score(knn_final,norm_df[norm_features],norm_df["incenter_end"],cv=2,scoring="f1").mean())
#knn_final.fit(norm_final[norm_features],norm_final["incenter_end"])
#norm_final["knn"] = norm_final[norm_features]


# Final - 0.7525773195876289
#from sklearn.svm import LinearSVC
#svc = SVC(gamma='auto')
#print(cross_val_score(svc,data[bestfeatures],data["in_exit"],cv=2,scoring="f1").mean())





In [None]:
# Naive baseline on the rows that start inside the centre: predict that the
# exit state equals the entry state, and score it with F1.
print(f1_score(data_in['in_exit'],data_in['in_entry']))
# Contingency table of true exit state vs. entry state for rows that start outside.
pd.crosstab(data_out['in_exit'],data_out['in_entry'])

In [None]:
# Histogram (100 bins) of norm_dx_nm2, drawn once per (in_entry, in_exit) group.
data.groupby(['in_entry','in_exit'])['norm_dx_nm2'].hist(bins=100)

In [None]:
# Gaussian process classifier, cross-validated on the rows that start inside the centre.
from sklearn.gaussian_process import GaussianProcessClassifier
gpc = GaussianProcessClassifier()

# NOTE(review): this rebinds the notebook-wide `bestfeatures` / `norm_features`;
# any later cell that reads them sees this single-feature list unless it
# redefines them first.
bestfeatures = ['in_entry']
norm_features = ['norm_' + x for x in bestfeatures]



#Final -0.7650575946895743
# NOTE(review): data_in is filtered on in_entry, so norm_in_entry looks
# constant within it - confirm this is the intended feature set.
print(cross_val_score(gpc,data_in[norm_features],data_in["in_exit"],cv=2,scoring="f1").mean())





In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Normalised inputs produced by normalise(); listed explicitly so this cell
# does not depend on whichever feature list another cell bound last.
nn_features = ['norm_x_entry', 'norm_y_entry', 'norm_x_nm2', 'norm_y_nm2',
               'norm_x_0', 'norm_y_0', 'norm_x_1', 'norm_y_1',
               'norm_dt', 'norm_in_entry']

# Rows with missing inputs are dropped: NaN features would propagate into the loss.
nn_data = data.dropna(subset=nn_features + ['in_exit'])
X_train, X_test, y_train, y_test = train_test_split(
    nn_data[nn_features].to_numpy(dtype='float32'),
    nn_data['in_exit'].to_numpy(dtype='int64'),
    test_size=0.2,
    random_state=0)

model = tf.keras.models.Sequential([
  # One flat vector of len(nn_features) values per sample.
  tf.keras.layers.Flatten(input_shape=(len(nn_features),)),
  tf.keras.layers.Dense(512, activation=tf.nn.relu),
  tf.keras.layers.Dropout(0.2),
  # Two output units: the label in_exit is binary (0 = outside, 1 = inside).
  tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5)
model.evaluate(X_test, y_test)


In [None]:
# City-centre bounding box (same constants as in df_transform); needed to turn
# predicted exit coordinates back into an inside/outside label.
[x_lower, x_upper, y_lower, y_upper] = [3750901.5068,3770901.5068, -19268905.6133,-19208905.6133]


from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.svm import SVR


# Candidate base regressors; only `lr` is evaluated below.
lr = LinearRegression()
ridge = Ridge()
# `lasso` used to be bound to ElasticNet(); each name now holds the estimator it names.
lasso = Lasso()
enet = ElasticNet()
kr = KernelRidge()
#rf = RandomForestRegressor(n_estimators=100)

#mor = MultiOutputRegressor(lr)
dp = DotProduct()

# One independent regressor per target column (x_exit, y_exit).
mor = MultiOutputRegressor(lr)


X = test

bestfeatures = ['x_entry','y_entry','x_nm2','y_nm2','x_0', 'y_0','x_1','y_1','dt','in_entry']
norm_features = ['norm_' + x for x in bestfeatures]

# 2-fold cross-validated score (the estimator's default scorer), computed
# separately for rows starting inside and outside the centre.
# NOTE(review): x_nm2 / y_nm2 can be NaN and LinearRegression rejects NaN
# input - confirm whether imputation is needed.
print(cross_val_score(mor,data_in[bestfeatures],data_in[['x_exit','y_exit']], cv=2).mean())
print(cross_val_score(mor,data_out[bestfeatures],data_out[['x_exit','y_exit']], cv=2).mean())




#mor.fit(X[bestfeatures], X['x_exit','y_exit'])
#x = pd.DataFrame(mor.predict(X[["norm_yentry","norm_xentry","norm_timedelta","norm_totalsecsentry"]]))
#x.columns=['x_preds','y_preds']
#X = X.reset_index()
#X[['x_preds','y_preds']] = x
#X[['x_preds']] *=  df_copy["x_entry"].std()
#X[['x_preds']] +=  df_copy["x_entry"].mean()
#X[['y_preds']] *=  df_copy["y_entry"].std()
#X[['y_preds']] +=  df_copy["y_entry"].mean()


#X['target'] = (X["x_preds"] >= x_lower) & (X["x_preds"] <= x_upper) & (X["y_preds"] >= y_lower) & (X["y_preds"] <= y_upper)

#f1_score(X['target'], X['incenter_end'])

In [None]:
# Predict with exactly the columns the classifier was fitted on. The global
# `bestfeatures` is rebound by several later cells, so reading it here can
# hand the model a different feature set than it was trained with.
model_features = xgbc.get_booster().feature_names

moving = final2['dt'] > 0
stationary = final2['dt'] == 0

# Trajectories with a positive duration get the model's prediction ...
final2.loc[moving,"target"] = xgbc.predict(final2.loc[moving, model_features])
# ... zero-duration ones cannot have moved, so they keep their entry state.
final2.loc[stationary,"target"] = final2.loc[stationary,"in_entry"]
#X_pred = output.values.reshape(-1,10)
#output["target"] = knn.predict(X_pred)
#output[['trajectory_id','target']]
final2['target'].to_csv("output.csv",index=True,header=True)

In [None]:
# Distribution of the submitted labels (value_counts omits NaN by default).
final2['target'].value_counts()