In [2]:
#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
pd.options.mode.chained_assignment = None


#read csv files
df = pd.read_csv("data_train.csv")
df2 = pd.read_csv("data_test.csv")

In [3]:
def df_transform(df2):
    df = df2.copy()
    df[["hour_entry", "minute_entry", "second_entry"]] = df["time_entry"].str.extract(r'(\d+):(\d+):(\d+)').astype(int)
    df[["hour_exit", "minute_exit", "second_exit"]] = df["time_exit"].str.extract(r'(\d+):(\d+):(\d+)').astype(int)
    df["t_nm1"] = (3600 * df["hour_entry"] + 60 * df["minute_entry"] + df["second_entry"]) 
    df["t_n"] = (3600 * df["hour_exit"] + 60 * df["minute_exit"] + df["second_exit"])
    
    [x_lower, x_upper, y_lower, y_upper] = [3750901.5068,3770901.5068, -19268905.6133,-19208905.6133]
    df["target"] =  ((df["x_exit"] >= x_lower) & (df["x_exit"] <= x_upper) & \
                             (df["y_exit"] >= y_lower) &  (df["y_exit"] <= y_upper)) * 1
    
    targets = df[df['hour_exit'] >= 15][['trajectory_id','x_exit','y_exit','target']]
    df.loc[df['hour_exit'] >= 15,['x_exit','y_exit']] = np.nan
    
    unwanted = ['Unnamed: 0', 'Unnamed: 0.1','vmean','vmin','vmax','time_entry','time_exit',
               'hour_entry','hour_exit','minute_entry','minute_exit','second_entry','second_exit','target']
    return df.drop(unwanted,axis=1).rename(index=str,columns={"trajectory_id":"id"}),\
           targets.rename(index=str,columns={"trajectory_id":"id"})

#process data
df_copy, targets = df_transform(df)
df2_copy, targets2 = df_transform(df2)

In [4]:
def aggregate(df):
    df_copy = df.copy()
    X = df_copy[['hash','id','x_entry','y_entry','t_nm1']]
    X2 = df_copy[['hash','id','x_exit','y_exit','t_n']]
    X.columns = ['hash','id','x','y','t']
    X2.columns = ['hash','id','x','y','t']
    X3 = pd.concat([X,X2])
    X3 = X3.drop_duplicates(subset=['hash','id','x','t'],keep='last')
    X3 = X3.sort_values(by='t',kind='mergesort')
    X3['n'] = X3.groupby(['hash']).cumcount()
    X3['n2'] = X3.groupby(['hash']).cumcount(ascending=False)
    
    final = X3[X3['n2'] == 0][['hash','id','t','n','n2']]
    right = pd.DataFrame(X3.groupby(['hash'])['x'].apply(lambda x : np.nansum(np.abs(np.diff(x)))))
    right.columns = ['absum_dx']                
    final = pd.merge(final,right,left_on='hash',right_index=True,how='inner')
    right = pd.DataFrame(X3.groupby(['hash'])['y'].apply(lambda x : np.nansum(np.abs(np.diff(x)))))
    right.columns = ['absum_dy']
    final = pd.merge(final,right,left_on='hash',right_index=True,how='inner')
    
    params = ['x_nm','y_nm','t_nm']
    for i in range (1,4):
        current_params = [x+str(i) for x in params]
        right = X3[X3['n2'] == i][['hash','x','y','t']]
        right.columns = ['hash'] + current_params
        final = pd.merge(final,right,left_on='hash',right_on='hash',how='outer')

    params = ['x_','y_','t_']

    for i in range (0,2):
        current_params = [x+str(i) for x in params]
        right = X3[X3['n'] == i][['hash','x','y','t']]
        right.columns = ['hash'] + current_params
        final = pd.merge(final,right,left_on='hash',right_on='hash',how='outer')
    return final.drop(['hash','n2'],axis=1)

    #fill values with not enough length

final = aggregate(df_copy)
final2 = aggregate(df2_copy)

In [5]:
def df_transform2(final, targets):
    data = final.copy()
    [x_lower, x_upper, y_lower, y_upper] = [3750901.5068,3770901.5068, -19268905.6133,-19208905.6133]
    data['dt'] = data['t'] - data['t_nm1']
    data['sum_dt'] = data['t'] - data['t_0']
    data['dist'] = np.sqrt(data['absum_dy'] * data['absum_dy'] + data['absum_dx'] * data['absum_dx'])

    data['in_entry'] = ((data["x_nm1"] >= x_lower) & (data["x_nm1"] <= x_upper) & \
                                 (data["y_nm1"] >= y_lower) &  (data["y_nm1"] <= y_upper)) * 1
    #data.loc[data['x_nm2'].isna() == True,'x_nm2'] = data.loc[data['x_nm2'].isna() == True,'x_nm1']
#data.loc[data['y_nm2'].isna() == True,'y_nm2'] = data.loc[data['y_nm2'].isna() == True,'y_nm1']
#data.loc[data['x_nm3'].isna() == True,'x_nm3'] = data.loc[data['x_nm3'].isna() == True,'x_nm2']
#data.loc[data['y_nm3'].isna() == True,'y_nm3'] = data.loc[data['y_nm3'].isna() == True,'y_nm2']
    
    data['dx'] = data['x_nm1'] - data['x_nm2']
    data['dy'] = data['y_nm1'] - data['y_nm2']
    data['dx2'] = data['x_1'] - data['x_0']
    data['dy2'] = data['y_1'] - data['y_0']
    data['dx3'] = data['x_nm1'] - data['x_0']
    data['dy3'] = data['y_nm1'] - data['y_0']
    data['dx4'] = data['x_nm1'] - data['x_nm3']
    data['dy4'] = data['y_nm1'] - data['y_nm3']

    return data

final = df_transform2(final,targets)
final2 = df_transform2(final2,targets2)
data = pd.merge(final,targets,left_on='id',right_on='id',how='inner')


In [None]:
import xgboost as xgb
#Best - CV 0.8870269975190004, Train 0.9619019941600586, n_estimators = 200, max_depth = 9, gamma = 0.5
# CV - 0.8878599904986961, 0.915913887716336 - n_estimators = 100, max_depth = 8, gamma=0.3
#CV - 0.8872853828082421, Train: 0.9491159026661493- n_estimators = 100, max_depth = 10, gamma =0.2

features2 = ['x_nm1','y_nm1','dt','x_nm2','y_nm2','x_nm3','y_nm3',
             'x_0','y_0','dist','t_0','y_1','x_1','t_1'] +\
            ['dx','dy','dx2','dy2','dx4','dy4']

features = ['x_nm1','y_nm1','dt','x_nm3','y_nm3',
             'x_0','y_0','dist','t_0','y_1','x_1','t_1'] +\
            ['dx','dy','dx2','dy2','dx4','dy4','t_nm1']
 



xgbc=xgb.XGBClassifier(n_estimators=200, max_depth=9,gamma=0.5,scale_pos_weights=2.5) 
print("CV: {}".format(cross_val_score(xgbc,data[features],data["target"],cv=2,scoring="f1").mean()))

xgbc.fit(data[features],data["target"])
xgb.plot_importance(xgbc)
data['test'] = xgbc.predict(data[features])
print("Train: {}".format(f1_score(data['target'],data['test'])))
pd.crosstab(data['test'],data['target'])


CV: 0.885202450180633


In [45]:
data.columns

Index(['id', 't', 'n', 'absum_dx', 'absum_dy', 'x_nm1', 'y_nm1', 't_nm1',
       'x_nm2', 'y_nm2', 't_nm2', 'x_nm3', 'y_nm3', 't_nm3', 'x_0', 'y_0',
       't_0', 'x_1', 'y_1', 't_1', 'dt', 'sum_dt', 'dist', 'in_entry', 'dx',
       'dy', 'dx2', 'dy2', 'dx3', 'dy3', 'dx4', 'dy4', 'x_exit', 'y_exit',
       'target', 'test'],
      dtype='object')

In [38]:
final.columns

Index(['id', 't', 'n', 'absum_dx', 'absum_dy', 'x_nm1', 'y_nm1', 't_nm1',
       'x_nm2', 'y_nm2', 't_nm2', 'x_nm3', 'y_nm3', 't_nm3', 'x_0', 'y_0',
       't_0', 'x_1', 'y_1', 't_1', 'dt', 'sum_dt', 'dist', 'in_entry', 'dx',
       'dy', 'dx2', 'dy2', 'dx3', 'dy3', 'dx4', 'dy4'],
      dtype='object')

In [41]:
final2['target'] = xgbc.predict(final2[features])
#final2 = final2.set_index('id')
final2['target'].to_csv("output.csv",index=True,header=True)

In [None]:
def normalise(df):
    df_copy = df.copy()
    
    
    for x in range(1,4):
        df_copy['norm_xnm' + str(x)] = (df_copy['x_nm'+str(i)] - df_copy['x_nm1'].mean()) / df_copy['x_nm1'].std()
        df_copy['norm_ynm' + str(x)] = (df_copy['y_nm'+str(i)] - df_copy['y_nm1'].mean()) / df_copy['y_nm1'].std()
    
    df_copy['norm_x0'] = (df_copy['x_0'] - df_copy['x_nm1'].mean()) / df_copy['x_nm1'].mean()
    df_copy['norm_y0'] = (df_copy['y_0'] - df_copy['y_nm1'].mean()) / df_copy['y_nm1'].mean()
    df_copy['norm_dt'] = (df_copy['dt'] - df_copy['dt'].mean()) / df_copy['dt'].std()
    
    return df_copy

In [None]:
data.loc[data['x_nm2'].isna() == True,'x_nm2'] = data.loc[data['x_nm2'].isna() == True,'x_nm1']
data.loc[data['y_nm2'].isna() == True,'y_nm2'] = data.loc[data['y_nm2'].isna() == True,'y_nm1']
data.loc[data['x_nm3'].isna() == True,'x_nm3'] = data.loc[data['x_nm3'].isna() == True,'x_nm2']
data.loc[data['y_nm3'].isna() == True,'y_nm3'] = data.loc[data['y_nm3'].isna() == True,'y_nm2']


normfeatures = ['norm_xnm1', 'norm_ynm1','norm_x0','norm_y0','norm_xnm2','norm_ynm2','norm_xnm3','norm_ynm3']
test = normalise(data)
test2 = test[test['dt'] > 0]



