In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
def timestr_to_secs(timestr):
    return sum(x * int(t) for x, t in zip([3600, 60, 1], timestr.split(':'))) # zip => {(3600,HH), (60,MM), (1,SS)}

In [3]:
def prepare_dataset(data, test_set=False):
    
    # Change time to secs
    time_entry = data.time_entry

    time_exit = data.time_exit

    time_entry_secs = map(timestr_to_secs, time_entry)

    time_exit_secs = map(timestr_to_secs, time_exit)

    data['time_entry_secs'] = list(time_entry_secs)

    data['time_exit_secs'] = list(time_exit_secs)
    
    # Last trajectory of each device    
    data_last_row = data.groupby('hash').last()
    
    # All previous trajectories except from last
    data_prev = data.set_index('trajectory_id').drop(data_last_row.trajectory_id)
    
    # Removing devices that only have one trajectory
    single_devices = set(data_prev.hash).symmetric_difference(data_last_row.index)

    data_last_row = data_last_row.drop(list(single_devices))
    
    # Setting new variables for previous trajectories
    prev_x_entry = data_prev.groupby('hash').first().x_entry
    prev_y_entry = data_prev.groupby('hash').first().y_entry
    prev_x_exit = data_prev.groupby('hash').last().x_exit
    prev_y_exit = data_prev.groupby('hash').last().y_exit
    prev_time_entry_secs = data_prev.groupby('hash').first().time_entry_secs
    prev_time_exit_secs = data_prev.groupby('hash').last().time_exit_secs
    prev_vmean = data_prev.groupby('hash').mean().vmean

    data_last_row['prev_x_entry'] = prev_x_entry
    data_last_row['prev_y_entry'] = prev_y_entry
    data_last_row['prev_x_exit'] = prev_x_exit
    data_last_row['prev_y_exit'] = prev_y_exit
    data_last_row['prev_time_entry_secs'] = prev_time_entry_secs
    data_last_row['prev_time_exit_secs'] = prev_time_exit_secs
    data_last_row['prev_vmean'] = prev_vmean
    
    #Setting new variables for devices with single trajectory
    data_single_devices = data.set_index('hash').loc[single_devices,:]

    data_single_devices['prev_x_entry'] = data_single_devices.x_entry
    data_single_devices['prev_y_entry'] = data_single_devices.y_entry
    data_single_devices['prev_x_exit'] = data_single_devices.x_exit
    data_single_devices['prev_y_exit'] = data_single_devices.y_exit
    data_single_devices['prev_time_entry_secs'] = data_single_devices.time_entry_secs
    data_single_devices['prev_time_exit_secs'] = data_single_devices.time_exit_secs
    data_single_devices['prev_vmean'] = data_single_devices.vmean
    
    data_new = pd.concat([data_last_row, data_single_devices]).sample(frac=1, random_state=11)
    
    if test_set == False:
        city_center_bool = (data_new.x_exit >= 3750901.5068) & (data_new.x_exit <= 3770901.5068) & (data_new.y_exit >= -19268905.6133) & (data_new.y_exit <= -19208905.6133)
        city_center = list(map(int, city_center_bool))

        data_new['city_center'] = city_center
    
    return data_new

In [4]:
dataset = pd.read_csv(r'data_train.csv')

data = prepare_dataset(dataset)

data.head()

Unnamed: 0_level_0,Unnamed: 0,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,...,time_entry_secs,time_exit_secs,prev_x_entry,prev_y_entry,prev_x_exit,prev_y_exit,prev_time_entry_secs,prev_time_exit_secs,prev_vmean,city_center
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7b2f32798d8b0eeb88f258806aaedb6e_19,390631,traj_7b2f32798d8b0eeb88f258806aaedb6e_19_2,15:00:57,15:00:57,0.0,0.0,0.0,3763440.364,-19286285.0,3763440.364,...,54057,54057,3763599.551,-19288121.42,3763520.514,-19285929.25,427,52358,,0
442a8450dd1ef5b74331994f4fdb6f2e_19,217764,traj_442a8450dd1ef5b74331994f4fdb6f2e_19_11,15:01:10,15:11:09,22.68,22.68,22.68,3756273.615,-19226931.32,3756711.101,...,54070,54669,3748638.211,-19162480.32,3746438.538,-19289533.53,18086,53371,7.56,0
0e439e0b14e411904a2617d807201423_11,45927,traj_0e439e0b14e411904a2617d807201423_11_0,15:31:29,15:31:29,,,,3771563.347,-19114672.78,3771563.347,...,55889,55889,3771563.347,-19114672.78,3771563.347,-19114672.78,55889,55889,,0
72705dd6d48ab5661a8a2793fef783ab_5,364108,traj_72705dd6d48ab5661a8a2793fef783ab_5_5,15:13:00,15:13:00,0.0,0.0,0.0,3775979.392,-19197153.77,3775979.392,...,54780,54780,3775699.98,-19202260.01,3775494.039,-19195083.0,26533,33568,0.0,0
c6b67535c8129788e716ff7c4f1e7d74_11,632248,traj_c6b67535c8129788e716ff7c4f1e7d74_11_7,14:53:20,15:03:10,0.0,0.0,0.0,3769866.838,-19326080.9,3769871.291,...,53600,54190,3772438.319,-19332828.19,3769860.159,-19328067.38,31167,47832,0.0,0


In [5]:
X = data.filter(['x_entry', 'y_entry', 'time_entry_secs', 'time_exit_secs',
                 'vmean', 'prev_x_entry', 'prev_y_entry', 
                 'prev_x_exit', 'prev_y_exit', 'prev_time_entry_secs', 
                 'prev_time_exit_secs', 'prev_vmean'], axis=1).to_numpy()

y = np.concatenate(data.filter(['city_center']).to_numpy())

X.shape, y.shape

((134063, 12), (134063,))

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=11)

model = xgb.XGBClassifier(max_depth=9, min_child_weight=1)
model.fit(X_train, y_train)

predictions = model.predict(X_val)
y_pred = [round(value) for value in predictions]
acc = accuracy_score(y_val, y_pred)

acc*100.0

94.06631111774139

In [7]:
model = xgb.XGBClassifier(max_depth=9, min_child_weight=1)
model.fit(X, y)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


In [8]:
test_dataset = pd.read_csv(r'data_test.csv')

test_data = prepare_dataset(test_dataset, test_set=True)

test_data.head()

Unnamed: 0_level_0,Unnamed: 0,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,time_entry_secs,time_exit_secs,prev_x_entry,prev_y_entry,prev_x_exit,prev_y_exit,prev_time_entry_secs,prev_time_exit_secs,prev_vmean
hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
e51525de5797475dc36ca72f1055a261_19,181109,traj_e51525de5797475dc36ca72f1055a261_19_7,15:35:11,15:35:11,,,,3757548.224,-19359563.61,3757569.374,-19360178.87,56111,56111,3757476.979,-19358681.46,3757569.374,-19360178.87,30030,50641,
df5bcc30e43009d823fb2b942aa6a15c_29,176874,traj_df5bcc30e43009d823fb2b942aa6a15c_29_8,14:59:14,15:25:49,0.0,0.0,0.0,3769795.594,-19342024.73,3769776.67,-19342372.03,53954,55549,3764658.199,-19297656.33,3769776.67,-19342372.03,27494,32041,0.0
2c2c8fa91ab62e0d10b6140fdad8dcc9_15,35392,traj_2c2c8fa91ab62e0d10b6140fdad8dcc9_15_22,15:09:28,15:37:09,0.0,0.0,0.0,3757146.36,-19260584.42,3756900.344,-19262550.69,54568,56229,3758831.737,-19050582.97,3756900.344,-19262550.69,22848,53351,0.663
2944b5417e636bafcf0b810ce1a8f3cd_3,32910,traj_2944b5417e636bafcf0b810ce1a8f3cd_3_5,15:15:36,15:15:36,,,,3752536.62,-19269620.55,3752573.355,-19270376.05,54936,54936,3752490.979,-19270421.84,3752573.355,-19270376.05,34923,49327,
f99fd92169519f3b0425d6e68e429179_9,197916,traj_f99fd92169519f3b0425d6e68e429179_9_6,15:14:37,15:19:55,0.0,0.0,0.0,3749995.196,-19130109.83,3756156.73,-19084532.46,54877,55195,3755542.246,-19145157.55,3756156.73,-19084532.46,31910,43314,7.893232


In [9]:
X_test = test_data.filter(['x_entry', 'y_entry', 'time_entry_secs', 'time_exit_secs',
                 'vmean', 'prev_x_entry', 'prev_y_entry', 
                 'prev_x_exit', 'prev_y_exit', 'prev_time_entry_secs', 
                 'prev_time_exit_secs', 'prev_vmean'], axis=1).to_numpy()

X_test.shape

(33515, 12)

In [10]:
predictions_test = model.predict(X_test)
test_pred = [round(val) for val in predictions_test]

In [11]:
test_traj_id = test_data.trajectory_id

submission_data = {'id': test_traj_id, 'target': test_pred}

submission_df = pd.DataFrame(submission_data)

submission_df.head()

Unnamed: 0_level_0,id,target
hash,Unnamed: 1_level_1,Unnamed: 2_level_1
e51525de5797475dc36ca72f1055a261_19,traj_e51525de5797475dc36ca72f1055a261_19_7,0
df5bcc30e43009d823fb2b942aa6a15c_29,traj_df5bcc30e43009d823fb2b942aa6a15c_29_8,0
2c2c8fa91ab62e0d10b6140fdad8dcc9_15,traj_2c2c8fa91ab62e0d10b6140fdad8dcc9_15_22,0
2944b5417e636bafcf0b810ce1a8f3cd_3,traj_2944b5417e636bafcf0b810ce1a8f3cd_3_5,0
f99fd92169519f3b0425d6e68e429179_9,traj_f99fd92169519f3b0425d6e68e429179_9_6,0


In [12]:
# submission_df.to_csv('submission.csv', encoding='utf-8', index=False)

### My submitted model achieved an accuracy of 0.882475 on the test set which qualified me as a U.K finalist.