## Import libraries

In [1]:
!pip install xgboost -q
!pip install lightgbm -q
!pip install imbalanced-learn -q

[33mYou are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 10.0.1, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import confusion_matrix, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeRegressor

## Load data

In [3]:
# Both files are read the same way: the first CSV column becomes the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data_train.csv', 'data_test.csv')
)

## Initialize

In [4]:
# Inclusive bounds of the axis-aligned rectangle that preprocess() and
# get_target() compare coordinates against (the "cc" in `entry_in_cc`;
# presumably "city center" -- TODO confirm). The values are compared directly
# with the x_*/y_* columns, so they are in the same units as those columns.
X_MIN = 3750901.5068
X_MAX = 3770901.5068
Y_MIN = -19268905.6133
Y_MAX = -19208905.6133

In [5]:
def preprocess(df, bounds=None):
    """Turn raw trajectory rows into the feature frame used by the models.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``hash``, ``trajectory_id``, ``time_entry``,
        ``time_exit``, ``vmax``, ``vmin``, ``vmean``, ``x_entry``, ``y_entry``,
        ``x_exit`` and ``y_exit``. ``trajectory_id`` is split on ``'_'`` and
        its fourth piece is parsed as the integer ``step``; both time columns
        are parsed with the ``%H:%M:%S`` format.
    bounds : tuple of four numbers, optional
        ``(x_min, x_max, y_min, y_max)`` of the rectangle used for the
        ``entry_in_cc`` and ``actual`` flags (edges inclusive). Defaults to
        the module-level ``X_MIN, X_MAX, Y_MIN, Y_MAX``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``hash`` then ``step`` with a fresh 0..n-1 index.
        ``vmax``/``vmin``/``vmean`` and the two parsed time columns are
        removed. Added columns, in this order: ``step``, the hour/minute/second
        parts of entry and exit time, ``time_delta`` (seconds), and the 0/1
        integer flags ``entry_in_cc``, ``actual``, ``start_point``,
        ``end_point`` and ``other_point``.
    """
    x_min, x_max, y_min, y_max = (X_MIN, X_MAX, Y_MIN, Y_MAX) if bounds is None else bounds

    # drop() returns a copy, so the caller's frame is never modified.
    df = df.drop(['vmax', 'vmin', 'vmean'], axis=1).reset_index(drop=True)

    df['step'] = df['trajectory_id'].str.split('_').str[3].astype(int)

    for col in ('time_entry', 'time_exit'):
        df[col] = pd.to_datetime(df[col], format='%H:%M:%S')
    for col in ('time_entry', 'time_exit'):
        df[col + '_hour'] = df[col].dt.hour
        df[col + '_minute'] = df[col].dt.minute
        df[col + '_second'] = df[col].dt.second

    # `.seconds` is the seconds component of the timedelta (not total_seconds()).
    df['time_delta'] = (df['time_exit'] - df['time_entry']).dt.seconds

    # First/last step of each trajectory, broadcast back to its rows. Using
    # transform() avoids relying on the 'step_amin'/'step_amax' column names,
    # which depend on the NumPy version's name for np.min/np.max.
    steps_by_hash = df.groupby('hash')['step']
    first_step = steps_by_hash.transform('min')
    last_step = steps_by_hash.transform('max')

    # NaN coordinates compare as False, so rows with a missing exit get actual == 0.
    entry_inside = df['x_entry'].between(x_min, x_max) & df['y_entry'].between(y_min, y_max)
    exit_inside = df['x_exit'].between(x_min, x_max) & df['y_exit'].between(y_min, y_max)

    # A single-row trajectory is both first and last; it counts as a start point only.
    is_start = df['step'] == first_step
    is_end = ~is_start & (df['step'] == last_step)

    df['entry_in_cc'] = entry_inside.astype(int)
    df['actual'] = exit_inside.astype(int)
    df['start_point'] = is_start.astype(int)
    df['end_point'] = is_end.astype(int)
    df['other_point'] = (~is_start & ~is_end).astype(int)

    df = df.drop(['time_entry', 'time_exit'], axis=1)

    df = df.sort_values(['hash', 'step']).reset_index(drop=True)

    return df


def get_target(df, bounds=None):
    """Add a 0/1 ``target`` column telling whether the exit point is inside the rectangle.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``x_exit`` and ``y_exit``. The frame is modified in
        place (a ``target`` column is added or overwritten). Any index is
        accepted.
    bounds : tuple of four numbers, optional
        ``(x_min, x_max, y_min, y_max)``, edges inclusive. Defaults to the
        module-level ``X_MIN, X_MAX, Y_MIN, Y_MAX``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the integer ``target`` column.
    """
    x_min, x_max, y_min, y_max = (X_MIN, X_MAX, Y_MIN, Y_MAX) if bounds is None else bounds

    # NaN coordinates compare as False and therefore yield target == 0.
    exit_inside = df['x_exit'].between(x_min, x_max) & df['y_exit'].between(y_min, y_max)
    df['target'] = exit_inside.astype(int)

    return df

## Pre-process

In [6]:
# Feature frame for the training file; df_train itself is not modified.
df = preprocess(df=df_train)

In [7]:
# Trajectories (hashes) that consist of a single row are discarded.
rows_per_hash = df['hash'].value_counts()
single_row_hashes = rows_per_hash.index[rows_per_hash <= 1]
df = df[~df['hash'].isin(single_row_hashes)]

In [8]:
# Identifier columns and the 0/1 indicator flags are left unscaled; everything
# else -- including the regression targets x_exit / y_exit -- is standardised,
# which is why predictions are later passed through scaler.inverse_transform.
unscaled_cols = ['hash', 'trajectory_id', 'actual', 'entry_in_cc', 'start_point', 'end_point', 'other_point']
num_cols = list(df.columns.drop(unscaled_cols))

# NOTE(review): the scaler is fitted on every row of df, i.e. before any
# train/test split is made.
scaler = StandardScaler()  # MinMaxScaler() can be swapped in here
df[num_cols] = scaler.fit_transform(df[num_cols])

  return self.partial_fit(X, y)


In [9]:
# X = df.drop(['x_exit', 'y_exit'], axis=1)
# y = df[['x_exit', 'y_exit']]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [106]:
# Hold out 20% of the rows for evaluation. `actual` (exit inside the rectangle)
# is used as the label here; x_exit / y_exit are still part of X at this point.
X = df.drop(['actual'], axis=1)
y = df['actual']

# A fixed random_state keeps the evaluation partition identical across runs,
# so the metrics of the individual model sections remain comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# NOTE(review): in top-to-bottom order this replaces the X_train / y_train
# produced by train_test_split in the previous cell with *all* rows of df, so
# the rows held out in X_test are also part of the training data.
X_train, y_train = df.drop(columns=['actual']), df['actual']

In [107]:
# Class balance of `actual`: whole frame, training labels, test labels.
for label_series in (df['actual'], y_train, y_test):
    print(label_series.value_counts())

0    569973
1    242979
Name: actual, dtype: int64
0    455822
1    194539
Name: actual, dtype: int64
0    114151
1     48440
Name: actual, dtype: int64


In [11]:
# Resample on the class label `actual` (SMOTE() is the simpler alternative).
# The identifier columns are not numeric features and are removed first;
# x_exit / y_exit are resampled together with the other features.
resample_input = X_train.drop(columns=['hash', 'trajectory_id'])
smote = SMOTEENN()
resampled_values, resampled_labels = smote.fit_resample(resample_input, y_train)

X_resampled = pd.DataFrame(resampled_values, columns=resample_input.columns)
X_resampled['actual'] = resampled_labels

# From here on the regression targets are the exit coordinates.
exit_cols = ['x_exit', 'y_exit']
y_resampled = X_resampled[exit_cols]
X_resampled = X_resampled.drop(columns=exit_cols)

X_train, y_train = X_resampled.copy(), y_resampled.copy()

In [109]:
# Mirror the training layout: keep `actual` as a column and move the exit
# coordinates out of the features into y_test.
X_test = X_test.reset_index(drop=True).assign(actual=y_test.values)
y_test = X_test[['x_exit', 'y_exit']]
X_test = X_test.drop(columns=['x_exit', 'y_exit'])

In [110]:
# Class balance after resampling.
X_resampled['actual'].value_counts()

1    455822
0    455822
Name: actual, dtype: int64

## LightGBM

In [12]:
# One LightGBM regressor per target (x_exit, y_exit). X_train no longer holds
# 'hash' / 'trajectory_id' after the resampling cell, so only the class label
# 'actual' is removed before fitting.
lgbm_base = lgb.LGBMRegressor(boosting_type='gbdt', n_estimators=200, learning_rate=0.1, num_leaves=91)
lgbm_reg = MultiOutputRegressor(lgbm_base)
lgbm_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=200, n_jobs=-1, num_leaves=91, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
           n_jobs=None)

In [112]:
# Per-target MSE on the held-out rows (values are in scaled units).
lgbm_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
lgbm_y_pred = lgbm_reg.predict(lgbm_test_features)
mean_squared_error(y_test, lgbm_y_pred, multioutput='raw_values')

array([0.11836692, 0.14382242])

In [113]:
# Attach the predicted exit coordinates, undo the scaling, then derive the
# 0/1 `target` from the predicted position.
lgbm_pred_frame = pd.DataFrame(lgbm_y_pred, columns=['x_exit', 'y_exit'])
lgbm_result = pd.concat([X_test.reset_index(drop=True), lgbm_pred_frame], axis=1)
lgbm_result[num_cols] = scaler.inverse_transform(lgbm_result[num_cols])
lgbm_result = get_target(lgbm_result)

In [114]:
confusion_matrix(y_true=lgbm_result['actual'], y_pred=lgbm_result['target'])

array([[102236,  11915],
       [  3056,  45384]])

In [115]:
f1_score(y_true=lgbm_result['actual'], y_pred=lgbm_result['target'])

0.8584155325849496

## AdaBoost

In [13]:
# One AdaBoost regressor per target; only the class label 'actual' is left to
# remove from X_train (identifier columns were dropped during resampling).
ada_base = AdaBoostRegressor(n_estimators=100, learning_rate=0.1)
ada_reg = MultiOutputRegressor(ada_base)
ada_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='linear',
         n_estimators=100, random_state=None),
           n_jobs=None)

In [117]:
# Per-target MSE on the held-out rows (scaled units).
ada_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
ada_y_pred = ada_reg.predict(ada_test_features)
mean_squared_error(y_test, ada_y_pred, multioutput='raw_values')

array([0.18881536, 0.21625191])

In [118]:
# Predicted coordinates -> original units -> 0/1 `target`.
ada_pred_frame = pd.DataFrame(ada_y_pred, columns=['x_exit', 'y_exit'])
ada_result = pd.concat([X_test.reset_index(drop=True), ada_pred_frame], axis=1)
ada_result[num_cols] = scaler.inverse_transform(ada_result[num_cols])
ada_result = get_target(ada_result)

In [119]:
confusion_matrix(y_true=ada_result['actual'], y_pred=ada_result['target'])

array([[92211, 21940],
       [ 2374, 46066]])

In [120]:
f1_score(y_true=ada_result['actual'], y_pred=ada_result['target'])

0.7911993542071003

## XGBoost

In [14]:
# One XGBoost regressor per target (x_exit, y_exit).
# The former `boosting_type='dart'` argument was removed: `boosting_type` is a
# LightGBM parameter name, XGBoost ignores it (the recorded repr of the fitted
# estimator shows booster='gbtree'). Dropping it therefore keeps the trained
# model unchanged; pass `booster='dart'` instead if DART is really wanted.
xgb_base = xgb.XGBRegressor(n_estimators=200, subsample=0.8, max_depth=9)
xgb_reg = MultiOutputRegressor(xgb_base)
# X_train has no 'hash' / 'trajectory_id' after the resampling cell, so only
# the class label 'actual' is removed before fitting.
xgb_reg.fit(X_train.drop(['actual'], axis=1), y_train)

MultiOutputRegressor(estimator=XGBRegressor(base_score=0.5, booster='gbtree', boosting_type='dart',
       colsample_bylevel=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8),
           n_jobs=None)

In [248]:
# Per-target MSE on the held-out rows (scaled units).
xgb_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
xgb_y_pred = xgb_reg.predict(xgb_test_features)
mean_squared_error(y_test, xgb_y_pred, multioutput='raw_values')

array([0.11800964, 0.14317444])

In [249]:
# Predicted coordinates -> original units -> 0/1 `target`.
xgb_pred_frame = pd.DataFrame(xgb_y_pred, columns=['x_exit', 'y_exit'])
xgb_result = pd.concat([X_test.reset_index(drop=True), xgb_pred_frame], axis=1)
xgb_result[num_cols] = scaler.inverse_transform(xgb_result[num_cols])
xgb_result = get_target(xgb_result)

In [250]:
confusion_matrix(y_true=xgb_result['actual'], y_pred=xgb_result['target'])

array([[102506,  11645],
       [  2983,  45457]])

In [251]:
f1_score(y_true=xgb_result['actual'], y_pred=xgb_result['target'])

0.8614011483579995

## Random Forest

In [15]:
# One random forest per target; 'actual' is the only non-feature column left
# in X_train after the resampling cell.
rf_base = RandomForestRegressor(n_estimators=100)
rf_reg = MultiOutputRegressor(rf_base)
rf_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
           n_jobs=None)

In [127]:
# Per-target MSE on the held-out rows (scaled units).
rf_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
rf_y_pred = rf_reg.predict(rf_test_features)
mean_squared_error(y_test, rf_y_pred, multioutput='raw_values')

array([0.12070546, 0.14705123])

In [128]:
# Predicted coordinates -> original units -> 0/1 `target`.
rf_pred_frame = pd.DataFrame(rf_y_pred, columns=['x_exit', 'y_exit'])
rf_result = pd.concat([X_test.reset_index(drop=True), rf_pred_frame], axis=1)
rf_result[num_cols] = scaler.inverse_transform(rf_result[num_cols])
rf_result = get_target(rf_result)

In [129]:
confusion_matrix(y_true=rf_result['actual'], y_pred=rf_result['target'])

array([[102985,  11166],
       [  3449,  44991]])

In [130]:
f1_score(y_true=rf_result['actual'], y_pred=rf_result['target'])

0.8602732391942407

## K-Nearest Neighbors

In [16]:
# One k-NN regressor (library defaults) per target; only 'actual' has to be
# removed from X_train before fitting.
knn_base = KNeighborsRegressor()
knn_reg = MultiOutputRegressor(knn_base)
knn_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
           n_jobs=None)

In [132]:
# Per-target MSE on the held-out rows (scaled units).
knn_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
knn_y_pred = knn_reg.predict(knn_test_features)
mean_squared_error(y_test, knn_y_pred, multioutput='raw_values')

array([0.16032696, 0.18884696])

In [133]:
# Predicted coordinates -> original units -> 0/1 `target`.
knn_pred_frame = pd.DataFrame(knn_y_pred, columns=['x_exit', 'y_exit'])
knn_result = pd.concat([X_test.reset_index(drop=True), knn_pred_frame], axis=1)
knn_result[num_cols] = scaler.inverse_transform(knn_result[num_cols])
knn_result = get_target(knn_result)

In [134]:
confusion_matrix(y_true=knn_result['actual'], y_pred=knn_result['target'])

array([[103238,  10913],
       [  4201,  44239]])

In [135]:
f1_score(y_true=knn_result['actual'], y_pred=knn_result['target'])

0.8541007027569696

## Multi-Layer Perceptron

In [17]:
# One single-hidden-layer MLP per target; only 'actual' has to be removed from
# X_train before fitting.
mlp_base = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=200,
    tol=1e-4,
    verbose=False,
)
mlp_reg = MultiOutputRegressor(mlp_base)
mlp_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
           n_jobs=None)

In [137]:
# Per-target MSE on the held-out rows (scaled units).
mlp_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
mlp_y_pred = mlp_reg.predict(mlp_test_features)
mean_squared_error(y_test, mlp_y_pred, multioutput='raw_values')

array([0.12688234, 0.15089818])

In [138]:
# Predicted coordinates -> original units -> 0/1 `target`.
mlp_pred_frame = pd.DataFrame(mlp_y_pred, columns=['x_exit', 'y_exit'])
mlp_result = pd.concat([X_test.reset_index(drop=True), mlp_pred_frame], axis=1)
mlp_result[num_cols] = scaler.inverse_transform(mlp_result[num_cols])
mlp_result = get_target(mlp_result)

In [139]:
confusion_matrix(y_true=mlp_result['actual'], y_pred=mlp_result['target'])

array([[101445,  12706],
       [  3104,  45336]])

In [140]:
f1_score(y_true=mlp_result['actual'], y_pred=mlp_result['target'])

0.8515242012734545

## Decision Tree

In [18]:
# One unconstrained decision tree (library defaults) per target; only 'actual'
# has to be removed from X_train before fitting.
dt_base = DecisionTreeRegressor()
dt_reg = MultiOutputRegressor(dt_base)
dt_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
           n_jobs=None)

In [142]:
# Per-target MSE on the held-out rows (scaled units).
dt_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
dt_y_pred = dt_reg.predict(dt_test_features)
mean_squared_error(y_test, dt_y_pred, multioutput='raw_values')

array([0.23148138, 0.28909889])

In [143]:
# Predicted coordinates -> original units -> 0/1 `target`.
dt_pred_frame = pd.DataFrame(dt_y_pred, columns=['x_exit', 'y_exit'])
dt_result = pd.concat([X_test.reset_index(drop=True), dt_pred_frame], axis=1)
dt_result[num_cols] = scaler.inverse_transform(dt_result[num_cols])
dt_result = get_target(dt_result)

In [144]:
confusion_matrix(y_true=dt_result['actual'], y_pred=dt_result['target'])

array([[105762,   8389],
       [  6805,  41635]])

In [145]:
f1_score(y_true=dt_result['actual'], y_pred=dt_result['target'])

0.8456897952551187

## Ridge Regression

In [19]:
# One ridge regression (SAG solver) per target; only 'actual' has to be
# removed from X_train before fitting.
ridge_base = Ridge(alpha=0.1, solver='sag', max_iter=5000)
ridge_reg = MultiOutputRegressor(ridge_base)
ridge_reg.fit(X_train.drop(columns=['actual']), y_train)

MultiOutputRegressor(estimator=Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=5000,
   normalize=False, random_state=None, solver='sag', tol=0.001),
           n_jobs=None)

In [147]:
# Per-target MSE on the held-out rows (scaled units).
ridge_test_features = X_test.drop(columns=['hash', 'trajectory_id', 'actual'])
ridge_y_pred = ridge_reg.predict(ridge_test_features)
mean_squared_error(y_test, ridge_y_pred, multioutput='raw_values')

array([0.17265291, 0.20880679])

In [148]:
# Predicted coordinates -> original units -> 0/1 `target`.
ridge_pred_frame = pd.DataFrame(ridge_y_pred, columns=['x_exit', 'y_exit'])
ridge_result = pd.concat([X_test.reset_index(drop=True), ridge_pred_frame], axis=1)
ridge_result[num_cols] = scaler.inverse_transform(ridge_result[num_cols])
ridge_result = get_target(ridge_result)

In [149]:
confusion_matrix(y_true=ridge_result['actual'], y_pred=ridge_result['target'])

array([[104899,   9252],
       [  5176,  43264]])

In [150]:
f1_score(y_true=ridge_result['actual'], y_pred=ridge_result['target'])

0.8570862553983915

## Stacking

In [None]:
# stacked_df = X[['hash', 'trajectory_id']].copy()
# stacked_df['lgb_x_exit'], stacked_df['lgb_y_exit'] = lgbm_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['ada_x_exit'], stacked_df['ada_y_exit'] = ada_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['xgb_x_exit'], stacked_df['xgb_y_exit'] = xgb_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['rf_x_exit'], stacked_df['rf_y_exit'] = rf_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['knn_x_exit'], stacked_df['knn_y_exit'] = knn_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['mlp_x_exit'], stacked_df['mlp_y_exit'] = mlp_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['dt_x_exit'], stacked_df['dt_y_exit'] = dt_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df['ridge_x_exit'], stacked_df['ridge_y_exit'] = ridge_reg.predict(X.drop(['hash', 'trajectory_id', 'actual'], axis=1)).T
# stacked_df[['x_exit', 'y_exit']] = y.copy()

In [None]:
# new_X_train = stacked_df.drop(['x_exit', 'y_exit'], axis=1).loc[X_train.index]
# new_X_test = stacked_df.drop(['x_exit', 'y_exit'], axis=1).loc[X_test.index]
# new_y_train = stacked_df[['x_exit', 'y_exit']].loc[y_train.index]
# new_y_test = stacked_df[['x_exit', 'y_exit']].loc[y_test.index]

In [20]:
# Level-1 training table: df plus every base model's predicted exit point.
# The feature matrix is identical for all eight models, so it is built once
# instead of once per model.
base_features = df.drop(['hash', 'trajectory_id', 'x_exit', 'y_exit', 'actual'], axis=1)
base_models = [
    ('lgb', lgbm_reg),
    ('ada', ada_reg),
    ('xgb', xgb_reg),
    ('rf', rf_reg),
    ('knn', knn_reg),
    ('mlp', mlp_reg),
    ('dt', dt_reg),
    ('ridge', ridge_reg),
]

# NOTE(review): in top-to-bottom order the base models were fitted on data
# derived from all rows of df, so these predictions are in-sample and the
# stacking scores computed from them are likely optimistic. Out-of-fold
# predictions would avoid this.
stacked_df = df.copy()
for prefix, base_model in base_models:
    # predict() returns an (n_rows, 2) array; .T unpacks it into the x and y columns.
    stacked_df[prefix + '_x_exit'], stacked_df[prefix + '_y_exit'] = base_model.predict(base_features).T

# A fixed random_state makes the meta-model's train/test partition repeatable.
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(
    stacked_df.drop(['x_exit', 'y_exit'], axis=1),
    stacked_df[['x_exit', 'y_exit']],
    test_size=0.2,
    random_state=42,
)

In [87]:
# Base models whose predictions feed the meta-model. 'ada' and 'ridge' are
# deliberately left out of the stack.
stack_members = ['lgb', 'xgb', 'rf', 'knn', 'mlp', 'dt']
cols = [
    '{}_{}_exit'.format(member, axis)
    for member in stack_members
    for axis in ('x', 'y')
]

### Stacking Model

In [88]:
# Meta-model: ridge regression on the selected base-model predictions only.
stack_base = Ridge(alpha=0.1, solver='sag', max_iter=5000)
stack_reg = MultiOutputRegressor(stack_base)
stack_reg.fit(new_X_train[cols], new_y_train)

MultiOutputRegressor(estimator=Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=5000,
   normalize=False, random_state=None, solver='sag', tol=0.001),
           n_jobs=None)

In [89]:
# Per-target MSE of the meta-model on its own hold-out split (scaled units).
stack_test_inputs = new_X_test[cols]
stack_y_pred = stack_reg.predict(stack_test_inputs)
mean_squared_error(new_y_test, stack_y_pred, multioutput='raw_values')

array([0.02742537, 0.02678705])

In [90]:
# Predicted coordinates -> original units -> 0/1 `target`.
stack_pred_frame = pd.DataFrame(stack_y_pred, columns=['x_exit', 'y_exit'])
stack_result = pd.concat([new_X_test.reset_index(drop=True), stack_pred_frame], axis=1)
stack_result[num_cols] = scaler.inverse_transform(stack_result[num_cols])
stack_result = get_target(stack_result)

In [91]:
confusion_matrix(y_true=stack_result['actual'], y_pred=stack_result['target'])

array([[109375,   4710],
       [  1834,  46672]])

In [92]:
f1_score(y_true=stack_result['actual'], y_pred=stack_result['target'])

0.9344866250200224

## Test set

In [27]:
# Same preprocessing and scaling as the training data. Missing values (the
# exit coordinates to be predicted) are filled with 0 only so that the scaler
# accepts the frame; those columns are not used as model inputs below.
test_set = preprocess(df_test)
test_set[num_cols] = scaler.transform(test_set.fillna(0)[num_cols])

test_features = test_set.drop(columns=['hash', 'trajectory_id', 'x_exit', 'y_exit', 'actual'])

# One (x, y) prediction column pair per base model, in the same order as the
# stacking table.
final_df = test_set[['hash', 'trajectory_id']].copy()
for prefix, base_model in [
    ('lgb', lgbm_reg),
    ('ada', ada_reg),
    ('xgb', xgb_reg),
    ('rf', rf_reg),
    ('knn', knn_reg),
    ('mlp', mlp_reg),
    ('dt', dt_reg),
    ('ridge', ridge_reg),
]:
    final_df[prefix + '_x_exit'], final_df[prefix + '_y_exit'] = base_model.predict(test_features).T

  from ipykernel import kernelapp as app


In [86]:
# Stacked prediction of the exit point (scaled units), written over the exit columns.
new_test_set = test_set.copy()
stack_test_pred = stack_reg.predict(final_df.drop(columns=['hash', 'trajectory_id'])[cols])
new_test_set['x_exit'] = stack_test_pred[:, 0]
new_test_set['y_exit'] = stack_test_pred[:, 1]

# Back to original units, then derive the 0/1 `target` from the predicted position.
new_test_set[num_cols] = scaler.inverse_transform(new_test_set[num_cols])
new_test_set = get_target(new_test_set)

# Rows whose x_exit is missing in the raw test file are the ones to submit.
target_id = df_test.loc[df_test['x_exit'].isnull(), 'trajectory_id']
is_submission_row = new_test_set['trajectory_id'].isin(target_id)

new_test_set.loc[is_submission_row, ['trajectory_id', 'target']].to_csv(
    './submission.csv', header=['id', 'target'], index=False
)

print(len(new_test_set[is_submission_row]))

# The remaining rows have a known exit, so `actual` can be compared with `target`.
new_test_set = new_test_set[~is_submission_row]

print(confusion_matrix(new_test_set['actual'], new_test_set['target']))
print(f1_score(new_test_set['actual'], new_test_set['target']))

33515
[[108002   9989]
 [  4413  47018]]
0.8671867795422268


In [213]:
# Ridge-only variant of the submission (no stacking).
# NOTE(review): this writes to the same './submission.csv' as the stacked
# submission cell, so whichever of the two cells runs last determines the file.
test_set = preprocess(df_test)
test_set[num_cols] = scaler.transform(test_set.fillna(0)[num_cols])

ridge_test_inputs = test_set.drop(columns=['hash', 'trajectory_id', 'x_exit', 'y_exit', 'actual'])
ridge_test_pred = ridge_reg.predict(ridge_test_inputs)
test_set['x_exit'] = ridge_test_pred[:, 0]
test_set['y_exit'] = ridge_test_pred[:, 1]

# Back to original units, then derive the 0/1 `target` from the predicted position.
test_set[num_cols] = scaler.inverse_transform(test_set[num_cols])
test_set = get_target(test_set)

# Rows whose x_exit is missing in the raw test file are the ones to submit.
target_id = df_test.loc[df_test['x_exit'].isnull(), 'trajectory_id']
is_submission_row = test_set['trajectory_id'].isin(target_id)

test_set.loc[is_submission_row, ['trajectory_id', 'target']].to_csv(
    './submission.csv', header=['id', 'target'], index=False
)

print(len(test_set[is_submission_row]))

# The remaining rows have a known exit, so `actual` can be compared with `target`.
new_test_set = test_set[~is_submission_row]

print(confusion_matrix(new_test_set['actual'], new_test_set['target']))
print(f1_score(new_test_set['actual'], new_test_set['target']))

  from ipykernel import kernelapp as app


33515
[[108920   9071]
 [  5701  45730]]
0.8609458543565027
