In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [3]:
X_train = pd.read_csv("X_train.csv", index_col="Index")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv", index_col="Index")

In [4]:
X_train.drop_duplicates(inplace=True)
X_test.drop_duplicates(inplace=True)

In [5]:
# NaN in these three following columns means there were no trades => We fill these NaN with 0
X_train['OTR'].fillna(0, inplace=True)
X_train['OCR'].fillna(0, inplace=True)
X_train['OMR'].fillna(0, inplace=True)
X_test['OTR'].fillna(0, inplace=True)
X_test['OCR'].fillna(0, inplace=True)
X_test['OMR'].fillna(0, inplace=True)

In [6]:
# We replace the NaN of the statistics over the observed time delta between two trades on the trading venue TV_1 with their average
columns = ['min_dt_TV1', 'mean_dt_TV1', 'med_dt_TV1']
for col in columns:
        X_train[col] = X_train[col].fillna(np.mean(X_train[col]))
        X_test[col] = X_test[col].fillna(np.mean(X_test[col]))

In [7]:
# We drop the remaining irrelevant columns
X_train.dropna(axis=1, inplace=True)
X_test.dropna(axis=1, inplace=True)

In [8]:
# We retrieve y_train in the right format
Xy_train = pd.merge(X_train, y_train, on='Trader')
y_train_formatted = Xy_train['type'].to_frame()

In [9]:
# We retrieve the 'Traders' column to keep trace of them for later
X_train_traders = X_train['Trader'].to_frame()
X_test_traders = X_test['Trader'].to_frame()
# Dropping the categorical features
X_train.drop(columns=['Share', 'Day', 'Trader'], inplace=True)
X_test.drop(columns=['Share', 'Day', 'Trader'], inplace=True)

# Testing different models

In [10]:
# # Splitting the training set into train and test
# X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train_formatted, test_size=0.3, random_state=42)

In [11]:
# from sklearn.neighbors import KNeighborsClassifier

# model_knn = KNeighborsClassifier()
# preds_knn = model_knn.fit(X_tr, y_tr).predict(X_te)
# print('KNN accuracy %s' % f1_score(preds_knn, y_te, average='micro'))

In [12]:
# from sklearn.ensemble import RandomForestClassifier

# model_rf = RandomForestClassifier()
# preds_rf = model_rf.fit(X_tr, y_tr).predict(X_te)
# print('Random Forest accuracy %s' % f1_score(preds_rf, y_te, average='micro'))

In [13]:
# from lightgbm import LGBMClassifier

# model_lgbm = LGBMClassifier(boosting_type='dart', n_estimators=5000,
#                             subsample=0.9, colsample_bytree=0.9, subsample_freq=1, uniform_drop=True)
# preds_lgbm = model_lgbm.fit(X_tr, y_tr).predict(X_te)
# print('LightGBM accuracy %s' % f1_score(preds_lgbm, y_te, average='micro'))

In [14]:
# from xgboost import XGBClassifier

# model_xgb = XGBClassifier(booster='dart', n_estimators=1000, learning_rate=0.1, max_depth=5, subsample=0.9,
#                      colsample_bytree = 0.9)
# preds_xgb = model_xgb.fit(X_tr, y_tr).predict(X_te)
# print('XGBoost accuracy %s' % f1_score(preds_xgb, y_te, average='micro'))

In [15]:
# from catboost import CatBoostClassifier

# model_cb = CatBoostClassifier(n_estimators=5000, learning_rate=0.1, max_depth=10, verbose=0)
# preds_cb = model_cb.fit(X_tr, y_tr).predict(X_te)
# print('CatBoost accuracy %s' % f1_score(preds_cb, y_te, average='micro'))

In [16]:
# from sklearn.ensemble import VotingClassifier

# model_vot = VotingClassifier(estimators=[('rf', model_rf), ('lgbm', model_lgbm),
#                                               ('xgb', model_xgb), ('cb', model_cb)], voting='soft')
# preds_vot = model_vot.fit(X_tr, y_tr).predict(X_te)
# print('Soft Voting accuracy %s' % f1_score(preds_vot, y_te, average='micro'))

# Retrieving final results

In [27]:
def freq_predictions(traders, y_pred):

    res = pd.DataFrame(traders)
    res['pred'] = y_pred
    res['count'] = 1

    predictions = res.groupby(['Trader', 'pred']).count() / res.groupby(['Trader']).count()
    predictions = predictions.unstack(level=1).drop(columns=['pred']).fillna(0)
    predictions.columns = predictions.columns.get_level_values(1)
    
    return predictions

In [24]:
def final_classification(predictions):

    predictions.reset_index(inplace=True)
    predictions['type'] = 'NON HFT'
    
    for i in range(len(predictions)):
        if predictions.iloc[i]['HFT'] >= 0.85:
            predictions.at[i, 'type'] = 'HFT'
        elif predictions.iloc[i]['MIX'] > 0.5:
            predictions.at[i, 'type'] = 'MIX'
    predictions.drop(columns=['HFT','MIX','NON HFT'], inplace=True)
    
    return predictions

In [19]:
from lightgbm import LGBMClassifier

model_lgbm = LGBMClassifier(boosting_type='dart', n_estimators=5000,
                            subsample=0.9, colsample_bytree=0.9, subsample_freq=1, uniform_drop=True)
y_pred = model_lgbm.fit(X_train, y_train_formatted).predict(X_test)

In [28]:
predictions = freq_predictions(X_test_traders, y_pred)
predictions

pred,HFT,MIX,NON HFT
Trader,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelaide,0.000000,0.100000,0.900000
Alana,0.000000,0.000000,1.000000
Alcmene,0.062500,0.000000,0.937500
Alice,1.000000,0.000000,0.000000
Alices Sister,0.049180,0.076503,0.874317
...,...,...,...
Monstro,0.000000,1.000000,0.000000
Morgana,0.010695,0.973262,0.016043
The Doorknob,0.894737,0.065789,0.039474
The Doorman,0.977143,0.016190,0.006667


In [29]:
final_res = final_classification(predictions)
final_res

pred,Trader,type
0,Adelaide,NON HFT
1,Alana,NON HFT
2,Alcmene,NON HFT
3,Alice,HFT
4,Alices Sister,NON HFT
...,...,...
80,Monstro,MIX
81,Morgana,MIX
82,The Doorknob,HFT
83,The Doorman,HFT


In [30]:
title = 'final_sub.csv'
final_res.to_csv(title, index=False)