In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

In [2]:
X_train = pd.read_csv("X_train.csv", index_col="Index")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv", index_col="Index")

In [3]:
X_train.drop_duplicates(inplace=True)
X_test.drop_duplicates(inplace=True)

In [4]:
# NaN in these three following columns means there were no trades => We fill these NaN with 0
X_train['OTR'].fillna(0, inplace=True)
X_train['OCR'].fillna(0, inplace=True)
X_train['OMR'].fillna(0, inplace=True)
X_test['OTR'].fillna(0, inplace=True)
X_test['OCR'].fillna(0, inplace=True)
X_test['OMR'].fillna(0, inplace=True)

In [5]:
# We retrieve y_train in the right format
Xy_train = pd.merge(X_train, y_train, on='Trader')
y_train_formatted = Xy_train['type'].to_frame()

In [6]:
# We retrieve the 'Traders' column to keep trace of them for later
X_train_traders = X_train['Trader'].to_frame()
X_test_traders = X_test['Trader'].to_frame()
# Dropping the irrelevent columns
X_train.drop(columns=['Share', 'Day', 'Trader'], inplace=True)
X_test.drop(columns=['Share', 'Day', 'Trader'], inplace=True)

In [7]:
# We replace the NaN of Time delta between two trades with their average
for col in X_train.columns:
        X_train[col] = X_train[col].fillna(np.mean(X_train[col]))
        
for col in X_test.columns:
        X_test[col] = X_test[col].fillna(np.mean(X_test[col]))

In [8]:
# Splitting the training set into train and test
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train_formatted, test_size=0.3, random_state=42)

# Testing different models

In [9]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
preds_rf = model_rf.fit(X_tr, y_tr).predict(X_te)
print('Random Forest accuracy %s' % f1_score(preds_rf, y_te, average='micro'))

  preds_rf = model_rf.fit(X_tr, y_tr).predict(X_te)


Random Forest accuracy 0.9738605119467142


In [10]:
from lightgbm import LGBMClassifier

model_lgbm = LGBMClassifier(boosting_type='dart', n_estimators=5000, learning_rate=0.1, max_depth=-1,
                       num_leaves=16, subsample=0.9, colsample_bytree=0.9, subsample_freq=1,
                       uniform_drop=True)
preds_lgbm = model_lgbm.fit(X_tr, y_tr).predict(X_te)
print('LightGBM accuracy %s' % f1_score(preds_lgbm, y_te, average='micro'))

  return f(**kwargs)


LightGBM accuracy 0.9852457111877009


In [11]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(booster='dart', n_estimators=1000, learning_rate=0.1, max_depth=5, subsample=0.9,
                     colsample_bytree = 0.9)
preds_xgb = model_xgb.fit(X_tr, y_tr).predict(X_te)
print('XGBoost accuracy %s' % f1_score(preds_xgb, y_te, average='micro'))

  return f(**kwargs)


XGBoost accuracy 0.9840452309956239


# Trying a voting classifier

In [13]:
from sklearn.ensemble import VotingClassifier

model_vot = VotingClassifier(estimators=[('rf', model_rf), ('lgbm', model_lgbm),
                                              ('xgb', model_xgb)], voting='soft')
preds_vot = model_vot.fit(X_tr, y_tr).predict(X_te)
print('Soft Voting accuracy %s' % f1_score(preds_vot, y_te, average='micro'))

  return f(**kwargs)


Soft Voting accuracy 0.9846648336754057


# Grid Search to improve the models

In [None]:
# TODO

# Retrieving final results

In [14]:
def predictions(traders, y_pred):

    res = pd.DataFrame(traders)
    res['pred'] = y_pred
    res['count'] = 1

    predictions = res.groupby(['Trader', 'pred']).count() / res.groupby(['Trader']).count()
    predictions = predictions.unstack(level=1).drop(columns=['pred']).fillna(0)
    predictions.columns = predictions.columns.get_level_values(1)
    
    return predictions

In [15]:
def final_classification(predictions):

    predictions.reset_index(inplace=True)
    predictions['type'] = 'NON HFT'
    
    for i in range(len(predictions)):
        if predictions.iloc[i]['HFT'] >= 0.85:
            predictions.at[i, 'type'] = 'HFT'
        elif predictions.iloc[i]['MIX'] > 0.5:
            predictions.at[i, 'type'] = 'MIX'
    predictions.drop(columns=['HFT','MIX','NON HFT'], inplace=True)
    
    return predictions

In [None]:
# # Choice of best model

# final_model = ...

In [None]:
# y_pred = final_model.fit(X_train, y_train_formatted).predict(X_test)

In [None]:
# predictions = predictions(X_test_traders, y_pred)
# predictions

In [None]:
# final_res = final_classification(predictions)
# final_res

In [None]:
# title = '... .csv'
# final_res.to_csv(title, index=False)