In [1]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import os

In [2]:
filepath_X_train = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_X.csv')
filepath_y_train = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_Y.csv')
filepath_X_test = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_X.csv')


X_train_original = pd.read_csv(filepath_X_train)
y_train_original = pd.read_csv(filepath_y_train)
X_test_original = pd.read_csv(filepath_X_test)

In [3]:
# Looking at the data
X_train_original.head()
X_train_original.isna().sum()
X_train_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105782 entries, 0 to 105781
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Index                       105782 non-null  int64  
 1   Share                       105782 non-null  object 
 2   Day                         105782 non-null  object 
 3   Trader                      105782 non-null  object 
 4   OTR                         104099 non-null  float64
 5   OCR                         105782 non-null  float64
 6   OMR                         58170 non-null   float64
 7   min_time_two_events         105782 non-null  float64
 8   mean_time_two_events        105782 non-null  float64
 9   10_p_time_two_events        105782 non-null  float64
 10  med_time_two_events         105782 non-null  float64
 11  25_p_time_two_events        105782 non-null  float64
 12  75_p_time_two_events        105782 non-null  float64
 13  90_p_time_two_

In [4]:
print(X_train_original.shape)
print(y_train_original.shape)
print(len(X_train_original["Trader"].unique()))

y_train_original.head()

(105782, 39)
(86, 2)
86


Unnamed: 0,Trader,type
0,Trader_285,MIX
1,Trader_114,NON HFT
2,Trader_110,NON HFT
3,Trader_57,NON HFT
4,Trader_128,NON HFT


In [5]:
X_with_label = pd.merge(X_train_original, y_train_original, how='left', on="Trader")

In [6]:
y_train_reshaped = X_with_label['type']
X_anonymized = X_with_label.drop(columns=['Trader', 'type'])
X_anonymized.head()

Unnamed: 0,Index,Share,Day,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,...,min_dt_TV1_TV2,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade
0,1,Isin_8,Date_24,2.272727,8.333333,12.5,0.0,5117.8303,0.0,419.6885,...,,,,,,,,,,4
1,2,Isin_87,Date_29,1.696629,25.166667,21.571429,0.0,1846.968401,7.4e-05,0.003374,...,,,,,,,,,,15
2,3,Isin_87,Date_23,1.482759,47.3,118.25,0.0,686.30063,7.1e-05,0.000599,...,,,,,,,,,,63
3,4,Isin_12,Date_11,1.705882,14.5,29.0,0.0,2174.335265,0.0,6.152666,...,,,,,,,,,,4
4,5,Isin_87,Date_9,1.51773,26.75,,0.0,944.008551,7.1e-05,0.001364,...,,,,,,,,,,38


## Cleaning the dataset

In [7]:
# print(X_train["Share"][0].find('_'))

X_anonymized_clean = X_anonymized.copy()
X_anonymized_clean["Share"] = pd.to_numeric(X_anonymized["Share"].str[5::], downcast='integer')
X_anonymized_clean["Day"] = pd.to_numeric(X_anonymized["Day"].str[5::], downcast='integer')

X_anonymized_clean.head()


Unnamed: 0,Index,Share,Day,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,...,min_dt_TV1_TV2,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade
0,1,8,24,2.272727,8.333333,12.5,0.0,5117.8303,0.0,419.6885,...,,,,,,,,,,4
1,2,87,29,1.696629,25.166667,21.571429,0.0,1846.968401,7.4e-05,0.003374,...,,,,,,,,,,15
2,3,87,23,1.482759,47.3,118.25,0.0,686.30063,7.1e-05,0.000599,...,,,,,,,,,,63
3,4,12,11,1.705882,14.5,29.0,0.0,2174.335265,0.0,6.152666,...,,,,,,,,,,4
4,5,87,9,1.51773,26.75,,0.0,944.008551,7.1e-05,0.001364,...,,,,,,,,,,38


In [8]:
X_anonymized_clean.isna().sum()

Index                             0
Share                             0
Day                               0
OTR                            1683
OCR                               0
OMR                           47612
min_time_two_events               0
mean_time_two_events              0
10_p_time_two_events              0
med_time_two_events               0
25_p_time_two_events              0
75_p_time_two_events              0
90_p_time_two_events              0
max_time_two_events               0
min_lifetime_cancel               0
mean_lifetime_cancel              0
10_p_lifetime_cancel              0
med_lifetime_cancel               0
25_p_lifetime_cancel              0
75_p_lifetime_cancel              0
90_p_lifetime_cancel              0
max_lifetime_cancel               0
NbTradeVenueMic                   0
MaxNbTradesBySecond               0
MeanNbTradesBySecond              0
min_dt_TV1                     4235
mean_dt_TV1                    4235
med_dt_TV1                  

In [9]:
X_anonymized_clean.dropna(axis="columns", inplace=True)
X_anonymized_clean.head()

Unnamed: 0,Index,Share,Day,OCR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,...,10_p_lifetime_cancel,med_lifetime_cancel,25_p_lifetime_cancel,75_p_lifetime_cancel,90_p_lifetime_cancel,max_lifetime_cancel,NbTradeVenueMic,MaxNbTradesBySecond,MeanNbTradesBySecond,NbSecondWithAtLeatOneTrade
0,1,8,24,8.333333,0.0,5117.8303,0.0,419.6885,10.722543,984.32056,...,682.15326,984.32056,682.15326,23151.838,23151.838,23151.838,1,7,2.75,4
1,2,87,29,25.166667,0.0,1846.968401,7.4e-05,0.003374,0.000204,8.768699,...,11.866026,177.93991,73.74323,808.2623,21433.684,21433.684,1,17,5.933333,15
2,3,87,23,47.3,0.0,686.30063,7.1e-05,0.000599,0.000129,5.725427,...,2.761036,187.99548,19.77734,418.23984,1953.6235,10842.464,1,20,5.063492,63
3,4,12,11,14.5,0.0,2174.335265,0.0,6.152666,0.000945,62.444176,...,286.01932,286.01932,286.01932,19187.719,19187.719,19187.719,1,8,4.25,4
4,5,87,9,26.75,0.0,944.008551,7.1e-05,0.001364,0.000146,2.22542,...,2.798452,1345.9528,662.57434,21903.783,23164.514,23164.514,1,19,3.710526,38


In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_anonymized_clean, y_train_reshaped, train_size=0.7, random_state=15)

## Classifier & classification for each line

In [11]:
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

RandomForestClassifier()

In [12]:
print('{:0.2%}'.format(classifier.score(X_valid, y_valid)))

98.61%


Very good accuracy simply on the lines

## Building the prediction at csv format

In [13]:
X_test = X_test_original.copy()

X_test.drop(columns='Trader', inplace=True)
X_test["Share"] = pd.to_numeric(X_test["Share"].str[5::], downcast='integer')
X_test["Day"] = pd.to_numeric(X_test["Day"].str[5::], downcast='integer')
X_test.dropna(axis="columns", inplace=True)


y_test = classifier.predict(X_test)

In [14]:
X_test_pred = X_test_original.copy()
X_test_pred['pred'] = y_test
X_test_pred.head()

Unnamed: 0,Index,Share,Day,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,...,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade,pred
0,1,Isin_8,Date_24,Trader_10,2.272727,8.333333,12.5,0.0,5117.8303,0.0,...,,,,,,,,,4,NON HFT
1,2,Isin_87,Date_29,Trader_10,1.696629,25.166667,21.571429,0.0,1846.968401,7.4e-05,...,,,,,,,,,15,NON HFT
2,3,Isin_87,Date_23,Trader_10,1.482759,47.3,118.25,0.0,686.30063,7.1e-05,...,,,,,,,,,63,NON HFT
3,4,Isin_12,Date_11,Trader_10,1.705882,14.5,29.0,0.0,2174.335265,0.0,...,,,,,,,,,4,NON HFT
4,5,Isin_87,Date_9,Trader_10,1.51773,26.75,,0.0,944.008551,7.1e-05,...,,,,,,,,,38,NON HFT


In [15]:
# hft_preds = X_test_pred[X_test_pred["pred"] == 'HFT'].groupby(by='Trader').count().unstack(fill_value=0)
# mix_preds = X_test_pred[X_test_pred["pred"] == 'MIX'].groupby(by='Trader').count()["Index"]
# non_HFT_preds = X_test_pred[X_test_pred["pred"] == 'NON HFT'].groupby(by='Trader').count()["Index"]
# traders_list = [X_test_pred['Trader'].unique()]

# midx = pd.MultiIndex.from_product(traders_list, names=['Trader'])
# Because group by does not take into account the count 0s, we need to add them now

# hft_preds = hft_preds.reindex(midx, fill_value=0)

In [16]:
traders_list = np.unique(X_test_pred["Trader"])
preds_list = []

for trader in traders_list:
    df_reduced = X_test_pred[X_test_pred['Trader'] == trader]["pred"]
    counts = df_reduced.value_counts()

    # Non HFT
    try:
        non_hft_count = counts["NON HFT"]
    except:
        non_hft_count = 0

    # HFT
    try:
        hft_count = counts["HFT"]
    except:
        hft_count = 0

    # MIX
    try:
        mix_count = counts["MIX"]
    except:
        mix_count = 0

    total_count = hft_count + mix_count + non_hft_count
    if hft_count / total_count >= 0.85:
        pred = "HFT"
    elif mix_count / total_count >= 0.5:
        pred = "MIX"
    else:
        pred = "NON HFT"

    preds_list.append(pred)

In [17]:
data = {"Predictions": preds_list, "Traders": traders_list}
output = pd.DataFrame(data).set_index("Traders")
output

Unnamed: 0_level_0,Predictions
Traders,Unnamed: 1_level_1
Trader_10,NON HFT
Trader_105,NON HFT
Trader_107,MIX
Trader_110,NON HFT
Trader_114,NON HFT
...,...
Trader_51,NON HFT
Trader_54,NON HFT
Trader_57,NON HFT
Trader_59,NON HFT
