In [11]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import os
import seaborn as sns
import sys

filepath_functions = os.path.join(os.path.abspath('..'), 'Python files')
sys.path.insert(1, filepath_functions)

import Adrien_cleaning


In [12]:
# All input CSV files are read from the "Files" directory next to the notebook's parent folder
files_dir = os.path.join(os.path.abspath('..'), 'Files')
filepath_X_train = os.path.join(files_dir, 'AMF_train_X.csv')
filepath_y_train = os.path.join(files_dir, 'AMF_train_Y.csv')
filepath_X_test = os.path.join(files_dir, 'AMF_test_X.csv')

# The feature tables are indexed by their "Index" column; the label table keeps its default index
X_train_original = pd.read_csv(filepath_X_train).set_index("Index", drop=True)
y_train_original = pd.read_csv(filepath_y_train)
X_test_original = pd.read_csv(filepath_X_test).set_index("Index", drop=True)

In [13]:
# Looking at the data
# `DataFrame.info()` writes the summary to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
X_train_original.info()
X_train_original.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105782 entries, 1 to 105782
Data columns (total 38 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Share                       105782 non-null  object 
 1   Day                         105782 non-null  object 
 2   Trader                      105782 non-null  object 
 3   OTR                         104099 non-null  float64
 4   OCR                         105782 non-null  float64
 5   OMR                         58170 non-null   float64
 6   min_time_two_events         105782 non-null  float64
 7   mean_time_two_events        105782 non-null  float64
 8   10_p_time_two_events        105782 non-null  float64
 9   med_time_two_events         105782 non-null  float64
 10  25_p_time_two_events        105782 non-null  float64
 11  75_p_time_two_events        105782 non-null  float64
 12  90_p_time_two_events        105782 non-null  float64
 13  max_time_two_e

Unnamed: 0_level_0,Share,Day,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,...,min_dt_TV1_TV2,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Isin_8,Date_24,Trader_10,2.272727,8.333333,12.5,0.0,5117.8303,0.0,419.6885,...,,,,,,,,,,4
2,Isin_87,Date_29,Trader_10,1.696629,25.166667,21.571429,0.0,1846.968401,7.4e-05,0.003374,...,,,,,,,,,,15
3,Isin_87,Date_23,Trader_10,1.482759,47.3,118.25,0.0,686.30063,7.1e-05,0.000599,...,,,,,,,,,,63
4,Isin_12,Date_11,Trader_10,1.705882,14.5,29.0,0.0,2174.335265,0.0,6.152666,...,,,,,,,,,,4
5,Isin_87,Date_9,Trader_10,1.51773,26.75,,0.0,944.008551,7.1e-05,0.001364,...,,,,,,,,,,38


In [7]:
# Dimensions of the feature and label tables, and number of distinct traders in the features
n_traders = X_train_original["Trader"].nunique(dropna=False)
summary_lines = [
    ("Shape of X_train:", X_train_original.shape),
    ("Shape of y_train:", y_train_original.shape),
    ("# traders in the dataset =", n_traders),
]
for label, value in summary_lines:
    print(label, value, '\n')
y_train_original.head()

Shape of X_train: (105782, 38) 

Shape of y_train: (86, 2) 

# traders in the dataset = 86 



Unnamed: 0,Trader,type
0,Trader_285,MIX
1,Trader_114,NON HFT
2,Trader_110,NON HFT
3,Trader_57,NON HFT
4,Trader_128,NON HFT


In [11]:
# Attach each trader's label ("type") to every feature row of that trader
X_with_label = X_train_original.merge(y_train_original, how='left', on="Trader")

# Rows that repeat an earlier row exactly (features and label); kept aside for inspection
is_duplicate = X_with_label.duplicated()
X_with_label_dups = X_with_label[is_duplicate]

# Keep only the first occurrence of each row, in the labelled training data and in the test data
X_with_label = X_with_label.drop_duplicates()
X_test_original_no_dups = X_test_original.drop_duplicates()

# Label distribution among the removed duplicates, then among the remaining rows
print(X_with_label_dups.value_counts("type"), '\n')
print(X_with_label.value_counts("type"))

type
MIX        10178
HFT         9414
NON HFT      116
dtype: int64 

type
MIX        41305
HFT        22536
NON HFT    22233
dtype: int64


## Cleaning the dataset

In [None]:
# Convert Share ("Isin_<n>") and Day ("Date_<n>") to integers by stripping the
# 5-character prefix and parsing the remaining digits.
# TODO one-hot encoding in pipeline
X_with_label_clean = X_with_label.assign(
    Share=pd.to_numeric(X_with_label["Share"].str[5:], downcast='integer'),
    Day=pd.to_numeric(X_with_label["Day"].str[5:], downcast='integer'),
)
X_with_label_clean.head()


In [None]:
# Number of missing values in each column
X_with_label_clean.isnull().sum()

## Correcting the NAs

In [None]:
# Build the feature matrix from the cleaned data: the trader identifier and the label
# are removed so that only model inputs remain. Defining it here makes the notebook
# runnable from a fresh kernel (the name was previously used without being created).
X_anonymized_clean = X_with_label_clean.drop(columns=["Trader", "type"])

# OTR and OMR contain missing values; replace them with 0
X_anonymized_clean[["OTR", "OMR"]] = X_anonymized_clean[["OTR", "OMR"]].fillna(value=0)

### OTR, OCR and OMR

In [None]:
# Sum of the three ratios, then each ratio expressed as a fraction of that sum
total_ratio = X_anonymized_clean["OTR"].add(X_anonymized_clean["OMR"]).add(X_anonymized_clean["OCR"])
X_anonymized_clean["Total OR"] = total_ratio

for ratio_name in ("OTR", "OMR", "OCR"):
    X_anonymized_clean[f"{ratio_name}_new"] = X_anonymized_clean[ratio_name] / X_anonymized_clean["Total OR"]

X_anonymized_clean.head()

In [None]:
# Number of missing values in each feature column
X_anonymized_clean.isnull().sum()

In [None]:
# Remove, in place, every column that contains at least one missing value
X_anonymized_clean.dropna(axis=1, how="any", inplace=True)
X_anonymized_clean.head()

In [None]:
# Anonymizing X by getting rid of Trader and Type

y_train_reshaped = X_with_label['type']
X_anonymized = X_with_label.drop(columns=['Trader', 'type'])
X_anonymized.head()

In [None]:
# Row-level 70 % / 30 % split into training and validation sets, with a fixed seed
split_parts = train_test_split(
    X_anonymized_clean,
    y_train_reshaped,
    train_size=0.7,
    random_state=15,
)
X_train, X_valid, y_train, y_valid = split_parts

## Classifier & classification for each line

In [None]:
# Random forest fitted on individual rows. The seed is fixed so that the fitted
# model, its validation score and the final predictions are reproducible between runs.
classifier = RandomForestClassifier(random_state=15)
classifier.fit(X_train, y_train)

In [None]:
# Mean accuracy on the validation rows, shown as a percentage with two decimals
validation_accuracy = classifier.score(X_valid, y_valid)
print(f'{validation_accuracy:0.2%}')

The classifier already reaches very good accuracy when each line (row) is classified individually.

## Building the predictions in CSV format

In [None]:
# Prepare the test set with the same steps as the training features, so that the columns
# passed to `classifier.predict` are the ones the classifier was fitted on.
# NOTE(review): assumes the test file has the same raw columns as the training file - confirm.
X_test = X_test_original.drop(columns='Trader')
X_test["Share"] = pd.to_numeric(X_test["Share"].str[5:], downcast='integer')
X_test["Day"] = pd.to_numeric(X_test["Day"].str[5:], downcast='integer')

# Same missing-value handling and ratio features as for the training set
X_test[["OTR", "OMR"]] = X_test[["OTR", "OMR"]].fillna(value=0)
X_test["Total OR"] = X_test["OTR"] + X_test["OMR"] + X_test["OCR"]
for ratio_name in ("OTR", "OMR", "OCR"):
    X_test[f"{ratio_name}_new"] = X_test[ratio_name] / X_test["Total OR"]

# Keep exactly the training columns, in the training order. Dropping NaN-containing
# columns independently on the test set could select a different set of columns than
# the one used for fitting. A missing column raises a KeyError here rather than
# producing a silently misaligned feature matrix.
X_test = X_test[X_train.columns]

y_test = classifier.predict(X_test)

In [None]:
# Copy of the original test rows (Trader column included) with the row-level prediction attached
X_test_pred = X_test_original.assign(pred=y_test)
X_test_pred.head()

In [None]:
# hft_preds = X_test_pred[X_test_pred["pred"] == 'HFT'].groupby(by='Trader').count().unstack(fill_value=0)
# mix_preds = X_test_pred[X_test_pred["pred"] == 'MIX'].groupby(by='Trader').count()["Index"]
# non_HFT_preds = X_test_pred[X_test_pred["pred"] == 'NON HFT'].groupby(by='Trader').count()["Index"]
# traders_list = [X_test_pred['Trader'].unique()]

# midx = pd.MultiIndex.from_product(traders_list, names=['Trader'])
# Because group by does not take into account the count 0s, we need to add them now

# hft_preds = hft_preds.reindex(midx, fill_value=0)

In [None]:
# Aggregate the row-level predictions into a single label per trader.
# A trader is labelled "HFT" when at least HFT_THRESHOLD of its rows are predicted "HFT",
# otherwise "MIX" when at least MIX_THRESHOLD of its rows are predicted "MIX",
# otherwise "NON HFT".
HFT_THRESHOLD = 0.85
MIX_THRESHOLD = 0.5

traders_list = np.unique(X_test_pred["Trader"])
preds_list_AMF_calc = []
non_hft_percent = []
hft_percent = []
mix_percent = []

for trader in traders_list:
    counts = X_test_pred.loc[X_test_pred['Trader'] == trader, "pred"].value_counts()

    # `Series.get` returns the default (0) for a class never predicted for this trader,
    # which replaces the former bare `except:` blocks that could hide unrelated errors.
    non_hft_count = counts.get("NON HFT", 0)
    hft_count = counts.get("HFT", 0)
    mix_count = counts.get("MIX", 0)

    # Fractions (between 0 and 1) of this trader's rows predicted in each class
    total_count = hft_count + mix_count + non_hft_count
    hft_share = hft_count / total_count
    mix_share = mix_count / total_count
    non_hft_share = non_hft_count / total_count

    hft_percent.append(hft_share)
    mix_percent.append(mix_share)
    non_hft_percent.append(non_hft_share)

    if hft_share >= HFT_THRESHOLD:
        pred = "HFT"
    elif mix_share >= MIX_THRESHOLD:
        pred = "MIX"
    else:
        pred = "NON HFT"

    preds_list_AMF_calc.append(pred)

In [None]:
# Per-trader summary: final label plus the fraction of rows predicted in each class
output_table = {}
output_table["Predictions"] = preds_list_AMF_calc
output_table["percent days HFT"] = hft_percent
output_table["percent days mix"] = mix_percent
output_table["percent days non hft"] = non_hft_percent
output_table["Traders"] = traders_list

# One row per trader, indexed by trader identifier, written to CSV for inspection
output_viz_table = pd.DataFrame(output_table)
output_viz_table = output_viz_table.set_index("Traders")
output_viz_table.to_csv('First output visualising.csv')

In [None]:
# Final per-trader predictions paired with the corresponding trader identifiers
data = dict(Predictions=preds_list_AMF_calc, Traders=traders_list)

