In [2]:
%matplotlib inline
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import os

In [3]:
filepath_X_train = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_X.csv')
filepath_y_train = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_Y.csv')
filepath_X_test = os.path.join(os.path.abspath('..'), 'Files', 'AMF_train_X.csv')


X_train_original = pd.read_csv(filepath_X_train)
y_train_original = pd.read_csv(filepath_y_train)
X_test_original = pd.read_csv(filepath_X_test)

In [4]:
X_with_label = pd.merge(X_train_original, y_train_original, how='left', on="Trader")
y_train_reshaped = X_with_label['type']
X_anonymized = X_with_label.drop(columns=['Trader', 'type'])
X_anonymized["Share"] = pd.to_numeric(X_anonymized["Share"].str[5::], downcast='integer')
X_anonymized["Day"] = pd.to_numeric(X_anonymized["Day"].str[5::], downcast='integer')
X_anonymized.dropna(axis="columns", inplace=True)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_anonymized, y_train_reshaped, train_size=0.7, random_state=15)

classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

y_train_pred = classifier.predict(X_train)


In [7]:
X_test = X_test_original.copy()

X_test.drop(columns='Trader', inplace=True)
X_test["Share"] = pd.to_numeric(X_test["Share"].str[5::], downcast='integer')
X_test["Day"] = pd.to_numeric(X_test["Day"].str[5::], downcast='integer')
X_test.dropna(axis="columns", inplace=True)

y_test_first = classifier.predict(X_test)
X_test_first_pred = X_test_original.copy()
X_test_first_pred['pred'] = y_test_first

In [8]:
def pred_from_global_list_to_csv(X_test_with_pred):
    traders_list = np.unique(X_test_with_pred["Trader"])
    preds_list_AMF_calc = []
    non_hft_percent = []
    hft_percent = []
    mix_percent = []
    for trader in traders_list:
        df_reduced = X_test_with_pred[X_test_with_pred['Trader'] == trader]["pred"]
        counts = df_reduced.value_counts()

        # Non HFT
        try:
            non_hft_count = counts["NON HFT"]
        except:
            non_hft_count = 0

        # HFT
        try:
            hft_count = counts["HFT"]
        except:
            hft_count = 0

        # MIX
        try:
            mix_count = counts["MIX"]
        except:
            mix_count = 0

        total_count = hft_count + mix_count + non_hft_count
        hft_percent.append(hft_count / total_count)
        mix_percent.append(mix_count / total_count)
        non_hft_percent.append(non_hft_count / total_count)

        if hft_count / total_count >= 0.85:
            pred = "HFT"
        elif mix_count / total_count >= 0.5:
            pred = "MIX"
        else:
            pred = "NON HFT"

        preds_list_AMF_calc.append(pred)

    dic_percentages = {
        "Predictions": preds_list_AMF_calc, 
        "percent days HFT": hft_percent,
        "percent days mix": mix_percent,
        "percent days non hft": non_hft_percent,
        "Trader": traders_list}

    output_percentages_table = pd.DataFrame(dic_percentages).set_index("Trader")

    dic_output_AMF = {"Predictions": preds_list_AMF_calc, "Trader": traders_list}
    output_AMF_calcs = pd.DataFrame(dic_output_AMF).set_index("Trader")
    
    return output_percentages_table, output_AMF_calcs

In [9]:
first_percentages, first_output = pred_from_global_list_to_csv(X_test_first_pred)

In [10]:
# Comparing all outputs to see where they can be issues

X_comparison_preds = pd.merge(first_percentages, y_train_original, how='left', on="Trader")

In [11]:
X_comparison_preds[X_comparison_preds["Predictions"] != X_comparison_preds["type"]]

Unnamed: 0,Trader,Predictions,percent days HFT,percent days mix,percent days non hft,type
62,Trader_385,NON HFT,0.375,0.125,0.5,HFT


In [None]:
X_new = X_comparison_preds