## Packages

In [20]:
# primary EDA code
import pandas as pd
import numpy as np
import os
import random

# packages for plots
import matplotlib.pyplot as plt
import seaborn as sns

# warning ignore
import warnings
warnings.filterwarnings("ignore")

# packages for fft
import spectrum
from spectrum import Periodogram

# packages for ML
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import KFold


## Data Import

In [21]:
def data_import(folder_path, train_frac=0.95, warmup_rows=21):
    """Load every ``.txt`` file in ``folder_path`` and split it into train/test frames.

    Each file is read as a header-less, whitespace-delimited table that must
    hold 124 columns: 41 ``osc`` values, 41 ``stk`` values, 41 ``macd`` values
    and one return. The first four characters of the file name are stored in a
    leading ``stock`` column. Files that cannot be read, or that have a
    different number of columns, are reported on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Directory to scan (non-recursive).
    train_frac : float, default 0.95
        Fraction of each file's rows (from the top) that belongs to the
        training range; the remaining rows form the test set.
    warmup_rows : int, default 21
        Number of leading rows of each file excluded from the training set
        (presumably an indicator warm-up period - TODO confirm).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with columns ``stock``, ``osc0..osc40``,
        ``stk0..stk40``, ``macd0..macd40``, ``rtn`` and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file in the folder could be loaded.
    """
    n_lags = 41             # columns per indicator family
    rtn_col = 3 * n_lags    # position of the return column in a raw file (123)

    headers = (
        ['stock']
        + ['osc' + str(i) for i in range(n_lags)]
        + ['stk' + str(i) for i in range(n_lags)]
        + ['macd' + str(i) for i in range(n_lags)]
        + ['rtn']
    )

    train_parts = []
    test_parts = []
    # Sorted so the row order of the result does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        stock = filename[0:4]
        file_path = os.path.join(folder_path, filename)
        try:
            temp_df = pd.read_csv(file_path, delimiter=r'\s+', header=None)
            if temp_df.shape[1] != rtn_col + 1:
                # A file of another width would add all-NaN columns to the
                # combined frame and make dropna() discard every other row.
                raise ValueError('expected ' + str(rtn_col + 1) + ' columns, found ' + str(temp_df.shape[1]))
            temp_df = temp_df.rename(columns={rtn_col: 'rtn'})
            temp_df.insert(0, 'stock', stock)
            # Split the train-test sets
            num_split = int(len(temp_df) * train_frac)
            train_parts.append(temp_df.iloc[warmup_rows:num_split, :])
            test_parts.append(temp_df.iloc[num_split:, :])
        except Exception as exc:
            # Best effort: report the file and carry on with the others.
            print('The following file cannot be read: ' + file_path + ' (' + repr(exc) + ')')

    if not train_parts:
        raise ValueError('No readable .txt files found in ' + str(folder_path))

    def _finalise(parts):
        # Concatenate once (not inside the loop), coerce, then drop incomplete rows.
        frame = pd.concat(parts, ignore_index=True)
        # Set all columns except stock names to numerics
        for col in frame.columns:
            if col != 'stock':
                frame[col] = pd.to_numeric(frame[col], errors='coerce')
        # Drop NaN values AFTER coercion so that unparsable cells are removed too
        frame = frame.dropna()
        frame.columns = headers
        return frame.reset_index(drop=True)

    df_gp = _finalise(train_parts)
    df_test_gp = _finalise(test_parts)

    return df_gp, df_test_gp

In [22]:
# Sell side: load the files, then flip the sign of the stored return
df_sell_gp, df_sell_test_gp = data_import('data/sell/')
for _sell_frame in (df_sell_gp, df_sell_test_gp):
    _sell_frame['rtn'] = _sell_frame['rtn'] * -1

# Buy side: returns are used as stored
df_buy_gp, df_buy_test_gp = data_import('data/buy/')

# Stack buy rows on top of sell rows and renumber the index
df_gp = pd.concat([df_buy_gp, df_sell_gp]).reset_index(drop=True)
df_test_gp = pd.concat([df_buy_test_gp, df_sell_test_gp]).reset_index(drop=True)


def _pct(mask):
    """Share of True entries in ``mask`` as a percentage rounded to 2 decimals."""
    return round(np.sum(mask)/len(mask) * 100, 2)


# Class balance (a return of exactly zero is counted as positive here)
print('Positive returns - train:', _pct(df_gp.rtn >= 0), '%')
print('Negative returns - train:', _pct(df_gp.rtn < 0), '%')

print('Positive returns - test:', _pct(df_test_gp.rtn >= 0), '%')
print('Negative returns - test:', _pct(df_test_gp.rtn < 0), '%')

Positive returns - train: 52.11 %
Negative returns - train: 47.89 %
Positive returns - test: 51.51 %
Negative returns - test: 48.49 %


In [4]:
def data_organize(df_gp):
    """Split a combined frame into its positional column groups and derive labels.

    The frame is sliced by position: column 0 is the stock name, columns 1-41
    the ``osc`` block, 42-82 the ``stk`` block, 83-123 the ``macd`` block and
    column 124 the return.

    Parameters
    ----------
    df_gp : pandas.DataFrame
        Frame with at least 125 columns in the layout described above.

    Returns
    -------
    tuple
        ``(name_gp, osc_gp, stk_gp, macd_gp, rtn_gp, label_gp, results_gp)``
        where ``label_gp`` is 1 for a strictly positive return and 0 for a
        zero or negative return, and ``results_gp`` maps 1/0 to
        ``'EARN'``/``'LOSS'``.

    Raises
    ------
    IndexError
        If the frame has fewer than 125 columns. (Previously the error was
        printed and then hidden behind an UnboundLocalError at ``return``.)
    """
    n_required = 125
    if df_gp.shape[1] < n_required:
        raise IndexError(
            'Please check the dataframe index: expected at least '
            + str(n_required) + ' columns, got ' + str(df_gp.shape[1])
        )

    name_gp = df_gp.iloc[:, 0]
    osc_gp = df_gp.iloc[:, 1:42]
    stk_gp = df_gp.iloc[:, 42:83]
    macd_gp = df_gp.iloc[:, 83:124]
    rtn_gp = df_gp.iloc[:, 124]
    # sign() yields -1/0/1; zero returns are deliberately mapped to the 0 class
    label_gp = np.sign(rtn_gp).map({1: 1, -1: 0, 0:0})
    results_gp = label_gp.map({1: 'EARN', 0: 'LOSS'})

    return name_gp, osc_gp, stk_gp, macd_gp, rtn_gp, label_gp, results_gp

In [23]:
# Training rows: split into name / indicator blocks / return / labels
(
    name_gp,
    osc_gp,
    stk_gp,
    macd_gp,
    rtn_gp,
    label_gp,
    results_gp,
) = data_organize(df_gp)

# Held-out rows: same split
(
    name_test_gp,
    osc_test_gp,
    stk_test_gp,
    macd_test_gp,
    rtn_test_gp,
    label_test_gp,
    results_test_gp,
) = data_organize(df_test_gp)

## Feature Engineering

In [24]:
def smooth_generator(data):
    """Return, per row, the population variance of the row's first differences.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame; each row is treated as one sequence.

    Returns
    -------
    pandas.DataFrame
        Single column ``"smooth"`` with one value per input row and a fresh
        0..n-1 index (the input index is not carried over).
    """
    values = np.asarray(data, dtype=float)
    # Difference along each row, then reduce each row in one vectorised call
    # instead of looping over rows with iloc.
    step_variance = np.var(np.diff(values, axis=1), axis=1)
    smooth = pd.DataFrame(step_variance, columns=["smooth"])

    return smooth

def derivative(data, space = 1, name = "macd"):
    """Numerical gradient of every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame; each row is differentiated along its columns.
    space : scalar or array-like, default 1
        Forwarded to ``numpy.gradient`` as the sample spacing.
    name : str, default "macd"
        Prefix of the output column names.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``data``, columns ``<name>deriv0 .. <name>deriv{k-1}``
        and a fresh 0..n-1 index.
    """
    values = np.asarray(data, dtype=float)
    # One gradient call along axis 1 replaces the per-row Python loop.
    gradients = np.gradient(values, space, axis=1)

    col_name = [name + "deriv" + str(i) for i in range(data.shape[1])]
    deriv_df = pd.DataFrame(gradients, columns=col_name)

    return deriv_df

def psd_generator(data, NFFT = 100, name = "osc"):
    """Build a frame of per-row ``Periodogram(...).psd`` values.

    For every row of ``data`` a ``Periodogram`` is constructed with the given
    ``NFFT`` and its ``psd`` attribute is stored as one output row. Columns
    are labelled ``psd0 .. psd{NFFT//2}``, so ``psd`` is assumed to have
    ``NFFT/2 + 1`` entries (TODO confirm against the ``spectrum`` package).

    ``name`` is accepted but not used by this function.
    """
    spectra = [
        list(Periodogram(data.iloc[row, :], NFFT=NFFT).psd)
        for row in range(data.shape[0])
    ]
    labels = ["psd"+str(k) for k in range(int(NFFT/2)+1)]

    return pd.DataFrame(spectra, columns=labels)

def volatility(data):
    """Return the population variance (``ddof=0``) of every row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame; each row is one sequence.

    Returns
    -------
    pandas.DataFrame
        Single column ``"volatility"`` with one value per input row and a
        fresh 0..n-1 index.
    """
    # Row-wise reduction in one call instead of a Python loop over iloc rows.
    # pandas' var is used (as np.var on a Series did) so missing values are
    # skipped rather than propagated.
    row_variance = data.var(axis=1, ddof=0)
    vol = pd.DataFrame(np.asarray(row_variance, dtype=float), columns=["volatility"])
    return vol

def amplitude(data):
    """Variance of each row's first differences divided by the row's mean absolute value.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame; each row is one sequence.

    Returns
    -------
    pandas.DataFrame
        Single column ``"amplitude"`` with one value per input row and a
        fresh 0..n-1 index. A row whose mean absolute value is zero yields
        NaN or inf, since the division is not guarded.
    """
    values = np.asarray(data, dtype=float)
    # Numerator: population variance of the step-to-step changes, per row.
    step_variance = np.var(np.diff(values, axis=1), axis=1)
    # Denominator: pandas mean (skips missing values, as np.mean on a Series did).
    mean_abs = np.asarray(data.abs().mean(axis=1), dtype=float)
    amp = pd.DataFrame(step_variance / mean_abs, columns=["amplitude"])
    return amp

In [25]:
# Train Set
# Features derived from the oscillator block
smooth_osc = smooth_generator(data=osc_gp)
psd_osc = psd_generator(data=osc_gp)

# Feature derived from the stk block
vol_stk = volatility(data=stk_gp)

# First and second numerical derivatives of the MACD block
first_deriv_macd = derivative(data=macd_gp, name="macd_1st")
second_deriv_macd = derivative(data=first_deriv_macd, name="macd_2nd")

In [26]:
# Test Set
# Features derived from the oscillator block
smooth_osc_test = smooth_generator(data=osc_test_gp)
psd_osc_test = psd_generator(data=osc_test_gp)

# Feature derived from the stk block
vol_stk_test = volatility(data=stk_test_gp)

# First and second numerical derivatives of the MACD block
first_deriv_macd_test = derivative(data=macd_test_gp, name="macd_1st")
second_deriv_macd_test = derivative(data=first_deriv_macd_test, name="macd_2nd")

## Modeling

In [27]:
def _classify_trades(probabilities):
    """Map each win probability to a trade-class label using fixed cut-offs."""
    labels = []
    for p in probabilities:
        if p > 0.89:
            labels.append('Excellent Buy')
        elif p > 0.8:
            labels.append('Great Buy')
        elif p < 0.1:
            labels.append('Excellent Sell')
        elif p < 0.2:
            labels.append('Great Sell')
        else:
            labels.append('Average')
    return labels


def _summarise_trades(results, key):
    """Per-group win rate, trade count, mean return and share of all trades."""
    grouped = results.groupby([key])
    summary = pd.concat(
        [grouped['label'].mean(), grouped['label'].count(), grouped['return'].mean()],
        axis=1,
    )
    summary.columns = ['WinRate', 'Count', 'Avg. Return']
    summary['% of All Trades'] = np.round(summary['Count']/np.sum(summary['Count']),4)*100
    return summary


def random_forest_classifier(X_w_rtn, y, test_size=0.2, cv=2, random_state=None):
    """Cross-validate a soft-voting ensemble of three random forests.

    Parameters
    ----------
    X_w_rtn : pandas.DataFrame
        Feature matrix whose FIRST column is the trade return; the remaining
        columns are the model inputs.
    y : pandas.Series
        Binary labels aligned row-by-row with ``X_w_rtn``.
    test_size : float, default 0.2
        Not used; kept so existing calls keep working.
    cv : int, default 2
        Number of un-shuffled ``KFold`` splits.
    random_state : int or None, default None
        When an int is given the three forests are seeded with
        ``random_state``, ``random_state + 1`` and ``random_state + 2`` so a
        run is repeatable without the forests sharing identical trees.
        ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple
        ``(eclf, prob_summary, class_summary_mean, class_summary_std)``:
        the ensemble fitted in the LAST fold only (it has not seen that
        fold's validation rows), the fold-averaged summary per rounded
        probability, and the fold mean / standard deviation of the summary
        per trade class.
    """
    class_order = ['Excellent Buy', 'Great Buy', 'Average', 'Great Sell', 'Excellent Sell']

    if random_state is None:
        seeds = [None, None, None]
    else:
        seeds = [random_state + offset for offset in range(3)]

    values = X_w_rtn.values   # column 0 = return, columns 1.. = features
    targets = y.values

    # Collected per fold and concatenated once after the loop.
    prob_summaries = []
    class_summaries = []

    # n-fold Cross Validation
    kf = KFold(n_splits=cv)

    # Within each fold
    for train_index, valid_index in kf.split(values):
        # Separate returns from the feature columns
        X_train = values[train_index, 1:]
        X_valid = values[valid_index, 1:]
        y_train = targets[train_index]
        y_valid = pd.Series(targets[valid_index])
        rtn_valid = pd.Series(values[valid_index, 0])

        # Fit Random Forest
        rf1 = RandomForestClassifier(n_estimators = 10, random_state=seeds[0])
        rf2 = RandomForestClassifier(n_estimators = 15, random_state=seeds[1])
        rf3 = RandomForestClassifier(n_estimators = 20, random_state=seeds[2])

        eclf = VotingClassifier(estimators=[('rf1', rf1), ('rf2', rf2), ('rf3', rf3)], voting='soft', weights=[3,2,1], flatten_transform=True)
        eclf = eclf.fit(X_train, y_train)

        # Probability of the second class in eclf.classes_ (label 1 for 0/1 labels)
        proba_valid = eclf.predict_proba(X_valid)[:, 1]

        # Summary per probability rounded to one decimal
        results_prob = pd.DataFrame(
            {'prob': np.round(proba_valid, 1), 'label': y_valid, 'return': rtn_valid},
            columns=['prob', 'label', 'return'],
        )
        prob_summaries.append(_summarise_trades(results_prob, 'prob'))

        # Summary per trade class
        results = pd.DataFrame(
            {'prob': proba_valid, 'trade_class': _classify_trades(proba_valid), 'label': y_valid, 'return': rtn_valid},
            columns=['prob', 'trade_class', 'label', 'return'],
        )
        class_summaries.append(_summarise_trades(results, 'trade_class'))

    results_prob_summaries_collection = pd.concat(prob_summaries)
    results_summaries_collection = pd.concat(class_summaries)

    # Aggregate across folds; classes absent from every fold show up as NaN rows.
    results_summaries_collection_mean = results_summaries_collection.groupby(['trade_class']).mean().reindex(class_order)
    results_prob_summaries_collection = results_prob_summaries_collection.groupby(['prob']).mean()
    results_summaries_collection_std = results_summaries_collection.groupby(['trade_class']).std().reindex(class_order)
    results_summaries_collection_std.columns = ['WinRate(std)', 'Count(std)', 'Avg.Return(std)', '% of All Trades(std)']

    return eclf, results_prob_summaries_collection, results_summaries_collection_mean, results_summaries_collection_std

In [28]:
# Take the returns from df_gp['rtn'] (the same column rtn_gp was sliced from):
# rtn_gp is overwritten with the SHUFFLED returns at the end of this cell, so
# building from rtn_gp would pair shuffled returns with unshuffled features
# whenever the cell is run a second time.
Feature_matrix_w_rtn = pd.concat([df_gp['rtn'], smooth_osc, psd_osc, first_deriv_macd, second_deriv_macd, vol_stk], axis=1)
# Shuffle the Feature Matrix (fixed seed so the fold assignment is repeatable)
# NOTE(review): rows are shuffled before an un-shuffled KFold; if consecutive
# rows of one stock are overlapping windows, near-duplicates can land in both
# the training and validation folds and inflate the CV scores - confirm.
Feature_matrix_w_rtn = Feature_matrix_w_rtn.sample(frac=1, random_state=0).reset_index(drop=True)
rtn_gp = Feature_matrix_w_rtn.iloc[:, 0]
label_gp = np.sign(rtn_gp).map({1: 1, -1: 0, 0:0})

In [30]:
# Cross-validate the ensemble; `rf` is the model fitted in the last fold
(
    rf,
    results_prob_summaries_collection,
    results_summaries_collection,
    results_summaries_collection_std,
) = random_forest_classifier(Feature_matrix_w_rtn, label_gp)

# Fold-averaged summary per rounded probability
results_prob_summaries_collection

Unnamed: 0_level_0,WinRate,Count,Avg. Return,% of All Trades
prob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.005714,180.5,-0.608333,0.185
0.1,0.051144,437.5,-0.80331,0.445
0.2,0.295675,1598.0,-0.242986,1.62
0.3,0.430588,8141.0,0.034085,8.27
0.4,0.465067,25832.5,0.039862,26.245
0.5,0.491359,33735.5,0.024299,34.27
0.6,0.521177,21615.5,0.034312,21.96
0.7,0.550961,5802.5,0.053765,5.895
0.8,0.707277,855.0,0.593793,0.87
0.9,0.961751,185.5,1.716999,0.185


In [31]:
# Fold-averaged summary per trade class, as returned by random_forest_classifier
results_summaries_collection

Unnamed: 0_level_0,WinRate,Count,Avg. Return,% of All Trades
trade_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Excellent Buy,0.996094,128.0,1.668086,0.13
Great Buy,0.876964,349.5,1.309889,0.355
Average,0.488837,96841.0,0.034091,98.38
Great Sell,0.135939,786.5,-0.633341,0.8
Excellent Sell,0.019936,328.5,-0.742315,0.33


## Test Set

In [36]:
# Assemble the held-out feature matrix (returns in column 0) and shuffle its rows
Feature_matrix_w_rtn_test = pd.concat(
    [rtn_test_gp, smooth_osc_test, psd_osc_test, first_deriv_macd_test, second_deriv_macd_test, vol_stk_test],
    axis=1,
)
Feature_matrix_w_rtn_test = Feature_matrix_w_rtn_test.sample(frac=1).reset_index(drop=True)

# Column 0 is the return, everything after it is a model input
rtn_test = Feature_matrix_w_rtn_test.iloc[:, 0]
X_test = Feature_matrix_w_rtn_test.iloc[:, 1:]
label_test_gp = np.sign(rtn_test).map({1: 1, -1: 0, 0:0})

# Second predict_proba column = probability of being a good trade
log_proba_set_test = rf.predict_proba(X_test)
proba_test = [row[1] for row in log_proba_set_test]

# Summary per probability rounded to one decimal
results_prob_test = pd.concat([pd.DataFrame(proba_test), label_test_gp, rtn_test], axis=1)
results_prob_test.columns = ['prob', 'label', 'return']
results_prob_test['prob'] = np.round(results_prob_test['prob'], 1)

_by_prob = results_prob_test.groupby(['prob'])
results_prob_summary_test = pd.concat(
    [_by_prob['label'].mean(), _by_prob['label'].count(), _by_prob['return'].mean()],
    axis=1,
)
results_prob_summary_test.columns = ['WinRate', 'Count', 'Avg. Return']
results_prob_summary_test['% of All Trades'] = np.round(results_prob_summary_test['Count']/np.sum(results_prob_summary_test['Count']),4)*100


def _trade_class(p):
    """Trade-class label for one probability (same cut-offs as in cross-validation)."""
    if p > 0.89:
        return 'Excellent Buy'
    if p > 0.8:
        return 'Great Buy'
    if p < 0.1:
        return 'Excellent Sell'
    if p < 0.2:
        return 'Great Sell'
    return 'Average'


# Classify every held-out trade
trade_classes_test = [_trade_class(p) for p in proba_test]

# Per-trade results, alone and next to the features they were predicted from
test_results = pd.concat([pd.DataFrame(proba_test), pd.DataFrame(trade_classes_test), label_test_gp, rtn_test], axis=1)
test_results.columns = ['prob', 'trade_class', 'label', 'return']
test_summary = pd.concat([X_test, test_results], axis=1)

# Summary per trade class
_by_class = test_results.groupby(['trade_class'])
test_results_summary = pd.concat(
    [_by_class['label'].mean(), _by_class['label'].count(), _by_class['return'].mean()],
    axis=1,
)
test_results_summary.columns = ['WinRate', 'Count', 'Avg. Return']
test_results_summary['% of All Trades'] = np.round(test_results_summary['Count']/np.sum(test_results_summary['Count']),4)*100
test_results_summary.reindex(['Excellent Buy', 'Great Buy', 'Average', 'Great Sell', 'Excellent Sell'])


Unnamed: 0_level_0,WinRate,Count,Avg. Return,% of All Trades
trade_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Excellent Buy,,,,
Great Buy,0.428571,14.0,-0.095,0.13
Average,0.495201,10418.0,0.044884,99.7
Great Sell,0.352941,17.0,-0.305882,0.16
Excellent Sell,,,,


In [37]:
# Held-out summary per rounded probability.
# NOTE(review): the win rates shown here are far closer to 50% than in the
# cross-validation table above (e.g. the 0.9 bucket); check the CV setup for
# leakage between folds before relying on the CV figures.
results_prob_summary_test

Unnamed: 0_level_0,WinRate,Count,Avg. Return,% of All Trades
prob,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.2,0.386667,75,-0.212,0.72
0.3,0.43879,727,-0.020041,6.96
0.4,0.473857,2735,0.056527,26.17
0.5,0.506101,3770,0.067183,36.08
0.6,0.513992,2430,0.060914,23.26
0.7,0.514241,632,-0.130443,6.05
0.8,0.564103,78,0.230641,0.75
0.9,0.5,2,0.045,0.02
