### Load Packages

In [1]:
# primary EDA code
import pandas as pd
import numpy as np
import os

# packages for plots
import matplotlib.pyplot as plt
import seaborn as sns

# warning ignore
import warnings
warnings.filterwarnings("ignore")

# packages for statistics
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

# packages for fft
import spectrum
from spectrum import Periodogram, data_cosine

# packages for ML
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# packages for self-made logistic regression
from scipy.optimize import fmin_tnc

### Load Data & Split into parts

In [2]:
df_gp = pd.DataFrame()
for filename in os.listdir('data/buy/'):
    if filename.endswith(".txt"): 
        stock = filename[0:4]
        # print("Loading stock data:", stock, ",")
        temp_df = pd.read_csv(os.path.join('data/buy/', filename), delimiter= '\s+', header = None)
        temp_df.rename(columns={123:'rtn'}, inplace=True)
        temp_df = pd.concat([pd.Series([stock] * temp_df.shape[0], name = 'stock'), temp_df], axis=1)
        temp_df = temp_df.iloc[21:,:]
        df_gp = pd.concat([df_gp, temp_df])
        continue
    else:
        continue
        
df_gp = df_gp.reset_index(drop=True)

In [3]:
name_gp = df_gp.iloc[:, 0]
osc_gp = df_gp.iloc[:, 1:42]
stk_gp = df_gp.iloc[:, 42:83]
macd_gp = df_gp.iloc[:, 83:124]
rtn_gp = df_gp.iloc[:, 124]
label_gp = np.sign(rtn_gp).map({1: 1, -1: 0, 0:0})
results_gp = label_gp.map({1: 'EARN', 0: 'LOSS'})

### Data Wrangling - add Smoothiness & PSD

In [4]:
# define function to calculate smoothness
def smooth_generator(data):
    smooth_list = []
    for i in range(data.shape[0]):
        smooth_list.append(np.var(abs(np.diff(data.iloc[i,:]))))
    smooth = pd.DataFrame(smooth_list, columns=["smooth"]) 

    return smooth

In [5]:
# define function for psd calculation
def psd_generator(data, NFFT = 100, name = "osc"):
    freq = []
    for i in range(data.shape[0]):
        data_osc = data.iloc[i,:]
        p = Periodogram(data_osc, NFFT=NFFT)
        temp_list = list(p.psd)
        freq.append(temp_list)
    col_name = []
    for i in range(int(NFFT/2)+1):
        col_name.append("freq"+str(i))
    
    psd_df = pd.DataFrame(freq, columns=col_name) 
    return psd_df

In [6]:
# define function for numerical differentiation
def derivative(data, space = 1, name = "macd"):
    dy = []
    for i in range(data.shape[0]):
        y = pd.Series(data.iloc[i,:])
        temp_dy = list(np.gradient(y, space))
        dy.append(temp_dy)
    
    col_name = []
    for i in range(data.shape[1]):
        col_name.append(name + "deriv"+ str(i))
        
    deriv_df = pd.DataFrame(dy, columns=col_name) 
    
    return deriv_df

In [7]:
# Factor for smoothness
smooth_osc = smooth_generator(osc_gp)
# calculate the dy for macd
first_deriv_macd = derivative(macd_gp)    
# calculate the ddy for macd
second_deriv_macd = derivative(first_deriv_macd) 
# calculate the dy for osc (way to study curvature)
first_deriv_osc = derivative(osc_gp)

### Combine Selected Features

In [8]:
Feature_matrix_w_rtn = pd.concat([rtn_gp, osc_gp, smooth_osc, first_deriv_macd, second_deriv_macd, first_deriv_osc], axis=1)

In [66]:
#Split test set
X_w_rtn, X_test_w_rtn, y, y_test = train_test_split(Feature_matrix_w_rtn, label_gp, test_size=0.2)

#Separate returns from Feature matrix
X_test = X_test_w_rtn.iloc[:, 1:]
rtn_test = X_test_w_rtn.iloc[:, 0]

#reset indices on all
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
rtn_test = rtn_test.reset_index(drop=True)

In [67]:
#Split train/validation set
X_train_w_rtn, X_valid_w_rtn, y_train, y_valid = train_test_split(X_w_rtn, y, test_size=0.2)

#Separate returns from Feature matrix
X_train = X_train_w_rtn.iloc[:, 1:]
X_valid = X_valid_w_rtn.iloc[:, 1:]

rtn_train = X_train_w_rtn.iloc[:, 0]
rtn_valid = X_valid_w_rtn.iloc[:, 0]

#reset indices on all
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
rtn_train = rtn_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)
rtn_valid = rtn_valid.reset_index(drop=True)

### Random Forest Classifier - Log Proba Output

In [72]:
# Fit Logistic Regression
lr = RandomForestClassifier()
lr.fit(X_train, y_train)
#print("Training accuracy: ", round(lr.score(X_train, y_train),4))
#print("Validation accuracy: ", round(lr.score(X_valid, y_valid),4))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [70]:
# Extracts the probabilities of being a good trade
predicitions = lr.predict_log_proba(X_valid)
pred_extracts = []
for prob in range(len(predicitions)):
    pred_extracts.append(-predicitions[prob][0])

# Classify the Classes of the Trades
# Log Proba Inf - Excellent
# Log Proba 2.3 or above - Great
# Log Proba 1.5 - 2.2 - Good
# Log Proba < 1.5 - Average

trade_classes = ['Average'] * len(pred_extracts)
for i in range(len(pred_extracts)):
    if pred_extracts[i] == np.inf:
        trade_classes[i] = 'Excellent'
    elif pred_extracts[i] > 2.2:
        trade_classes[i] = 'Great'
    elif pred_extracts[i] > 1.5:
        trade_classes[i] = 'Good'
    
results = pd.concat([pd.DataFrame(pred_extracts), pd.DataFrame(trade_classes), y_valid, rtn_valid], axis =1)
results.columns = ['prob', 'trade_class', 'label', 'return']


### Results Summary - Validation Set

In [77]:
results_summary = pd.concat([results.groupby(['trade_class']).mean()['label'], results.groupby(['trade_class']).count()['label'], results.groupby(['trade_class']).mean()['return']],axis=1)
results_summary.columns = ['WinRate', 'Count', 'Avg. Return']
results_summary['% of All Trades'] = np.round(results_summary['Count']/np.sum(results_summary['Count']),4)*100 
results_summary.reindex(['Excellent', 'Great', 'Good', 'Average'])


Unnamed: 0_level_0,WinRate,Count,Avg. Return,% of All Trades
trade_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Excellent,0.976744,43,1.776279,0.26
Great,0.704545,132,1.012652,0.78
Good,0.552023,346,0.333295,2.06
Average,0.402171,16309,0.039046,96.9
