In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC

In [2]:
data_2014_raw = pd.read_csv("train/2014_Financial_Data.csv")
data_2014_raw["Year"] = "2014"
data_2014_raw.rename(columns={"2015 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2014_raw["Identifier"] = data_2014_raw["Ticker"] + "-" + data_2014_raw["Year"]

data_2015_raw = pd.read_csv("train/2015_Financial_Data.csv")
data_2015_raw["Year"] = "2015"
data_2015_raw.rename(columns={"2016 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2015_raw["Identifier"] = data_2015_raw["Ticker"] + "-" + data_2015_raw["Year"]

data_2016_raw = pd.read_csv("train/2016_Financial_Data.csv")
data_2016_raw["Year"] = "2016"
data_2016_raw.rename(columns={"2017 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2016_raw["Identifier"] = data_2016_raw["Ticker"] + "-" + data_2016_raw["Year"]

data_2017_raw = pd.read_csv("train/2017_Financial_Data.csv")
data_2017_raw["Year"] = "2017"
data_2017_raw.rename(columns={"2018 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2017_raw["Identifier"] = data_2017_raw["Ticker"] + "-" + data_2017_raw["Year"]

data_2018_raw = pd.read_csv("train/2018_Financial_Data.csv")
data_2018_raw["Year"] = "2018"
data_2018_raw.rename(columns={"2019 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2018_raw["Identifier"] = data_2018_raw["Ticker"] + "-" + data_2018_raw["Year"]

data_2019_raw = pd.read_csv("test/2019_Financial_Data.csv")
data_2019_raw["Year"] = "2019"
data_2019_raw.rename(columns={"2019 PRICE VAR [%]":"Next Year Price Var"}, inplace=True)
data_2019_raw["Identifier"] = data_2019_raw["Ticker"] + "-" + data_2019_raw["Year"]
data_2019_raw = data_2019_raw.merge(data_2018_raw.loc[:,["Sector", "Ticker"]], on="Ticker", how="left")

In [3]:
# Based on the data that could be found on https://www.msn.com/en-ca/money/markets and also in 2019 data set
subset_to_keep = [
    "Ticker",
    "Revenue",
    "Revenue Growth",
    "Cost of Revenue",
    "Gross Profit",
    "SG&A Expense",
    # "R&D Expenses",
    "Operating Expenses",
    "Operating Income",
    "Net Income",
    # "Dividend per Share",
    "EPS",
    "EPS Diluted",
    "Cash and cash equivalents",
    "Receivables",
    # "Inventories",
    "Property, Plant & Equipment Net",
    # "Goodwill and Intangible Assets",
    "Total assets",
    "Payables",
    # "Total debt",
    "Total liabilities",
    # "Deferred revenue",
    # "Tax Liabilities",
    # "Other comprehensive income",
    "Retained earnings (deficit)",
    "Total shareholders equity",
    "Depreciation & Amortization",
    # "Stock-based compensation",
    "Operating Cash Flow",
    "Capital Expenditure",
    # "Acquisitions and disposals",
    # "Investment purchases and sales",
    "Investing Cash flow",
    # "Issuance (repayment) of debt",
    # "Issuance (buybacks) of shares",
    # "Dividend payments",
    "Financing Cash Flow",
    # "Effect of forex changes on cash",
    "Net cash flow / Change in cash",
    "Free Cash Flow",
    # "PE ratio",
    # "Debt to Equity",
    # "Interest Coverage",
    # "Dividend Yield",
    # "Payout Ratio",
    "ROE",
    "Sector",
    "Next Year Price Var",
    "Class",
    "Year",
    "Identifier"
]

In [4]:
# Keep only the most common data points and drop all rows with null values
pd.options.display.max_rows = 230
data_2014_orig = data_2014_raw.loc[:,subset_to_keep].dropna()
data_2015_orig = data_2015_raw.loc[:,subset_to_keep].dropna()
data_2016_orig = data_2016_raw.loc[:,subset_to_keep].dropna()
data_2017_orig = data_2017_raw.loc[:,subset_to_keep].dropna()
data_2018_orig = data_2018_raw.loc[:,subset_to_keep].dropna()
data_2019_orig = data_2019_raw.loc[:,subset_to_keep].dropna()

In [5]:
# The class in the following year is what to be predicted
# 2019 data is not included as we do not want to train model with data we want to predict
class_2014 = data_2014_orig["Ticker"].to_frame().merge(data_2015_orig[["Ticker","Class"]], on="Ticker", how="inner")
class_2015 = data_2015_orig["Ticker"].to_frame().merge(data_2016_orig[["Ticker","Class"]], on="Ticker", how="inner")
class_2016 = data_2016_orig["Ticker"].to_frame().merge(data_2017_orig[["Ticker","Class"]], on="Ticker", how="inner")
class_2017 = data_2017_orig["Ticker"].to_frame().merge(data_2018_orig[["Ticker","Class"]], on="Ticker", how="inner")
classes = [class_2014, class_2015, class_2016, class_2017]

Unnamed: 0,Ticker,Class
0,PG,1
1,VIPS,0
2,KR,0
3,RAD,1
4,GIS,1
...,...,...
3385,TRNS,1
3386,TSRI,1
3387,TZOO,1
3388,WSTG,1


In [18]:
# Keep only rows that have data in the following year and change Sector to categorical
data_2014 = data_2014_orig.merge(class_2014, on="Ticker", how="inner")
data_2015 = data_2015_orig.merge(class_2015, on="Ticker", how="inner")
data_2016 = data_2016_orig.merge(class_2016, on="Ticker", how="inner")
data_2017 = data_2017_orig.merge(class_2017, on="Ticker", how="inner")
data_list_all = [data_2014, data_2015, data_2016, data_2017]
for d in data_list_all:
    d.rename(columns={"Class_y": "Y", "Class_x": "Class"}, inplace=True)
    d["Sector"] = pd.Categorical(d["Sector"])
    d["Sector"] = d["Sector"].cat.codes
data_2014

Unnamed: 0,Ticker,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,SG&A Expense,Operating Expenses,Operating Income,Net Income,EPS,...,Financing Cash Flow,Net cash flow / Change in cash,Free Cash Flow,ROE,Sector,Next Year Price Var,Class,Year,Identifier,Y
0,PG,7.440100e+10,-0.0713,3.903000e+10,3.537100e+10,2.146100e+10,2.146100e+10,1.391000e+10,1.164300e+10,4.1900,...,-7.279000e+09,2.618000e+09,1.011000e+10,0.1664,3,-9.323276,0,2014,PG-2014,1
1,VIPS,3.734148e+09,1.1737,2.805625e+09,9.285226e+08,3.441414e+08,7.939267e+08,1.345959e+08,1.358227e+08,0.2396,...,6.219136e+08,4.463169e+08,2.630410e+08,0.3294,3,-25.512193,0,2014,VIPS-2014,0
2,KR,9.837500e+10,0.0182,7.813800e+10,2.023700e+10,1.519600e+10,1.751200e+10,2.725000e+09,1.519000e+09,1.4700,...,1.361000e+09,1.630000e+08,1.243000e+09,0.2821,3,33.118297,1,2014,KR-2014,0
3,RAD,2.552641e+10,0.0053,1.820268e+10,7.323734e+09,6.561162e+09,6.586482e+09,7.372520e+08,2.494140e+08,4.6000,...,-3.201680e+08,1.695400e+07,3.681760e+08,-0.1180,3,2.752291,1,2014,RAD-2014,1
4,GIS,1.790960e+10,0.0076,1.153980e+10,6.369800e+09,3.474300e+09,3.412400e+09,2.957400e+09,1.824400e+09,2.9000,...,-1.824100e+09,1.259000e+08,1.884100e+09,0.2792,3,12.897715,1,2014,GIS-2014,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3385,TRNS,1.185080e+08,0.0553,8.871800e+07,2.979000e+07,2.308500e+07,2.308500e+07,6.705000e+06,3.984000e+06,0.5600,...,-6.588000e+06,-3.830000e+05,5.900000e+06,0.1324,9,-2.453386,0,2014,TRNS-2014,1
3386,TSRI,4.952987e+07,0.1028,4.125164e+07,8.278229e+06,8.253061e+06,8.253061e+06,2.516800e+04,-8.593600e+04,-0.0400,...,-2.614700e+04,9.608060e+05,4.909530e+05,-0.0097,9,29.362884,1,2014,TSRI-2014,1
3387,TZOO,1.532400e+08,-0.1019,1.917400e+07,1.340660e+08,1.125130e+08,1.162560e+08,1.781000e+07,1.306200e+07,0.8800,...,-4.693000e+06,-1.325100e+07,-5.400000e+06,0.3646,9,-31.167763,0,2014,TZOO-2014,1
3388,WSTG,3.407580e+08,0.1344,3.159480e+08,2.481000e+07,1.651300e+07,1.651300e+07,8.297000e+06,5.760000e+06,1.2400,...,-1.495000e+06,3.515000e+06,5.434000e+06,0.1456,9,7.779579,1,2014,WSTG-2014,1


In [7]:
# Training data per year
X_2014 = data_2014.loc[:,"Revenue":"Sector"]
y_2014 = data_2014["Y"]
X_2015 = data_2015.loc[:,"Revenue":"Sector"]
y_2015 = data_2015["Y"]
X_2016 = data_2016.loc[:,"Revenue":"Sector"]
y_2016 = data_2016["Y"]
X_2017 = data_2017.loc[:,"Revenue":"Sector"]
y_2017 = data_2017["Y"]

In [8]:
# Training data combined
data_all = pd.concat([data_2014, data_2015, data_2016, data_2017], join="outer")
X = data_all.loc[:,"Revenue":"Sector"]
X["Identifier"] = data_all["Identifier"]
X.set_index("Identifier", inplace=True)
y = data_all["Y"]

In [32]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y)
fib = [1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368]

In [10]:
# X_train, X_valid, y_train, y_valid = train_test_split(X, y)

# GaussianNB
gnb = GaussianNB().fit(X_train, y_train)
print("GaussianNB score on validation: " + str(gnb.score(X_valid, y_valid)))
print("GaussianNB score on train: "+ str(gnb.score(X_train, y_train)))

GaussianNB score on validation: 0.4503415659485024
GaussianNB score on train: 0.4474905842165192


In [12]:
# KNeighborsClassifier
# KNeighborsClassifier parameters
num_neighbours = fib[:10]
knctrain = []
kncvalid = []

for n in fib:
    knc = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    knctrain.append(knc.score(X_train, y_train))
    kncvalid.append(knc.score(X_valid, y_valid))
#     print("KNeighborsClassifier score on validation with " + str(n) + ": " + str(knc.score(X_valid, y_valid)))
#     print("KNeighborsClassifier score on train with " + str(n) + ": "+ str(knc.score(X_train, y_train)))

In [14]:
knctrain

[1.0,
 0.7645616186388718,
 0.7681527546640974,
 0.7208548655513708,
 0.6852938600332837,
 0.6613821494262941,
 0.6466672505912237,
 0.6352807217307523,
 0.6251204344398704,
 0.6248576683892441,
 0.621616887098187,
 0.6209161776298502,
 0.6196023473767189,
 0.6156608566173251,
 0.6135587282123149,
 0.6104055356047998]

In [15]:
kncvalid

[0.5333683657383079,
 0.5060430898581187,
 0.5648975302154493,
 0.5759327377824488,
 0.5627955859169732,
 0.6019442984760904,
 0.6040462427745664,
 0.6108775617446137,
 0.6093011035207567,
 0.6142932212296374,
 0.6145559642669469,
 0.6163951655281135,
 0.6132422490803994,
 0.6150814503415659,
 0.6148187073042565,
 0.6129795060430898]

In [13]:
# KNeighborsClassifier with StandardScaler
# KNeighborsClassifier parameters
num_neighbours = fib[:10]
kncstrain = []
kncsvalid = []

for n in fib:
    kncs = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=n)).fit(X_train, y_train)
    kncstrain.append(kncs.score(X_train, y_train))
    kncsvalid.append(kncs.score(X_valid, y_valid))
    print(str(n) + kncs.score(X_train, y_train))
    print(str(n) + kncs.score(X_valid, y_valid))
#     print("KNeighborsClassifier (StandardScaled) score on validation with " + str(n) + ": " + str(knc.score(X_valid, y_valid)))
#     print("KNeighborsClassifier (StandardScaled) score on train with " + str(n) + ": "+ str(knc.score(X_train, y_train)))

In [22]:
nn_activiation = "logistic"
nn_solver = "lbfgs"
nn_layers = [(1), (5), (10), (5, 1), (10, 5), (10, 5, 1)]
nn_layers_str = []

In [23]:
for nn_nodes in nn_layers:
    model_nn = make_pipeline(
        MinMaxScaler(),
        MLPClassifier(hidden_layer_sizes=nn_nodes, activation=nn_activiation, solver=nn_solver, max_iter=100000)
    )
    model_nn.fit(X_train, y_train)
    print(round(model_nn.score(X_valid, y_valid), 4))

0.5638
0.6135
0.6004
0.5638
0.5638
0.5638


In [24]:
nn_solver = "adam"

In [25]:
for nn_nodes in nn_layers:
    model_nn = make_pipeline(
        MinMaxScaler(),
        MLPClassifier(hidden_layer_sizes=nn_nodes, activation=nn_activiation, solver=nn_solver, max_iter=100000)
    )
    model_nn.fit(X_train, y_train)
    print(round(model_nn.score(X_valid, y_valid), 4))

0.5638
0.5638
0.5638
0.5638
0.5638
0.5638


In [26]:
# Logistic Regression Parameters
lr_c = [0.0001, 0.001, 0.01, 1, 10]
lr_max_iter = [987, 1597, 2584, 6765, 10946, 17711, 28657]
# lr_pca_vals = [2, 5, 8, 15, 25]
lr_parameters = np.array(np.meshgrid(lr_c, lr_max_iter)).reshape(2, len(lr_c) * len(lr_max_iter)).T

In [29]:
for this_lr_params in lr_parameters:
    lr_row_label = "(" + str(this_lr_params[0]) + ", " + str(this_lr_params[1]) + ")"
    model_svc = make_pipeline(
        StandardScaler(),
        LogisticRegression(C=this_lr_params[0], class_weight='balanced', solver='saga', max_iter=this_lr_params[1], n_jobs=2)
    )
    print("Running with C="+str(this_lr_params[0])+" max_iter="+str(this_lr_params[1]))
    model_svc.fit(X_train, y_train)
    print(round(model_svc.score(X_valid, y_valid), 4))

Running with C=0.0001 max_iter=987.0
0.5302
Running with C=0.001 max_iter=987.0
0.5539
Running with C=0.01 max_iter=987.0
0.5554
Running with C=1.0 max_iter=987.0




0.557
Running with C=10.0 max_iter=987.0




0.557
Running with C=0.0001 max_iter=1597.0
0.5302
Running with C=0.001 max_iter=1597.0
0.5539
Running with C=0.01 max_iter=1597.0
0.5554
Running with C=1.0 max_iter=1597.0




0.557
Running with C=10.0 max_iter=1597.0




0.557
Running with C=0.0001 max_iter=2584.0
0.53
Running with C=0.001 max_iter=2584.0
0.5539
Running with C=0.01 max_iter=2584.0
0.5554
Running with C=1.0 max_iter=2584.0




0.557
Running with C=10.0 max_iter=2584.0




0.5565
Running with C=0.0001 max_iter=6765.0
0.5294
Running with C=0.001 max_iter=6765.0
0.5539
Running with C=0.01 max_iter=6765.0
0.5554
Running with C=1.0 max_iter=6765.0
0.5565
Running with C=10.0 max_iter=6765.0




0.5565
Running with C=0.0001 max_iter=10946.0
0.5292
Running with C=0.001 max_iter=10946.0
0.5539
Running with C=0.01 max_iter=10946.0
0.5554
Running with C=1.0 max_iter=10946.0
0.5568
Running with C=10.0 max_iter=10946.0
0.5565
Running with C=0.0001 max_iter=17711.0
0.5297
Running with C=0.001 max_iter=17711.0
0.5539
Running with C=0.01 max_iter=17711.0
0.5554
Running with C=1.0 max_iter=17711.0
0.5565
Running with C=10.0 max_iter=17711.0
0.5565
Running with C=0.0001 max_iter=28657.0
0.5297
Running with C=0.001 max_iter=28657.0
0.5539
Running with C=0.01 max_iter=28657.0
0.5554
Running with C=1.0 max_iter=28657.0
0.5565
Running with C=10.0 max_iter=28657.0
0.5565


In [30]:
num_neighbours = [3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597]
kncstrain = []
kncsvalid = []

for n in num_neighbours:
    kncs = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors=n)).fit(X_train, y_train)
    kncstrain.append(kncs.score(X_train, y_train))
    kncsvalid.append(kncs.score(X_valid, y_valid))
    print(str(n) + " train: " + str(kncs.score(X_train, y_train)) + " valid: " + str(kncs.score(X_valid, y_valid)))

3 train: 0.7746343172462118 valid: 0.5685759327377824
5 train: 0.7249715336778488 valid: 0.5803993694167104
8 train: 0.6951037925899973 valid: 0.5756699947451392
13 train: 0.6755715161601121 valid: 0.5927482921702575


KeyboardInterrupt: 

In [31]:
for n in num_neighbours:
    kncs = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=n)).fit(X_train, y_train)
    kncstrain.append(kncs.score(X_train, y_train))
    kncsvalid.append(kncs.score(X_valid, y_valid))
    print(str(n) + " train: " + str(kncs.score(X_train, y_train)) + " valid: " + str(kncs.score(X_valid, y_valid)))

3 train: 0.773846019094333 valid: 0.580136626379401
5 train: 0.7259350091968118 valid: 0.5895953757225434


KeyboardInterrupt: 