In [4]:
import os
import pandas as pd
import statsmodels.api as sm

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [6]:
market = pd.read_csv(os.path.join('resources', 'cleaned_data.csv')).drop(columns=['Unnamed: 0'])

In [27]:
indicators = pd.DataFrame({
   'Indicator': market.columns.tolist()
})

filt = indicators[indicators['Indicator'].str.contains('Growth|growth|Margin|ratio|Ratio|Turnover|per|/| / | to |Per|return|To|days|Days') == False]

cond = filt['Indicator'].to_list()

# LOGISTIC CLASSIFICATION MODELS

## Full Dataset: 200+ predictors

In [77]:
y = market['Class']

In [78]:
x = market.drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,-0.123,0.0,15926000.0,0.0,22534000.0,22534000.0,-6608000.0,0.0,-10501000.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,0.0245,132650000.0,52903000.0,0.0,39042000.0,51837000.0,1066000.0,2037000.0,-971000.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,-0.3004,179692000.0,87773000.0,49903000.0,48638000.0,98541000.0,-10768000.0,1106000.0,-9778000.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,0.0709,330414000.0,103533000.0,0.0,47755000.0,47743000.0,55790000.0,0.0,55939000.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,0.0504,121455000.0,209412000.0,0.0,22784000.0,129877000.0,79535000.0,52248000.0,19686000.0,...,0,0,0,0,0,0,0,1,0,0


In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

classifier = LogisticRegression()

classifier.fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")



Training Data Score: 0.691559569294894
Testing Data Score: 0.6833333333333333




In [80]:
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.09      0.16       298
           1       0.70      0.95      0.81       662

    accuracy                           0.68       960
   macro avg       0.58      0.52      0.48       960
weighted avg       0.62      0.68      0.60       960



## Feature Selection: KBest method

In [75]:
selector = SelectKBest(f_classif, k=12)

X_new = selector.fit_transform(X, y)

selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=X.index, 
                                 columns=X.columns)
selected_features.head()

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
X_final = selected_features[selected_columns]

  f = msb / msw


In [76]:
X_final.head()

Unnamed: 0,Operating Income,Earnings before Tax,Net Income,Net Income Com,Dividend per Share,EBIT,Consolidated Income,Market Cap,Enterprise Value,5Y Dividend per Share Growth (per Share),Sector_Financial Services,Sector_Healthcare
0,-6608000.0,-10501000.0,-10876000.0,-11082000.0,0.0,-10501000.0,-10876000.0,48483620.0,6546622.0,0.0,1.0,0.0
1,1066000.0,-971000.0,-704000.0,-1103000.0,0.02,1066000.0,-704000.0,48680350.0,69788350.0,0.0,1.0,0.0
2,-10768000.0,-9778000.0,-2146000.0,-2146000.0,0.0,-8672000.0,-2146000.0,305493400.0,336034400.0,0.0,0.0,0.0
3,55790000.0,55939000.0,42572000.0,42572000.0,0.32,55939000.0,42572000.0,1826434000.0,1824413000.0,0.2358,0.0,0.0
4,79535000.0,19686000.0,19686000.0,19686000.0,1.09,71934000.0,27202000.0,1896913000.0,3130413000.0,0.051,0.0,0.0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, random_state=1, stratify=y)

classifier = LogisticRegression()

classifier.fit(X_train, y_train)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [28]:
y_pred = classifier.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.12      0.20       298
           1       0.71      0.95      0.81       662

    accuracy                           0.70       960
   macro avg       0.63      0.54      0.51       960
weighted avg       0.66      0.70      0.62       960



# LINEAR REGRESSION MODELS

In [11]:
y = market["2019 PRICE VAR [%]"]

In [28]:
x = market[cond].drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Interest Expense,Earnings before Tax,Income Tax Expense,Net Income - Non-Controlling int,Net Income - Discontinued ops,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,0.0,15926000.0,0.0,22534000.0,0.0,-10501000.0,375000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,132650000.0,52903000.0,0.0,39042000.0,2037000.0,-971000.0,-267000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,179692000.0,87773000.0,49903000.0,48638000.0,1106000.0,-9778000.0,-7632000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,330414000.0,103533000.0,0.0,47755000.0,0.0,55939000.0,13367000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,121455000.0,209412000.0,0.0,22784000.0,52248000.0,19686000.0,0.0,7516000.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [30]:
reg = sm.OLS(y_train, X_train).fit()

reg.summary()

0,1,2,3
Dep. Variable:,2019 PRICE VAR [%],R-squared:,0.045
Model:,OLS,Adj. R-squared:,0.017
Method:,Least Squares,F-statistic:,1.604
Date:,"Mon, 11 May 2020",Prob (F-statistic):,0.000507
Time:,20:12:34,Log-Likelihood:,-15647.0
No. Observations:,2879,AIC:,31460.0
Df Residuals:,2795,BIC:,31960.0
Df Model:,83,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Revenue,-1.545e-07,1.19e-07,-1.297,0.195,-3.88e-07,7.91e-08
Cost of Revenue,1.546e-07,1.19e-07,1.298,0.195,-7.9e-08,3.88e-07
Gross Profit,1.544e-07,1.19e-07,1.296,0.195,-7.92e-08,3.88e-07
R&D Expenses,-2.375e-09,2.83e-09,-0.841,0.401,-7.91e-09,3.17e-09
SG&A Expense,-1.353e-10,1.02e-09,-0.133,0.894,-2.13e-09,1.86e-09
Interest Expense,-1.368e-07,7.59e-08,-1.801,0.072,-2.86e-07,1.21e-08
Earnings before Tax,3.913e-08,4.69e-08,0.835,0.404,-5.28e-08,1.31e-07
Income Tax Expense,-1.771e-07,8.7e-08,-2.035,0.042,-3.48e-07,-6.49e-09
Net Income - Non-Controlling int,9.812e-09,2.16e-08,0.454,0.650,-3.26e-08,5.22e-08

0,1,2,3
Omnibus:,2338.632,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,142839.019
Skew:,3.399,Prob(JB):,0.0
Kurtosis:,36.831,Cond. No.,1.28e+16


In [24]:
selector = SelectKBest(f_classif, k=20)

X_new = selector.fit_transform(X, y)

selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=X.index, 
                                 columns=X.columns)
selected_features.head()

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
X_final = selected_features[selected_columns]

  f = msb / msw
  f = msb / msw


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, random_state=1)

In [26]:
reg = sm.OLS(y_train, X_train).fit()

reg.summary()

0,1,2,3
Dep. Variable:,2019 PRICE VAR [%],R-squared (uncentered):,0.051
Model:,OLS,Adj. R-squared (uncentered):,0.045
Method:,Least Squares,F-statistic:,9.002
Date:,"Mon, 11 May 2020",Prob (F-statistic):,3.11e-23
Time:,20:08:02,Log-Likelihood:,-15808.0
No. Observations:,2879,AIC:,31650.0
Df Residuals:,2862,BIC:,31750.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Preferred Dividends,1.012e-08,2.57e-08,0.393,0.694,-4.03e-08,6.06e-08
Dividend per Share,3.2773,0.853,3.840,0.000,1.604,4.951
priceFairValue,-1.891e-07,6.25e-07,-0.303,0.762,-1.41e-06,1.04e-06
operatingCashFlowPerShare,-4.79e-06,8.73e-06,-0.549,0.583,-2.19e-05,1.23e-05
cashPerShare,1.304e-06,1.79e-06,0.730,0.466,-2.2e-06,4.81e-06
payoutRatio,0.0653,0.092,0.709,0.478,-0.115,0.246
Revenue per Share,3.295e-07,1.39e-06,0.237,0.813,-2.4e-06,3.06e-06
Operating Cash Flow per Share,-4.79e-06,8.73e-06,-0.549,0.583,-2.19e-05,1.23e-05
Cash per Share,1.304e-06,1.79e-06,0.730,0.466,-2.2e-06,4.81e-06

0,1,2,3
Omnibus:,2382.73,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,144791.298
Skew:,3.507,Prob(JB):,0.0
Kurtosis:,37.027,Cond. No.,2.04e+22


# RANDOM FOREST REGRESSION MODELS

In [52]:
y = market["2019 PRICE VAR [%]"]

In [53]:
x = market[cond].drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Interest Expense,Earnings before Tax,Income Tax Expense,Net Income - Non-Controlling int,Net Income - Discontinued ops,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,0.0,15926000.0,0.0,22534000.0,0.0,-10501000.0,375000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,132650000.0,52903000.0,0.0,39042000.0,2037000.0,-971000.0,-267000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,179692000.0,87773000.0,49903000.0,48638000.0,1106000.0,-9778000.0,-7632000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,330414000.0,103533000.0,0.0,47755000.0,0.0,55939000.0,13367000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,121455000.0,209412000.0,0.0,22784000.0,52248000.0,19686000.0,0.0,7516000.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [62]:
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

In [63]:
regr = RandomForestRegressor(max_depth=2, random_state=0)

In [64]:
regr.fit(X_train_scaled, y_train_scaled)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [66]:
regr.score(X_train_scaled, y_train_scaled)

0.09451597177295899

In [48]:
regr.predict(X_train.loc[2256].values.reshape(1,-1))

array([0.804159])