In [2]:
import os
import pandas as pd
import statsmodels.api as sm

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [4]:
market = pd.read_csv(os.path.join('resources', 'cleaned_data.csv')).drop(columns=['Unnamed: 0'])

In [5]:
indicators = pd.DataFrame({
   'Indicator': market.columns.tolist()
})

filt = indicators[indicators['Indicator'].str.contains('Growth|growth|Margin|ratio|Ratio|Turnover|per|/| / | to |Per|return|To|days|Days') == False]

cond = filt['Indicator'].to_list()

# LOGISTIC CLASSIFICATION MODELS

## Full Dataset: 200+ predictors

In [47]:
y = market['Class']

In [48]:
x = market.drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,Earnings before Tax,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,-0.123,0.0,15926000.0,0.0,22534000.0,22534000.0,-6608000.0,0.0,-10501000.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,0.0245,132650000.0,52903000.0,0.0,39042000.0,51837000.0,1066000.0,2037000.0,-971000.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,-0.3004,179692000.0,87773000.0,49903000.0,48638000.0,98541000.0,-10768000.0,1106000.0,-9778000.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,0.0709,330414000.0,103533000.0,0.0,47755000.0,47743000.0,55790000.0,0.0,55939000.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,0.0504,121455000.0,209412000.0,0.0,22784000.0,129877000.0,79535000.0,52248000.0,19686000.0,...,0,0,0,0,0,0,0,1,0,0


In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

classifier = LogisticRegression()

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
print(' ')
print(classification_report(y_test, y_pred))



Training Data Score: 0.691559569294894
Testing Data Score: 0.6833333333333333
 
              precision    recall  f1-score   support

           0       0.45      0.09      0.16       298
           1       0.70      0.95      0.81       662

    accuracy                           0.68       960
   macro avg       0.58      0.52      0.48       960
weighted avg       0.62      0.68      0.60       960





In [39]:
report = classification_report(y_test, y_pred, output_dict=True)

## Feature Selection: KBest method

In [63]:
selector = SelectKBest(f_classif, k=18)

X_new = selector.fit_transform(X, y)

selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=X.index, 
                                 columns=X.columns)
selected_features.head()

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
X_final = selected_features[selected_columns]

  f = msb / msw


In [65]:
X_final.head()

Unnamed: 0,Gross Profit,Operating Income,Earnings before Tax,Net Income,Net Income Com,Dividend per Share,EBITDA,EBIT,Consolidated Income,Dividend payments,Market Cap,Enterprise Value,5Y Dividend per Share Growth (per Share),3Y Dividend per Share Growth (per Share),Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Real Estate
0,15926000.0,-6608000.0,-10501000.0,-10876000.0,-11082000.0,0.0,-10065000.0,-10501000.0,-10876000.0,0.0,48483620.0,6546622.0,0.0,0.0,0.0,1.0,0.0,0.0
1,52903000.0,1066000.0,-971000.0,-704000.0,-1103000.0,0.02,2053000.0,1066000.0,-704000.0,-407000.0,48680350.0,69788350.0,0.0,0.0,0.0,1.0,0.0,0.0
2,87773000.0,-10768000.0,-9778000.0,-2146000.0,-2146000.0,0.0,21026000.0,-8672000.0,-2146000.0,0.0,305493400.0,336034400.0,0.0,0.0,0.0,0.0,0.0,0.0
3,103533000.0,55790000.0,55939000.0,42572000.0,42572000.0,0.32,73594000.0,55939000.0,42572000.0,-16728000.0,1826434000.0,1824413000.0,0.2358,0.133,0.0,0.0,0.0,0.0
4,209412000.0,79535000.0,19686000.0,19686000.0,19686000.0,1.09,179027000.0,71934000.0,27202000.0,-70228000.0,1896913000.0,3130413000.0,0.051,0.0473,0.0,0.0,0.0,1.0


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, random_state=1, stratify=y)

classifier = LogisticRegression()

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")
print(' ')
print(classification_report(y_test, y_pred))

Training Data Score: 0.7089267106634248
Testing Data Score: 0.7072916666666667
 
              precision    recall  f1-score   support

           0       0.59      0.18      0.28       298
           1       0.72      0.94      0.82       662

    accuracy                           0.71       960
   macro avg       0.66      0.56      0.55       960
weighted avg       0.68      0.71      0.65       960





# LINEAR REGRESSION MODELS

In [None]:
y = market["2019 PRICE VAR [%]"]

In [None]:
x = market[cond].drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
reg = sm.OLS(y_train, X_train).fit()

reg.summary()

In [None]:
selector = SelectKBest(f_classif, k=20)

X_new = selector.fit_transform(X, y)

selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=X.index, 
                                 columns=X.columns)
selected_features.head()

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
X_final = selected_features[selected_columns]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, random_state=1)

In [None]:
reg = sm.OLS(y_train, X_train).fit()

reg.summary()

# RANDOM FOREST REGRESSION MODELS

In [6]:
y = market["2019 PRICE VAR [%]"]

In [7]:
x = market[cond].drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Interest Expense,Earnings before Tax,Income Tax Expense,Net Income - Non-Controlling int,Net Income - Discontinued ops,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,0.0,15926000.0,0.0,22534000.0,0.0,-10501000.0,375000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,132650000.0,52903000.0,0.0,39042000.0,2037000.0,-971000.0,-267000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,179692000.0,87773000.0,49903000.0,48638000.0,1106000.0,-9778000.0,-7632000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,330414000.0,103533000.0,0.0,47755000.0,0.0,55939000.0,13367000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,121455000.0,209412000.0,0.0,22784000.0,52248000.0,19686000.0,0.0,7516000.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

In [10]:
regr = RandomForestRegressor(random_state=42)

regr.fit(X_train_scaled, y_train_scaled)

regr.score(X_train_scaled, y_train_scaled)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7935565627496997

In [54]:
regr.feature_importances_;

In [26]:
y = market["2019 PRICE VAR [%]"]

In [27]:
x = market[cond].drop(columns={'Ticker','Company Name','2019 PRICE VAR [%]','Class','Market Cap Size'})

X = pd.get_dummies(x)

X.head()

Unnamed: 0,Revenue,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Interest Expense,Earnings before Tax,Income Tax Expense,Net Income - Non-Controlling int,Net Income - Discontinued ops,...,Sector_Communication Services,Sector_Consumer Cyclical,Sector_Consumer Defensive,Sector_Energy,Sector_Financial Services,Sector_Healthcare,Sector_Industrials,Sector_Real Estate,Sector_Technology,Sector_Utilities
0,15926000.0,0.0,15926000.0,0.0,22534000.0,0.0,-10501000.0,375000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,185553000.0,132650000.0,52903000.0,0.0,39042000.0,2037000.0,-971000.0,-267000.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,267465000.0,179692000.0,87773000.0,49903000.0,48638000.0,1106000.0,-9778000.0,-7632000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,433947000.0,330414000.0,103533000.0,0.0,47755000.0,0.0,55939000.0,13367000.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,330867000.0,121455000.0,209412000.0,0.0,22784000.0,52248000.0,19686000.0,0.0,7516000.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
selector = SelectKBest(f_classif, k=40)

X_new = selector.fit_transform(X, y)

selected_features = pd.DataFrame(selector.inverse_transform(X_new), 
                                 index=X.index, 
                                 columns=X.columns)
selected_features.head()

# Dropped columns have values of all 0s, so var is 0, drop them
selected_columns = selected_features.columns[selected_features.var() != 0]

# Get the valid dataset with the selected features.
X_final = selected_features[selected_columns]

X_final.head()

  f = msb / msw


Unnamed: 0,Gross Profit,Income Tax Expense,Net Income - Non-Controlling int,Net Income - Discontinued ops,Preferred Dividends,EPS,EPS Diluted,Cash and cash equivalents,Short-term investments,Cash and short-term investments,...,Tangible Asset Value,Net Current Asset Value,Invested Capital,Average Payables,ROE,Sector_Communication Services,Sector_Consumer Defensive,Sector_Financial Services,Sector_Industrials,Sector_Utilities
0,15926000.0,375000.0,0.0,0.0,206000.0,-6.88,-6.88,27171000.0,14766000.0,41937000.0,...,49367000.0,40686000.0,15425000.0,1636500.0,0.0525,0.0,0.0,1.0,0.0,0.0
1,52903000.0,-267000.0,0.0,0.0,399000.0,-0.05,-0.05,12630000.0,0.0,12630000.0,...,341730000.0,-188939000.0,362838000.0,11171000.0,-0.0069,0.0,0.0,1.0,0.0,0.0
2,87773000.0,-7632000.0,0.0,0.0,0.0,-0.11,-0.11,58004000.0,0.0,58004000.0,...,457049000.0,56529000.0,410172000.0,36767000.0,-0.0065,0.0,0.0,0.0,0.0,0.0
3,103533000.0,13367000.0,0.0,0.0,0.0,0.81,0.81,1994000.0,27000.0,2021000.0,...,304462000.0,80163000.0,254397000.0,10791500.0,0.172,0.0,0.0,0.0,0.0,0.0
4,209412000.0,0.0,7516000.0,0.0,0.0,0.42,0.42,57272000.0,0.0,57272000.0,...,2198250000.0,-1276718000.0,3431750000.0,42342500.0,0.0245,0.0,0.0,0.0,0.0,0.0


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X_final, y, random_state=42)

In [30]:
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train.values.reshape(-1, 1))

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))

In [31]:
regr = RandomForestRegressor(random_state=42)

regr.fit(X_train_scaled, y_train_scaled)

regr.score(X_test_scaled, y_test_scaled)

  This is separate from the ipykernel package so we can avoid doing imports until


-0.4848046820661356

In [34]:
regr.feature_importances_;