In [27]:
# loading data
# `preprocessing` is a project-local module; extractData() is unpacked below
# into four values: the feature matrix, the target, and their column labels.
import preprocessing as loader

X,y,keys_X,keys_y = loader.extractData()

# Sanity-check what was loaded: both X and y expose a .shape, and the key
# lists name the input and output columns.
# NOTE(review): the saved output of this cell shows 'minorityInterest' twice
# in keys_X -- confirm whether the duplicated column is intended.
print("DATA LOADED")
print("Input Shape:",X.shape)
print("Output Shape:", y.shape)
print("Input Keys:",keys_X)
print("Output Keys:",keys_y)

DATA LOADED
Input Shape: (505, 78)
Output Shape: (505, 1)
Input Keys: ['Recent_price', 'PER', 'PSR', 'PBR', 'PEG', 'forPER', 'Beta', 'AnnualDividendRate', 'ROE(%)', 'ROA(%)', 'ProfitMargin(%)', 'TotalCash', 'TotalDebt', 'intangibleAssets', 'capitalSurplus', 'totalLiab', 'totalStockholderEquity', 'otherCurrentLiab', 'totalAssets', 'commonStock', 'otherCurrentAssets', 'retainedEarnings', 'otherLiab', 'goodWill', 'treasuryStock', 'otherAssets', 'cash', 'totalCurrentLiabilities', 'otherStockholderEquity', 'propertyPlantEquipment', 'totalCurrentAssets', 'longTermInvestments', 'netTangibleAssets', 'netReceivables', 'longTermDebt', 'inventory', 'accountsPayable', 'deferredLongTermAssetCharges', 'shortLongTermDebt', 'shortTermInvestments', 'minorityInterest', 'researchDevelopment', 'incomeBeforeTax', 'minorityInterest', 'netIncome', 'sellingGeneralAdministrative', 'grossProfit', 'ebit', 'operatingIncome', 'otherOperatingExpenses', 'interestExpense', 'incomeTaxExpense', 'totalRevenue', 'totalOp

In [6]:
from models.lasso.lasso import LassoRegression
from models.mlr.mlr import MLR
from models.tree.tree import Tree
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [7]:
def standardize(X_train, X_test, y_train, y_test):
    """
    Z-score the features and the target with StandardScaler.

    The scaling statistics are learned from the training split only and then
    applied unchanged to the test split, for the features and for the target
    independently.

    Parameters
    ----------
    X_train, X_test : 2-D array-like
        Training / testing feature matrices.
    y_train, y_test : 1-D array-like
        Training / testing target values.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) after scaling; the two target
        arrays are returned flattened to 1-D.
    """
    # features: learn the statistics on the training rows, reuse them on test
    feature_scaler = StandardScaler()
    X_train_scaled = feature_scaler.fit_transform(X_train)
    X_test_scaled = feature_scaler.transform(X_test)

    # target: StandardScaler expects 2-D input, so go through a single column
    # and flatten back afterwards
    y_train_col = np.array(y_train).reshape(-1, 1)
    y_test_col = np.array(y_test).reshape(-1, 1)
    target_scaler = StandardScaler()
    y_train_scaled = target_scaler.fit_transform(y_train_col).reshape(-1)
    y_test_scaled = target_scaler.transform(y_test_col).reshape(-1)

    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled


In [8]:
# k-fold train/test split: rows are shuffled with a fixed seed so the folds
# are the same on every run
n_splits = 20
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# one list per model; each fold appends a (features, score) tuple, where the
# score is whatever the model's error() returns (described in the original
# notes as an r2 score)
model_res = {name: [] for name in ("MLR", "Lasso", "Tree")}

# iterate over the k-fold train/test splits
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # select this fold's rows; targets are flattened to 1-D
    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index].reshape(-1), y[test_index].reshape(-1)

    # scale features and target with statistics learned on the training rows
    X_train, X_test, y_train, y_test = standardize(X_train, X_test, y_train, y_test)

    # run each model on the fold and record its features and score
    mlr = MLR(X_train, X_test, y_train, y_test, keys_X)
    model_res["MLR"].append((mlr.best_features, mlr.error()))

    lasso = LassoRegression(X_train, X_test, y_train, y_test, keys_X)
    model_res["Lasso"].append((lasso.nonzero_coef_features, lasso.error()))

    tree = Tree(X_train, X_test, y_train, y_test, keys_X)
    model_res["Tree"].append((tree.best_features, tree.error()))

    

In [9]:
# Lasso: take the fold with the highest recorded test score and show the
# nonzero-coefficient features the model reported for that fold.
# NOTE(review): this is the best single fold, so the printed score is an
# optimistic summary -- consider also reporting the mean across folds.
best_lasso = max(model_res["Lasso"], key=lambda fold_result: fold_result[1])
lasso_best_features = best_lasso[0]
print(f"Lasso best features with testing score: {best_lasso[1]}")
print(lasso_best_features)

Lasso best features with testing score: 0.9553878227693062
['researchDevelopment', 'repurchaseOfStock', 'totalCashFromOperatingActivities', 'netIncomeFromContinuingOps', 'commonStock', 'grossProfit', 'ebit', 'changeToLiabilities', 'changeToAccountReceivables']


In [10]:
# Multiple linear regression: take the fold with the highest recorded test
# score and show the best_features list the model reported for that fold.
# NOTE(review): this is the best single fold, so the printed score is an
# optimistic summary -- consider also reporting the mean across folds.
best_mlr = max(model_res["MLR"], key=lambda fold_result: fold_result[1])
mlr_best_features = best_mlr[0]
print(f"Multiple linear regression best features with testing score: {best_mlr[1]}")
print(mlr_best_features)

Multiple linear regression best features with testing score: 0.9539033705052715
['operatingIncome', 'netIncomeFromContinuingOps', 'incomeBeforeTax', 'totalCurrentLiabilities', 'totalOtherIncomeExpenseNet', 'totalOperatingExpenses', 'totalAssets', 'incomeTaxExpense', 'costOfRevenue', 'totalRevenue', 'accountsPayable', 'grossProfit', 'otherLiab', 'totalCashFromFinancingActivities', 'ebit', 'totalCashflowsFromInvestingActivities', 'totalLiab', 'changeInCash', 'otherAssets', 'netIncomeApplicableToCommonShares', 'totalCashFromOperatingActivities', 'cash', 'netIncome', 'otherCurrentLiab', 'longTermDebt', 'totalCurrentAssets', 'changeToNetincome', 'retainedEarnings', 'treasuryStock', 'otherStockholderEquity', 'netBorrowings', 'propertyPlantEquipment', 'otherCashflowsFromFinancingActivities', 'sellingGeneralAdministrative', 'capitalExpenditures', 'shortTermInvestments', 'effectOfExchangeRate', 'forPER', 'intangibleAssets', 'commonStock', 'netReceivables', 'changeToLiabilities', 'goodWill', 'in

In [11]:
# Tree model (labelled "Random forest" in the printout): take the fold with
# the highest recorded test score and show the best_features list the model
# reported for that fold.
# NOTE(review): this is the best single fold, so the printed score is an
# optimistic summary -- consider also reporting the mean across folds.
best_tree = max(model_res["Tree"], key=lambda fold_result: fold_result[1])
tree_best_features = best_tree[0]
print(f"Random forest best features with testing score: {best_tree[1]}")
print(tree_best_features)

Random forest best features with testing score: 0.9769733713422849
['incomeBeforeTax', 'totalCashFromOperatingActivities', 'netIncomeApplicableToCommonShares', 'ebit', 'researchDevelopment', 'repurchaseOfStock', 'commonStock', 'incomeTaxExpense', 'totalStockholderEquity', 'totalRevenue', 'grossProfit', 'netIncomeFromContinuingOps', 'totalCashFromFinancingActivities', 'netIncome', 'netTangibleAssets', 'capitalExpenditures', 'operatingIncome', 'effectOfExchangeRate', 'PBR', 'forPER', 'totalCashflowsFromInvestingActivities', 'changeToNetincome', 'changeToAccountReceivables', 'cash', 'netBorrowings', 'PSR', 'Recent_price', 'depreciation', 'goodWill', 'treasuryStock', 'dividendsPaid', 'totalOperatingExpenses', 'retainedEarnings', 'totalOtherIncomeExpenseNet', 'sellingGeneralAdministrative', 'ROE(%)', 'costOfRevenue', 'totalCurrentAssets', 'shortTermInvestments', 'ROA(%)', 'netReceivables', 'otherCurrentLiab', 'PER', 'totalAssets', 'otherCurrentAssets', 'otherStockholderEquity', 'capitalSurp

In [26]:
# Features that appear in the first `top_n` entries of all three lists
# (multiple linear regression, tree model, and lasso).
# NOTE(review): assumes each list is ordered by importance -- confirm in the
# model classes. A list shorter than top_n simply contributes all its entries.
top_n = 15
set(mlr_best_features[:top_n]) & set(tree_best_features[:top_n]) & set(lasso_best_features[:top_n])



{'ebit', 'grossProfit', 'netIncomeFromContinuingOps'}

In [13]:
# first 10 entries of the multiple linear regression feature list from its best-scoring fold
# NOTE(review): "most important" assumes best_features is ordered by importance -- confirm in MLR
mlr_best_features[:10]

['operatingIncome',
 'netIncomeFromContinuingOps',
 'incomeBeforeTax',
 'totalCurrentLiabilities',
 'totalOtherIncomeExpenseNet',
 'totalOperatingExpenses',
 'totalAssets',
 'incomeTaxExpense',
 'costOfRevenue',
 'totalRevenue']

In [14]:
# up to 10 entries of the lasso nonzero-coefficient feature list from its best-scoring fold
# (the saved output shows only 9 features, so the slice returns the whole list)
# NOTE(review): "most important" assumes the list is ordered by importance -- confirm in LassoRegression
lasso_best_features[:10]

['researchDevelopment',
 'repurchaseOfStock',
 'totalCashFromOperatingActivities',
 'netIncomeFromContinuingOps',
 'commonStock',
 'grossProfit',
 'ebit',
 'changeToLiabilities',
 'changeToAccountReceivables']

In [15]:
# first 10 entries of the tree model (random forest) feature list from its best-scoring fold
# NOTE(review): "most important" assumes best_features is ordered by importance -- confirm in Tree
tree_best_features[:10]

['incomeBeforeTax',
 'totalCashFromOperatingActivities',
 'netIncomeApplicableToCommonShares',
 'ebit',
 'researchDevelopment',
 'repurchaseOfStock',
 'commonStock',
 'incomeTaxExpense',
 'totalStockholderEquity',
 'totalRevenue']