In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score 
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('DeltaBS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,SIRI,2019-12-01,2976.0,1895.0,1081.0,68.1,0.25,-1438.0,-0.30,1458.0,1406.0,1.44
1,NEWT,2015-12-01,50.6,13.1,37.5,4.3,-2.25,34.0,-2.22,7.6,24.1,1.87
2,ADM,2019-12-01,3164.0,2926.0,238.0,-2.2,0.52,-1208.0,-2.08,1512.0,2654.0,5.38
3,PXD,2017-12-01,544.0,-324.0,868.0,0.5,4.94,872.0,4.96,-481.0,-25.0,-7.22
4,TROW,2018-12-01,153.9,106.5,47.4,-8.1,1.96,299.9,1.88,0.0,477.5,-12.61
5,PRU,2013-12-01,22546.0,25777.0,-3231.0,-2.0,-6.62,-3160.0,-6.49,3913.0,10574.0,38.89
6,LEG,2017-12-01,566.7,469.9,96.8,-3.5,0.84,63.8,0.49,291.9,47.7,-1.15
7,ADS,2012-12-01,3019.9,2667.4,352.5,-0.1,7.12,-148.4,-3.15,2387.1,1710.0,40.92
8,PPG,2018-12-01,-523.0,417.0,-940.0,-14.0,-2.49,-982.0,-4.28,870.0,1398.0,-14.59
9,VZ,2015-12-01,11559.0,7393.0,4166.0,-81.6,1.07,-9660.0,-2.82,-3699.0,2384.0,-0.56


In [3]:
prices = data['Price_Change'] 
data = data.drop(['Company', 'Time', 'Price_Change'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [4]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price_Change', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,0.567761,0.674890,0.524838,0.209157,0.001012,0.478161,0.156527,0.660160,0.609477,1.44
1,0.558627,0.667503,0.518225,0.206439,0.000961,0.488115,0.156493,0.649944,0.600695,1.87
2,0.568348,0.678937,0.519496,0.206162,0.001017,0.479716,0.156496,0.660541,0.617408,5.38
3,0.560168,0.666180,0.523488,0.206277,0.001108,0.493781,0.156620,0.646502,0.600383,-7.22
4,0.558950,0.667870,0.518288,0.205911,0.001047,0.489913,0.156566,0.649890,0.603576,-12.61
5,0.628862,0.768633,0.497513,0.206171,0.000871,0.466516,0.156418,0.677453,0.667741,38.89
6,0.560238,0.669296,0.518601,0.206107,0.001024,0.488316,0.156541,0.651947,0.600845,-1.15
7,0.567898,0.677922,0.520221,0.206252,0.001152,0.486881,0.156477,0.666705,0.611409,40.92
8,0.556836,0.669089,0.512031,0.205659,0.000956,0.481244,0.156457,0.656019,0.609426,-14.59
9,0.594558,0.696471,0.544388,0.202780,0.001028,0.422563,0.156483,0.623835,0.615692,-0.56


In [5]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.56776064 0.67489009 0.52483793 ... 0.15652723 0.66016046 0.60947678]
 [0.55862704 0.66750314 0.51822526 ... 0.1564935  0.649944   0.60069462]
 [0.56834761 0.67893704 0.51949583 ... 0.15649596 0.66054083 0.61740799]
 ...
 [0.56725797 0.67263699 0.52745512 ... 0.1565369  0.65582847 0.60690295]
 [0.5811891  0.69430837 0.52074422 ... 0.15652372 0.67007121 0.60685211]
 [0.56018282 0.67036623 0.51676141 ... 0.15630548 0.65148239 0.60218744]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[  1.44   1.87   5.38 ...   5.83  77.77 334.81]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 1.44
Total number of y values 4500


In [6]:
params_RF = {'n_estimators':range(1,30),
            'max_depth' : range(1,16),
            }
mae = make_scorer(mean_absolute_error, greater_is_better= False)

RF = RandomForestRegressor()
gs_modelRF = GridSearchCV(RF, param_grid=params_RF, scoring =mae, cv=5)
grid_results = gs_modelRF.fit(X, y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_modelRF.best_params_)
print("Best score on Test Data", gs_modelRF.best_score_)
print("Optimal Configuration", gs_modelRF.best_estimator_)

Best Parameters {'max_depth': 5, 'n_estimators': 27}
Best score on Test Data -15.849494821746067
Optimal Configuration RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=27,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


In [7]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-16.16036217 -16.40312473 -16.64992888 -16.60918414 -16.67880392
 -16.37538168 -16.78090812 -16.604801   -16.5781927  -16.69477431
 -16.68020018 -16.45742797 -16.46919506 -16.58453107 -16.61834508
 -16.57818599 -16.54349135 -16.52638442 -16.52893387 -16.52035725
 -16.45868563 -16.4837698  -16.76194269 -16.64405141 -16.57106331
 -16.60732193 -16.65514313 -16.69616479 -16.54399763 -16.98815552
 -16.92137297 -17.10911084 -16.52970458 -16.19375821 -16.45229865
 -16.91362302 -16.87039542 -16.54312873 -16.5341327  -16.39267381
 -16.57355386 -16.29227154 -16.48865751 -16.61793328 -17.0120926
 -16.86369144 -16.54993232 -16.21513786 -16.34657367 -16.56849672
 -16.37185063 -16.51226187 -16.39275925 -16.37956355 -16.5452982
 -16.57126993 -16.26111275 -16.53510259 -17.30596615 -16.81998233
 -16.55608785 -16.44083406 -16.83606342 -16.27036241 -16.87907466
 -16.38669593 -16.20617412 -16.31402338 -16.62396761 -16.48091325
 -16.03656469 -16.46036061 -16.53800303 -16.56190731 -16.31067072
 -16.4

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006071,0.001794,0.001599,1.958929e-03,1,1,"{'max_depth': 1, 'n_estimators': 1}",-13.867580,-16.637306,-16.542277,-18.021797,-15.732851,-16.160362,1.362196,18
1,0.012188,0.006203,0.001600,1.959219e-03,1,2,"{'max_depth': 1, 'n_estimators': 2}",-14.922636,-17.009519,-16.097678,-18.118757,-15.867034,-16.403125,1.084771,144
2,0.009808,0.002251,0.000202,4.034042e-04,1,3,"{'max_depth': 1, 'n_estimators': 3}",-15.021825,-16.981022,-17.108297,-18.054534,-16.083966,-16.649929,1.026029,265
3,0.012785,0.001602,0.000798,1.596832e-03,1,4,"{'max_depth': 1, 'n_estimators': 4}",-15.248454,-16.935864,-16.498427,-18.320069,-16.043107,-16.609184,1.021581,246
4,0.012781,0.001594,0.001598,1.956648e-03,1,5,"{'max_depth': 1, 'n_estimators': 5}",-15.108888,-17.339095,-16.944299,-17.671346,-16.330392,-16.678804,0.903066,279
5,0.012773,0.001605,0.000810,1.619911e-03,1,6,"{'max_depth': 1, 'n_estimators': 6}",-15.195233,-16.813684,-16.089892,-17.948078,-15.830021,-16.375382,0.942036,132
6,0.014381,0.001957,0.000799,1.597214e-03,1,7,"{'max_depth': 1, 'n_estimators': 7}",-15.377045,-16.727670,-18.129288,-17.758363,-15.912175,-16.780908,1.048991,306
7,0.017565,0.001952,0.000799,1.597977e-03,1,8,"{'max_depth': 1, 'n_estimators': 8}",-15.224766,-16.657182,-17.331775,-18.321540,-15.488743,-16.604801,1.151391,243
8,0.019181,0.001601,0.001598,1.957114e-03,1,9,"{'max_depth': 1, 'n_estimators': 9}",-15.336021,-16.850933,-17.078952,-17.810351,-15.814706,-16.578193,0.890984,231
9,0.024760,0.004655,0.000000,0.000000e+00,1,10,"{'max_depth': 1, 'n_estimators': 10}",-14.958777,-16.971169,-17.469038,-18.046523,-16.028365,-16.694774,1.091869,284


In [8]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
142,0.219824,2.321309e-03,0.001976,2.379155e-05,5,27,"{'max_depth': 5, 'n_estimators': 27}",-14.463839,-15.308561,-16.577945,-17.236022,-15.661107,-15.849495,0.969664,1
119,0.036751,1.597166e-03,0.000799,1.598358e-03,5,4,"{'max_depth': 5, 'n_estimators': 4}",-14.297767,-16.032889,-16.542047,-17.237278,-15.820882,-15.986173,0.975139,2
70,0.071910,7.986651e-03,0.000799,1.597881e-03,3,13,"{'max_depth': 3, 'n_estimators': 13}",-14.695090,-16.152938,-16.431476,-17.387009,-15.516310,-16.036565,0.901490,3
257,0.359638,2.720296e-03,0.002793,3.983498e-04,9,26,"{'max_depth': 9, 'n_estimators': 26}",-14.384085,-16.036436,-17.109023,-17.397874,-15.432197,-16.071923,1.103905,4
93,0.063542,1.233532e-02,0.000202,4.045486e-04,4,7,"{'max_depth': 4, 'n_estimators': 7}",-14.442686,-15.243175,-17.588770,-17.739252,-15.345891,-16.071955,1.337847,5
130,0.125433,3.195620e-03,0.000000,0.000000e+00,5,15,"{'max_depth': 5, 'n_estimators': 15}",-14.543177,-16.668344,-16.004139,-17.112816,-16.081442,-16.081984,0.869531,6
153,0.086967,1.715878e-03,0.000998,2.431402e-07,6,9,"{'max_depth': 6, 'n_estimators': 9}",-14.406669,-15.769166,-16.932343,-17.680126,-15.632042,-16.084069,1.129967,7
198,0.274000,1.690439e-03,0.001986,1.597255e-05,7,25,"{'max_depth': 7, 'n_estimators': 25}",-14.245855,-15.778597,-17.342149,-17.288910,-15.791781,-16.089458,1.148219,8
116,0.010394,1.963544e-03,0.000792,1.583385e-03,5,1,"{'max_depth': 5, 'n_estimators': 1}",-15.043789,-16.931237,-15.076322,-17.302011,-16.127329,-16.096138,0.927357,9
128,0.119041,1.113691e-02,0.000799,1.598072e-03,5,13,"{'max_depth': 5, 'n_estimators': 13}",-14.289947,-15.875686,-16.741975,-17.807272,-15.819707,-16.106917,1.160331,10
