In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score 
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('BS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,RHI,2013-12-01,1490.3,570.6,919.6,135.3,6.80,718.3,5.31,1.4,-274.3,41.99
1,RCL,2011-12-01,19804.4,11396.6,8407.8,217.5,38.74,7442.4,34.29,8507.2,8245.0,24.77
2,NVDA,2020-01-01,17315.0,5111.0,12204.0,612.0,19.93,11537.0,18.84,2643.0,-8254.0,236.43
3,STC,2012-12-01,1291.2,710.8,580.4,21.1,29.31,263.5,13.58,71.2,-137.4,26.00
4,XRAY,2010-12-01,3258.0,1348.0,1909.9,142.1,12.98,457.6,3.23,786.6,246.6,34.17
5,GWW,2016-12-01,5694.3,3788.5,1905.8,58.8,30.57,684.7,11.64,2247.1,1972.9,232.25
6,SKM,2015-12-01,24288.4,11223.5,13064.9,70.6,183.55,9386.2,132.93,7039.9,5790.9,20.15
7,BAX,2011-12-01,19073.0,12245.0,6828.0,560.3,11.74,3442.0,6.14,5195.0,2290.0,26.88
8,MAR,2012-12-01,6342.0,7627.0,-1285.0,312.3,-4.13,-3274.0,-10.53,2935.0,2847.0,37.27
9,SJM,2019-04-01,16711.3,8740.8,7970.5,113.7,70.08,-5059.2,-44.48,5959.9,5858.6,122.63


In [3]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [4]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,0.001638,0.000683,0.028169,0.001691,0.000362,0.289974,0.001341,0.000003,0.185569,41.99
1,0.021935,0.013682,0.045186,0.002725,0.000484,0.305348,0.001492,0.017775,0.200646,24.77
2,0.019176,0.006134,0.053813,0.007687,0.000412,0.314710,0.001411,0.005522,0.171446,236.43
3,0.001418,0.000851,0.027398,0.000254,0.000448,0.288934,0.001384,0.000149,0.185811,26.00
4,0.003597,0.001616,0.030420,0.001776,0.000385,0.289378,0.001330,0.001644,0.186491,34.17
5,0.006298,0.004546,0.030410,0.000728,0.000453,0.289897,0.001374,0.004695,0.189546,232.25
6,0.026904,0.013474,0.055770,0.000877,0.001038,0.309792,0.002006,0.014709,0.196303,20.15
7,0.021124,0.014700,0.041596,0.007037,0.000380,0.296202,0.001345,0.010855,0.190107,26.88
8,0.007015,0.009155,0.023159,0.003917,0.000320,0.280846,0.001258,0.006132,0.191093,37.27
9,0.018507,0.010493,0.044193,0.001419,0.000604,0.276765,0.001081,0.012453,0.196423,122.63


In [5]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[1.63843386e-03 6.82611560e-04 2.81693214e-02 ... 1.34060792e-03
  2.92521072e-06 1.85568632e-01]
 [2.19349988e-02 1.36816489e-02 4.51864494e-02 ... 1.49174313e-03
  1.77752519e-02 2.00645972e-01]
 [1.91761262e-02 6.13437854e-03 5.38134120e-02 ... 1.41116898e-03
  5.52237995e-03 1.71446269e-01]
 ...
 [6.79067888e-03 4.97604015e-03 3.06086506e-02 ... 1.34050362e-03
  4.01380699e-03 1.89340224e-01]
 [2.35835177e-03 1.36510305e-03 2.83540777e-02 ... 1.34947368e-03
  1.52340795e-03 1.86742708e-01]
 [7.21358632e-04 8.08087212e-04 2.60515545e-02 ... 1.31145520e-03
  1.21500717e-03 1.86722887e-01]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 5000


[ 41.99  24.77 236.43 ...  49.01   8.07  34.69]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 41.99
Total number of y values 5000


In [7]:
params_RF = {'n_estimators':range(1,30),
            'max_depth' : range(1,15),
            }
mae = make_scorer(mean_absolute_error, greater_is_better= False)

RF = RandomForestRegressor()
gs_modelRF = GridSearchCV(RF, param_grid=params_RF, scoring =mae, cv=5)
grid_results = gs_modelRF.fit(X, y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_modelRF.best_params_)
print("Best score on Test Data", gs_modelRF.best_score_)
print("Optimal Configuration", gs_modelRF.best_estimator_)

Best Parameters {'max_depth': 14, 'n_estimators': 26}
Best score on Test Data -30.931235554723884
Optimal Configuration RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=14,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=26,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)


In [8]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-52.68428475 -52.14814314 -52.15120429 -52.58417933 -52.81312778
 -52.49813479 -51.93244851 -52.20784226 -51.88001058 -52.20509957
 -52.21277788 -51.88244867 -51.75472392 -51.9021576  -51.71188438
 -52.10633147 -51.68965099 -51.77522155 -51.73362148 -52.0479306
 -52.06782207 -52.02790348 -51.99900343 -52.01420003 -51.93401094
 -52.10477844 -51.98174266 -51.86038079 -51.9267666  -49.07060706
 -47.13191524 -47.38282894 -46.2208889  -47.27166643 -47.35844447
 -46.57768901 -46.78927689 -46.68336711 -46.23659877 -46.70442714
 -47.04422635 -47.01845848 -46.40978127 -46.74573988 -46.62192945
 -46.71289001 -46.55525797 -46.80607875 -46.32138465 -46.36887728
 -46.2057837  -46.523543   -46.65215915 -46.67541341 -46.18775024
 -46.70712178 -46.62741914 -46.83968083 -45.32170634 -44.7389874
 -44.42357445 -43.2660323  -43.09669804 -43.21648017 -43.80788297
 -42.5694354  -43.0553293  -43.30613854 -42.91187684 -43.23766365
 -42.63336508 -42.6261694  -42.64865912 -43.18144364 -42.74249801
 -42.9

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004684,0.000406,0.000799,3.995431e-04,1,1,"{'max_depth': 1, 'n_estimators': 1}",-59.059965,-50.155737,-52.030746,-51.558888,-50.616087,-52.684285,3.256192,405
1,0.006579,0.001338,0.000000,0.000000e+00,1,2,"{'max_depth': 1, 'n_estimators': 2}",-55.793010,-51.928020,-51.570916,-50.499101,-50.949669,-52.148143,1.887982,398
2,0.009175,0.001163,0.000600,4.902275e-04,1,3,"{'max_depth': 1, 'n_estimators': 3}",-54.958990,-51.340606,-51.208110,-51.647314,-51.601001,-52.151204,1.413279,399
3,0.011369,0.000489,0.000599,4.888895e-04,1,4,"{'max_depth': 1, 'n_estimators': 4}",-57.453674,-50.965316,-51.292777,-50.912140,-52.296990,-52.584179,2.485100,404
4,0.012765,0.000399,0.000200,3.993034e-04,1,5,"{'max_depth': 1, 'n_estimators': 5}",-56.160265,-55.092334,-51.021044,-50.875981,-50.916015,-52.813128,2.322122,406
5,0.014967,0.000631,0.000599,4.891618e-04,1,6,"{'max_depth': 1, 'n_estimators': 6}",-55.774873,-51.292781,-52.292958,-51.282963,-51.847099,-52.498135,1.681219,403
6,0.018939,0.002897,0.001197,9.780897e-04,1,7,"{'max_depth': 1, 'n_estimators': 7}",-55.925500,-50.983853,-51.090432,-50.383631,-51.278826,-51.932449,2.018878,388
7,0.023138,0.000745,0.000999,6.323352e-04,1,8,"{'max_depth': 1, 'n_estimators': 8}",-56.164213,-52.015454,-50.961264,-50.819494,-51.078786,-52.207842,2.022183,401
8,0.023137,0.001933,0.001197,3.989227e-04,1,9,"{'max_depth': 1, 'n_estimators': 9}",-55.037808,-51.611253,-51.324686,-50.459059,-50.967246,-51.880011,1.625087,384
9,0.026514,0.003569,0.000997,3.162980e-07,1,10,"{'max_depth': 1, 'n_estimators': 10}",-56.299013,-51.114125,-52.055062,-50.424670,-51.132628,-52.205100,2.111539,400


In [9]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
402,0.615601,0.019664,0.006184,1.937393e-03,14,26,"{'max_depth': 14, 'n_estimators': 26}",-32.694133,-32.277660,-31.124715,-29.536958,-29.022711,-30.931236,1.452210,1
405,0.674724,0.021118,0.005585,5.040851e-04,14,29,"{'max_depth': 14, 'n_estimators': 29}",-32.982215,-33.160927,-31.418668,-28.889920,-28.528412,-30.996028,1.966478,2
399,0.529224,0.016169,0.004588,4.891121e-04,14,23,"{'max_depth': 14, 'n_estimators': 23}",-33.111084,-32.711268,-31.669174,-29.838634,-27.835895,-31.033211,1.958965,3
370,0.517451,0.021259,0.004776,3.939603e-04,13,23,"{'max_depth': 13, 'n_estimators': 23}",-32.456177,-32.775372,-31.416893,-29.378832,-29.189453,-31.043346,1.506192,4
393,0.381778,0.013960,0.003381,4.794114e-04,14,17,"{'max_depth': 14, 'n_estimators': 17}",-32.777218,-34.396784,-30.952342,-29.117700,-28.130989,-31.075007,2.299786,5
397,0.477325,0.019546,0.004383,4.732575e-04,14,21,"{'max_depth': 14, 'n_estimators': 21}",-31.910308,-32.391553,-32.066587,-30.120001,-28.932455,-31.084181,1.335395,6
394,0.420084,0.017396,0.003790,3.984276e-04,14,18,"{'max_depth': 14, 'n_estimators': 18}",-32.715662,-33.062771,-30.266957,-30.984019,-28.721076,-31.150097,1.601061,7
398,0.507546,0.011750,0.004595,4.959736e-04,14,22,"{'max_depth': 14, 'n_estimators': 22}",-32.663544,-33.260458,-31.803407,-29.724312,-28.674543,-31.225253,1.749511,8
403,0.616878,0.010835,0.004998,2.017838e-05,14,27,"{'max_depth': 14, 'n_estimators': 27}",-32.710183,-34.205317,-31.328153,-29.803476,-28.263633,-31.262152,2.091995,9
401,0.566620,0.012703,0.004981,6.159634e-04,14,25,"{'max_depth': 14, 'n_estimators': 25}",-33.076522,-33.580101,-31.627627,-29.527473,-28.880885,-31.338522,1.868089,10
