In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [54]:
#data = pd.read_csv('BS_train_data.csv')
#data

In [2]:
data = pd.read_csv('BS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,RHI,2013-12-01,1490.3,570.6,919.6,135.3,6.80,718.3,5.31,1.4,-274.3,41.99
1,RCL,2011-12-01,19804.4,11396.6,8407.8,217.5,38.74,7442.4,34.29,8507.2,8245.0,24.77
2,NVDA,2020-01-01,17315.0,5111.0,12204.0,612.0,19.93,11537.0,18.84,2643.0,-8254.0,236.43
3,STC,2012-12-01,1291.2,710.8,580.4,21.1,29.31,263.5,13.58,71.2,-137.4,26.00
4,XRAY,2010-12-01,3258.0,1348.0,1909.9,142.1,12.98,457.6,3.23,786.6,246.6,34.17
5,GWW,2016-12-01,5694.3,3788.5,1905.8,58.8,30.57,684.7,11.64,2247.1,1972.9,232.25
6,SKM,2015-12-01,24288.4,11223.5,13064.9,70.6,183.55,9386.2,132.93,7039.9,5790.9,20.15
7,BAX,2011-12-01,19073.0,12245.0,6828.0,560.3,11.74,3442.0,6.14,5195.0,2290.0,26.88
8,MAR,2012-12-01,6342.0,7627.0,-1285.0,312.3,-4.13,-3274.0,-10.53,2935.0,2847.0,37.27
9,SJM,2019-04-01,16711.3,8740.8,7970.5,113.7,70.08,-5059.2,-44.48,5959.9,5858.6,122.63


In [3]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [4]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,0.001638,0.000683,0.028169,0.001691,0.000362,0.289974,0.001341,0.000003,0.185569,41.99
1,0.021935,0.013682,0.045186,0.002725,0.000484,0.305348,0.001492,0.017775,0.200646,24.77
2,0.019176,0.006134,0.053813,0.007687,0.000412,0.314710,0.001411,0.005522,0.171446,236.43
3,0.001418,0.000851,0.027398,0.000254,0.000448,0.288934,0.001384,0.000149,0.185811,26.00
4,0.003597,0.001616,0.030420,0.001776,0.000385,0.289378,0.001330,0.001644,0.186491,34.17
5,0.006298,0.004546,0.030410,0.000728,0.000453,0.289897,0.001374,0.004695,0.189546,232.25
6,0.026904,0.013474,0.055770,0.000877,0.001038,0.309792,0.002006,0.014709,0.196303,20.15
7,0.021124,0.014700,0.041596,0.007037,0.000380,0.296202,0.001345,0.010855,0.190107,26.88
8,0.007015,0.009155,0.023159,0.003917,0.000320,0.280846,0.001258,0.006132,0.191093,37.27
9,0.018507,0.010493,0.044193,0.001419,0.000604,0.276765,0.001081,0.012453,0.196423,122.63


In [5]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[1.63843386e-03 6.82611560e-04 2.81693214e-02 ... 1.34060792e-03
  2.92521072e-06 1.85568632e-01]
 [2.19349988e-02 1.36816489e-02 4.51864494e-02 ... 1.49174313e-03
  1.77752519e-02 2.00645972e-01]
 [1.91761262e-02 6.13437854e-03 5.38134120e-02 ... 1.41116898e-03
  5.52237995e-03 1.71446269e-01]
 ...
 [6.79067888e-03 4.97604015e-03 3.06086506e-02 ... 1.34050362e-03
  4.01380699e-03 1.89340224e-01]
 [2.35835177e-03 1.36510305e-03 2.83540777e-02 ... 1.34947368e-03
  1.52340795e-03 1.86742708e-01]
 [7.21358632e-04 8.08087212e-04 2.60515545e-02 ... 1.31145520e-03
  1.21500717e-03 1.86722887e-01]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 5000


[ 41.99  24.77 236.43 ...  49.01   8.07  34.69]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 41.99
Total number of y values 5000


In [59]:
#X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.2, random_state = 42) # 42, 56, 66, 87

In [15]:
# Hyperparameter Grid Search
#criterion = {'mse', 'friedman_mse', 'mae'}
#'criterion' : ['mse', 'mae'],
#  'criterion' : {'mse', 'friedman_mse', 'mae'}
params = {
        'splitter' : ['best', 'random'],
          'max_depth': range(1,50), 
         }

decision_tree = DecisionTreeRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

#gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=10)
gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=5)

grid_results = gs_decision_tree.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_decision_tree.best_params_)
print("Best score on Test Data", gs_decision_tree.best_score_)
print("Optimal Configuration", gs_decision_tree.best_estimator_)

Best Parameters {'max_depth': 31, 'splitter': 'random'}
Best score on Test Data -35.633826285353535
Optimal Configuration DecisionTreeRegressor(criterion='mse', max_depth=31, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='random')


In [16]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-52.0593448  -53.89190507 -47.90914647 -52.94858869 -44.45502217
 -52.63701243 -42.46833264 -49.83716789 -41.49497662 -49.21468923
 -39.16219369 -48.09079863 -38.59976642 -45.50178167 -37.07921717
 -42.66827282 -37.93954768 -42.63958749 -37.51395069 -42.25320966
 -37.63374348 -40.19094027 -36.75301009 -41.8664704  -37.05789881
 -39.09923153 -37.63457012 -39.04887248 -37.20884497 -37.50710572
 -36.47211272 -37.46915799 -37.00921543 -37.48729754 -37.20877602
 -37.09957203 -37.27063382 -37.63807877 -36.86204121 -38.67701016
 -36.95887657 -36.65789897 -37.09754131 -38.1835967  -37.76266099
 -36.60029761 -36.27762641 -36.07867589 -37.60574923 -37.76520954
 -36.90019015 -36.99734125 -38.0625264  -39.92469689 -37.18608006
 -36.90754728 -36.93898059 -37.43619893 -36.61740979 -37.13037705
 -36.59955547 -35.63382629 -37.70150658 -36.76289694 -37.63577857
 -37.20414912 -37.15455367 -36.13648855 -37.1334445  -39.35767451
 -37.41113333 -38.31128823 -37.003144   -39.70322417 -37.15283
 -40.24

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006391,0.001947,0.000799,0.001597,1,best,"{'max_depth': 1, 'splitter': 'best'}",-55.129839,-51.620158,-51.564011,-50.614833,-51.367882,-52.059345,1.576763,95
1,0.000804,0.001608,0.000000,0.000000,1,random,"{'max_depth': 1, 'splitter': 'random'}",-58.414363,-52.669605,-53.390251,-50.789003,-54.196303,-53.891905,2.526617,98
2,0.009575,0.001940,0.000798,0.001596,2,best,"{'max_depth': 2, 'splitter': 'best'}",-51.106068,-49.303116,-46.549301,-46.543572,-46.043677,-47.909146,1.967608,91
3,0.001817,0.001852,0.000000,0.000000,2,random,"{'max_depth': 2, 'splitter': 'random'}",-58.265749,-52.800773,-49.550091,-50.928387,-53.197943,-52.948589,2.966049,97
4,0.011794,0.000325,0.000216,0.000432,3,best,"{'max_depth': 3, 'splitter': 'best'}",-48.305462,-46.208562,-41.133522,-43.215587,-43.411977,-44.455022,2.512262,89
5,0.000799,0.001598,0.000799,0.001598,3,random,"{'max_depth': 3, 'splitter': 'random'}",-56.043639,-50.834157,-53.516888,-52.641412,-50.148967,-52.637012,2.089017,96
6,0.017583,0.001955,0.000000,0.000000,4,best,"{'max_depth': 4, 'splitter': 'best'}",-43.790142,-45.974540,-40.849615,-39.628453,-42.098913,-42.468333,2.229902,86
7,0.001598,0.001957,0.000000,0.000000,4,random,"{'max_depth': 4, 'splitter': 'random'}",-48.417452,-49.050655,-51.932371,-50.680366,-49.104997,-49.837168,1.286167,94
8,0.015972,0.000013,0.000000,0.000000,5,best,"{'max_depth': 5, 'splitter': 'best'}",-43.951607,-44.578061,-41.572662,-37.950452,-39.422100,-41.494977,2.545857,83
9,0.001605,0.001966,0.000000,0.000000,5,random,"{'max_depth': 5, 'splitter': 'random'}",-50.601363,-50.261169,-53.213041,-46.083569,-45.914303,-49.214689,2.817845,93


In [17]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
61,0.010375,0.001948,0.000000,0.000000,31,random,"{'max_depth': 31, 'splitter': 'random'}",-37.671490,-38.761220,-37.471023,-35.844778,-28.420620,-35.633826,3.725126,1
47,0.007977,0.000025,0.000000,0.000000,24,random,"{'max_depth': 24, 'splitter': 'random'}",-36.026072,-37.778568,-37.636733,-36.514715,-32.437292,-36.078676,1.937511,2
85,0.013602,0.001958,0.001588,0.001944,43,random,"{'max_depth': 43, 'splitter': 'random'}",-39.146050,-31.799470,-40.408770,-33.150900,-36.073450,-36.115728,3.317684,3
67,0.014006,0.001799,0.000798,0.001596,34,random,"{'max_depth': 34, 'splitter': 'random'}",-36.714000,-35.607860,-35.840737,-33.653580,-38.866266,-36.136489,1.692156,4
46,0.043138,0.001598,0.000799,0.001598,24,best,"{'max_depth': 24, 'splitter': 'best'}",-36.260980,-41.049051,-37.309766,-32.113320,-34.655015,-36.277626,2.959946,5
97,0.012786,0.001595,0.000809,0.001618,49,random,"{'max_depth': 49, 'splitter': 'random'}",-39.517730,-37.588840,-35.325730,-37.362980,-32.326530,-36.424362,2.441522,6
30,0.037045,0.001542,0.001598,0.001957,16,best,"{'max_depth': 16, 'splitter': 'best'}",-35.620686,-42.440937,-35.529169,-33.274392,-35.495380,-36.472113,3.111924,7
87,0.011991,0.000035,0.000799,0.001598,44,random,"{'max_depth': 44, 'splitter': 'random'}",-40.254580,-34.156900,-36.411370,-37.412800,-34.301890,-36.507508,2.247641,8
91,0.011981,0.000028,0.000792,0.001584,46,random,"{'max_depth': 46, 'splitter': 'random'}",-38.399130,-34.552650,-37.185830,-37.089110,-35.374670,-36.520278,1.376996,9
94,0.052194,0.004330,0.000800,0.001599,48,best,"{'max_depth': 48, 'splitter': 'best'}",-37.151830,-42.424280,-36.457590,-31.733740,-34.935280,-36.540544,3.483709,10
