In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [54]:
#data = pd.read_csv('BS_train_data.csv')
#data

In [3]:
data = pd.read_csv('DeltaBS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,SIRI,2019-12-01,2976.0,1895.0,1081.0,68.1,0.25,-1438.0,-0.30,1458.0,1406.0,1.44
1,NEWT,2015-12-01,50.6,13.1,37.5,4.3,-2.25,34.0,-2.22,7.6,24.1,1.87
2,ADM,2019-12-01,3164.0,2926.0,238.0,-2.2,0.52,-1208.0,-2.08,1512.0,2654.0,5.38
3,PXD,2017-12-01,544.0,-324.0,868.0,0.5,4.94,872.0,4.96,-481.0,-25.0,-7.22
4,TROW,2018-12-01,153.9,106.5,47.4,-8.1,1.96,299.9,1.88,0.0,477.5,-12.61
5,PRU,2013-12-01,22546.0,25777.0,-3231.0,-2.0,-6.62,-3160.0,-6.49,3913.0,10574.0,38.89
6,LEG,2017-12-01,566.7,469.9,96.8,-3.5,0.84,63.8,0.49,291.9,47.7,-1.15
7,ADS,2012-12-01,3019.9,2667.4,352.5,-0.1,7.12,-148.4,-3.15,2387.1,1710.0,40.92
8,PPG,2018-12-01,-523.0,417.0,-940.0,-14.0,-2.49,-982.0,-4.28,870.0,1398.0,-14.59
9,VZ,2015-12-01,11559.0,7393.0,4166.0,-81.6,1.07,-9660.0,-2.82,-3699.0,2384.0,-0.56


In [5]:
prices = data['Price_Change'] 
data = data.drop(['Company', 'Time', 'Price_Change'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [6]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price_Change', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,0.567761,0.674890,0.524838,0.209157,0.001012,0.478161,0.156527,0.660160,0.609477,1.44
1,0.558627,0.667503,0.518225,0.206439,0.000961,0.488115,0.156493,0.649944,0.600695,1.87
2,0.568348,0.678937,0.519496,0.206162,0.001017,0.479716,0.156496,0.660541,0.617408,5.38
3,0.560168,0.666180,0.523488,0.206277,0.001108,0.493781,0.156620,0.646502,0.600383,-7.22
4,0.558950,0.667870,0.518288,0.205911,0.001047,0.489913,0.156566,0.649890,0.603576,-12.61
5,0.628862,0.768633,0.497513,0.206171,0.000871,0.466516,0.156418,0.677453,0.667741,38.89
6,0.560238,0.669296,0.518601,0.206107,0.001024,0.488316,0.156541,0.651947,0.600845,-1.15
7,0.567898,0.677922,0.520221,0.206252,0.001152,0.486881,0.156477,0.666705,0.611409,40.92
8,0.556836,0.669089,0.512031,0.205659,0.000956,0.481244,0.156457,0.656019,0.609426,-14.59
9,0.594558,0.696471,0.544388,0.202780,0.001028,0.422563,0.156483,0.623835,0.615692,-0.56


In [7]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.56776064 0.67489009 0.52483793 ... 0.15652723 0.66016046 0.60947678]
 [0.55862704 0.66750314 0.51822526 ... 0.1564935  0.649944   0.60069462]
 [0.56834761 0.67893704 0.51949583 ... 0.15649596 0.66054083 0.61740799]
 ...
 [0.56725797 0.67263699 0.52745512 ... 0.1565369  0.65582847 0.60690295]
 [0.5811891  0.69430837 0.52074422 ... 0.15652372 0.67007121 0.60685211]
 [0.56018282 0.67036623 0.51676141 ... 0.15630548 0.65148239 0.60218744]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[  1.44   1.87   5.38 ...   5.83  77.77 334.81]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 1.44
Total number of y values 4500


In [59]:
#X_train, X_test, y_train, y_test = train_test_split(X , y, test_size = 0.2, random_state = 42) # 42, 56, 66, 87

In [8]:
# Hyperparameter Grid Search
#criterion = {'mse', 'friedman_mse', 'mae'}
#'criterion' : ['mse', 'mae'],
#  'criterion' : {'mse', 'friedman_mse', 'mae'}
params = {
        'splitter' : ['best', 'random'],
          'max_depth': range(1,50), 
         }

decision_tree = DecisionTreeRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

#gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=10)
gs_decision_tree = GridSearchCV(decision_tree, params, scoring = mae,  cv=5)

grid_results = gs_decision_tree.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_decision_tree.best_params_)
print("Best score on Test Data", gs_decision_tree.best_score_)
print("Optimal Configuration", gs_decision_tree.best_estimator_)

Best Parameters {'max_depth': 10, 'splitter': 'random'}
Best score on Test Data -16.171624884688924
Optimal Configuration DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='random')


In [11]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-16.68447533 -16.35672173 -16.8040424  -16.23600132 -16.93804795
 -16.43916355 -16.96460135 -16.96565695 -16.71649752 -16.82863854
 -16.63247825 -17.45300447 -17.07467513 -17.22776289 -17.09658483
 -17.0914029  -17.32546761 -18.49118639 -18.18608234 -16.17162488
 -18.01278174 -17.9232883  -18.48810216 -16.76384429 -18.94811432
 -17.40821695 -19.43343804 -18.29208692 -19.58690995 -17.70404915
 -19.94443898 -19.10170131 -20.37709961 -18.28316151 -20.6285827
 -18.83036    -21.63625968 -19.7512209  -21.80090921 -19.09152201
 -22.00494515 -20.26752966 -22.20279734 -21.63079177 -22.58186283
 -20.55872872 -22.48909454 -21.8596042  -22.94346545 -20.89217611
 -22.99204826 -22.57336964 -23.2387983  -21.03977005 -23.56692596
 -21.84509303 -23.19998577 -22.78736326 -23.68268722 -22.76727059
 -23.76318368 -24.48213611 -23.80982889 -22.91699594 -24.17904344
 -23.79015262 -24.05020449 -23.01142437 -23.90111591 -23.73004179
 -23.41575309 -24.63279724 -23.75524317 -23.34541493 -23.76291424
 -23.

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005798,2.216013e-03,0.000209,0.000418,1,best,"{'max_depth': 1, 'splitter': 'best'}",-15.112594,-16.815807,-17.417591,-18.089769,-15.986616,-16.684475,1.047605,6
1,0.000000,0.000000e+00,0.000805,0.001610,1,random,"{'max_depth': 1, 'splitter': 'random'}",-15.027562,-16.923165,-15.965262,-18.032779,-15.834840,-16.356722,1.031726,3
2,0.007188,1.604920e-03,0.000000,0.000000,2,best,"{'max_depth': 2, 'splitter': 'best'}",-14.824508,-16.887039,-17.322924,-18.674982,-16.310759,-16.804042,1.260034,9
3,0.001600,1.959809e-03,0.000000,0.000000,2,random,"{'max_depth': 2, 'splitter': 'random'}",-14.650314,-16.819645,-16.029155,-17.941504,-15.739388,-16.236001,1.100290,2
4,0.008233,4.588898e-04,0.000800,0.001600,3,best,"{'max_depth': 3, 'splitter': 'best'}",-15.351332,-17.061975,-17.800305,-19.124322,-15.352305,-16.938048,1.454021,11
5,0.000799,1.597786e-03,0.000000,0.000000,3,random,"{'max_depth': 3, 'splitter': 'random'}",-14.655196,-17.169504,-16.206736,-18.201255,-15.963128,-16.439164,1.191766,4
6,0.010386,1.955907e-03,0.000000,0.000000,4,best,"{'max_depth': 4, 'splitter': 'best'}",-15.112909,-16.751966,-16.952173,-18.909588,-17.096371,-16.964601,1.206190,12
7,0.001598,1.956957e-03,0.000000,0.000000,4,random,"{'max_depth': 4, 'splitter': 'random'}",-15.470376,-17.102577,-16.221973,-20.425766,-15.607592,-16.965657,1.823424,13
8,0.013582,1.955888e-03,0.000000,0.000000,5,best,"{'max_depth': 5, 'splitter': 'best'}",-14.637897,-16.614486,-16.696670,-18.245811,-17.387624,-16.716498,1.193364,7
9,0.000799,1.597404e-03,0.000799,0.001598,5,random,"{'max_depth': 5, 'splitter': 'random'}",-15.214384,-16.844666,-17.747457,-18.250762,-16.085924,-16.828639,1.097667,10


In [12]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
19,0.002396,1.955986e-03,0.001599,0.001959,10,random,"{'max_depth': 10, 'splitter': 'random'}",-14.806451,-17.234353,-16.940543,-16.193376,-15.683402,-16.171625,0.874432,1
3,0.001600,1.959809e-03,0.000000,0.000000,2,random,"{'max_depth': 2, 'splitter': 'random'}",-14.650314,-16.819645,-16.029155,-17.941504,-15.739388,-16.236001,1.100290,2
1,0.000000,0.000000e+00,0.000805,0.001610,1,random,"{'max_depth': 1, 'splitter': 'random'}",-15.027562,-16.923165,-15.965262,-18.032779,-15.834840,-16.356722,1.031726,3
5,0.000799,1.597786e-03,0.000000,0.000000,3,random,"{'max_depth': 3, 'splitter': 'random'}",-14.655196,-17.169504,-16.206736,-18.201255,-15.963128,-16.439164,1.191766,4
10,0.015978,2.617832e-06,0.000000,0.000000,6,best,"{'max_depth': 6, 'splitter': 'best'}",-14.570736,-16.857306,-17.031868,-18.182712,-16.519769,-16.632478,1.172755,5
0,0.005798,2.216013e-03,0.000209,0.000418,1,best,"{'max_depth': 1, 'splitter': 'best'}",-15.112594,-16.815807,-17.417591,-18.089769,-15.986616,-16.684475,1.047605,6
8,0.013582,1.955888e-03,0.000000,0.000000,5,best,"{'max_depth': 5, 'splitter': 'best'}",-14.637897,-16.614486,-16.696670,-18.245811,-17.387624,-16.716498,1.193364,7
23,0.003993,2.603959e-05,0.000000,0.000000,12,random,"{'max_depth': 12, 'splitter': 'random'}",-14.997936,-17.336235,-18.543211,-16.521556,-16.420284,-16.763844,1.165242,8
2,0.007188,1.604920e-03,0.000000,0.000000,2,best,"{'max_depth': 2, 'splitter': 'best'}",-14.824508,-16.887039,-17.322924,-18.674982,-16.310759,-16.804042,1.260034,9
9,0.000799,1.597404e-03,0.000799,0.001598,5,random,"{'max_depth': 5, 'splitter': 'random'}",-15.214384,-16.844666,-17.747457,-18.250762,-16.085924,-16.828639,1.097667,10
