In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score 
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold

In [2]:
data = pd.read_csv('BS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,RHI,2013-12-01,1490.3,570.6,919.6,135.3,6.80,718.3,5.31,1.4,-274.3,41.99
1,RCL,2011-12-01,19804.4,11396.6,8407.8,217.5,38.74,7442.4,34.29,8507.2,8245.0,24.77
2,NVDA,2020-01-01,17315.0,5111.0,12204.0,612.0,19.93,11537.0,18.84,2643.0,-8254.0,236.43
3,STC,2012-12-01,1291.2,710.8,580.4,21.1,29.31,263.5,13.58,71.2,-137.4,26.00
4,XRAY,2010-12-01,3258.0,1348.0,1909.9,142.1,12.98,457.6,3.23,786.6,246.6,34.17
5,GWW,2016-12-01,5694.3,3788.5,1905.8,58.8,30.57,684.7,11.64,2247.1,1972.9,232.25
6,SKM,2015-12-01,24288.4,11223.5,13064.9,70.6,183.55,9386.2,132.93,7039.9,5790.9,20.15
7,BAX,2011-12-01,19073.0,12245.0,6828.0,560.3,11.74,3442.0,6.14,5195.0,2290.0,26.88
8,MAR,2012-12-01,6342.0,7627.0,-1285.0,312.3,-4.13,-3274.0,-10.53,2935.0,2847.0,37.27
9,SJM,2019-04-01,16711.3,8740.8,7970.5,113.7,70.08,-5059.2,-44.48,5959.9,5858.6,122.63


In [3]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [4]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,0.001638,0.000683,0.028169,0.001691,0.000362,0.289974,0.001341,0.000003,0.185569,41.99
1,0.021935,0.013682,0.045186,0.002725,0.000484,0.305348,0.001492,0.017775,0.200646,24.77
2,0.019176,0.006134,0.053813,0.007687,0.000412,0.314710,0.001411,0.005522,0.171446,236.43
3,0.001418,0.000851,0.027398,0.000254,0.000448,0.288934,0.001384,0.000149,0.185811,26.00
4,0.003597,0.001616,0.030420,0.001776,0.000385,0.289378,0.001330,0.001644,0.186491,34.17
5,0.006298,0.004546,0.030410,0.000728,0.000453,0.289897,0.001374,0.004695,0.189546,232.25
6,0.026904,0.013474,0.055770,0.000877,0.001038,0.309792,0.002006,0.014709,0.196303,20.15
7,0.021124,0.014700,0.041596,0.007037,0.000380,0.296202,0.001345,0.010855,0.190107,26.88
8,0.007015,0.009155,0.023159,0.003917,0.000320,0.280846,0.001258,0.006132,0.191093,37.27
9,0.018507,0.010493,0.044193,0.001419,0.000604,0.276765,0.001081,0.012453,0.196423,122.63


In [5]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[1.63843386e-03 6.82611560e-04 2.81693214e-02 ... 1.34060792e-03
  2.92521072e-06 1.85568632e-01]
 [2.19349988e-02 1.36816489e-02 4.51864494e-02 ... 1.49174313e-03
  1.77752519e-02 2.00645972e-01]
 [1.91761262e-02 6.13437854e-03 5.38134120e-02 ... 1.41116898e-03
  5.52237995e-03 1.71446269e-01]
 ...
 [6.79067888e-03 4.97604015e-03 3.06086506e-02 ... 1.34050362e-03
  4.01380699e-03 1.89340224e-01]
 [2.35835177e-03 1.36510305e-03 2.83540777e-02 ... 1.34947368e-03
  1.52340795e-03 1.86742708e-01]
 [7.21358632e-04 8.08087212e-04 2.60515545e-02 ... 1.31145520e-03
  1.21500717e-03 1.86722887e-01]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 5000


[ 41.99  24.77 236.43 ...  49.01   8.07  34.69]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 41.99
Total number of y values 5000


In [44]:
# Repeated KFOLD
# prepare the cross-validation procedure
#from sklearn.model_selection import RepeatedKFold
#cv2 = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)

In [6]:
decision_tree = DecisionTreeRegressor(criterion='mse', max_depth=31, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='random')

In [7]:
CV = KFold(n_splits=10, random_state=None, shuffle=False)

In [8]:
# Applying k-Fold Cross Validation 
# https://stackoverflow.com/questions/24132237/scikit-learn-cross-validation-scoring-for-regression#:~:text=How%20can%20one%20use%20cross_val_score%20for%20regression%3F%20The,accuracy%2C%20which%20is%20not%20very%20meaningful%20for%20regression.
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False) 
scores = cross_val_score(decision_tree, X, y, scoring= mae, cv = CV) 

print(scores)

print("Mean MAE:", scores.mean())
print("SD of the mean:", scores.std())

[-33.530414   -36.07889    -35.03342    -35.93149433 -36.40632121
 -35.03149467 -28.68528457 -34.88657333 -36.02675465 -39.53326667]
Mean MAE: -35.11439134369432
SD of the mean: 2.5987225907551577
