In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [24]:
data = pd.read_csv('BSTrend_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,CI,2011-12-01,5015.0,3684.0,1331.0,15.3,3.56,1250.0,3.91,2253.0,-832.0,1
1,RE,2014-12-01,1009.8,198.8,811.0,-2.9,20.18,482.8,20.18,150.1,324.0,1
2,PAYX,2016-05-01,-26.7,-152.9,126.2,-0.7,0.36,-6.5,-0.01,0.0,184.5,1
3,DOV,2018-12-01,-2292.6,-678.1,-1614.5,-9.5,-9.20,-1457.1,-10.32,-403.8,-46.0,0
4,BIP,2016-12-01,3540.0,1072.0,2468.0,18.0,1.79,-819.0,-2.94,956.0,690.0,1
5,EOG,2018-12-01,4101.4,1020.5,3080.9,1.5,5.24,3080.9,5.24,-303.8,-1025.2,0
6,MAS,2011-12-01,-843.0,-3.0,-840.0,-0.8,-2.46,-292.0,-0.84,-73.0,-14.0,0
7,ATNI,2011-12-01,50.8,27.3,23.5,0.1,0.59,11.4,0.71,1.7,-9.8,1
8,PEIX,2016-12-01,33.5,-13.1,46.8,0.9,0.22,16.3,0.21,-29.4,-41.0,1
9,ATVI,2011-12-01,-170.0,-459.0,289.0,-59.3,0.64,391.0,0.44,0.0,-17.0,0


In [26]:
count_1 = 0
count_0 = 0
for element in data['Classification'].tolist():
    if element == 1:
        count_1 += 1
    elif element == 0:
        count_0 += 1
        
print("Count 1:", count_1)
print("Count 0:", count_0)

Count 1: 3038
Count 0: 1462


In [27]:
classes = data['Classification'] 
data = data.drop(['Company', 'Time', 'Classification'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [28]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Classification', classes)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,0.574127,0.681912,0.526422,0.206908,0.001079,0.496337,0.156601,0.665760,0.595254,1
1,0.561622,0.668232,0.523127,0.206132,0.001419,0.491149,0.156887,0.650948,0.602601,1
2,0.558386,0.666852,0.518787,0.206226,0.001014,0.487841,0.156532,0.649890,0.601714,1
3,0.551311,0.664790,0.507757,0.205851,0.000819,0.478032,0.156351,0.647046,0.600249,0
4,0.569522,0.671660,0.533627,0.207023,0.001043,0.482346,0.156481,0.656624,0.604927,1
5,0.571274,0.671457,0.537511,0.206320,0.001114,0.508718,0.156625,0.647751,0.594026,0
6,0.555837,0.667440,0.512665,0.206222,0.000956,0.485910,0.156518,0.649376,0.600452,0
7,0.558628,0.667559,0.518137,0.206260,0.001019,0.487962,0.156545,0.649902,0.600479,1
8,0.558574,0.667400,0.518284,0.206294,0.001011,0.487995,0.156536,0.649683,0.600281,1
9,0.557938,0.665650,0.519819,0.203730,0.001020,0.490529,0.156540,0.649890,0.600433,0


In [29]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.57412675 0.68191239 0.52642218 ... 0.15660121 0.66576035 0.59525398]
 [0.56162183 0.66823206 0.52312694 ... 0.15688709 0.65094776 0.60260052]
 [0.5583857  0.66685155 0.51878735 ... 0.15653233 0.64989047 0.60171398]
 ...
 [0.55809846 0.67046279 0.51237429 ... 0.15644957 0.65595174 0.608092  ]
 [0.56398688 0.67446145 0.51787038 ... 0.15653408 0.66283855 0.61190508]
 [0.56259345 0.67198972 0.51903259 ... 0.15619478 0.65718794 0.60745648]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[1. 1. 1. ... 0. 0. 1.]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 1.0
Total number of y values 4500


In [30]:
# Hyperparameter Grid Search

params = {
        'criterion' : ['gini', 'entropy'],
        'splitter' : ['best', 'random'],
          'max_depth': range(1,20), 
         }

decision_tree = DecisionTreeClassifier()

cv_str = StratifiedKFold(n_splits=5)

gs_decision_tree = GridSearchCV(decision_tree, params,  cv= cv_str)
grid_results = gs_decision_tree.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_decision_tree.best_params_)
print("Best score on Test Data", gs_decision_tree.best_score_)
print("Optimal Configuration", gs_decision_tree.best_estimator_)

Best Parameters {'criterion': 'entropy', 'max_depth': 4, 'splitter': 'best'}
Best score on Test Data 0.6813333333333333
Optimal Configuration DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')


In [31]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [0.67511111 0.67422222 0.67222222 0.67422222 0.66644444 0.67466667
 0.67755556 0.67488889 0.674      0.67288889 0.67911111 0.67044444
 0.66844444 0.67266667 0.67088889 0.66977778 0.66066667 0.67577778
 0.64666667 0.67533333 0.646      0.67088889 0.64355556 0.67066667
 0.63755556 0.67111111 0.63733333 0.67333333 0.62533333 0.66222222
 0.62333333 0.65844444 0.61688889 0.66777778 0.61155556 0.65244444
 0.604      0.65177778 0.67511111 0.67511111 0.67177778 0.67444444
 0.66622222 0.67622222 0.68133333 0.67577778 0.67244444 0.67377778
 0.67       0.67555556 0.67066667 0.67422222 0.66844444 0.66911111
 0.66644444 0.67377778 0.658      0.68133333 0.65844444 0.67711111
 0.65044444 0.67288889 0.64733333 0.66666667 0.64644444 0.66333333
 0.63088889 0.668      0.63177778 0.66844444 0.62355556 0.66333333
 0.62288889 0.66       0.61933333 0.64244444], using {'criterion': 'entropy', 'max_depth': 4, 'splitter': 'best'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008382,1.347443e-03,0.001198,9.758342e-04,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.674806,0.674806,0.675556,0.675195,0.675195,0.675111,0.000282,11
1,0.001786,1.156307e-03,0.001006,1.897608e-05,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.674806,0.674806,0.675556,0.672970,0.672970,0.674222,0.001058,17
2,0.012369,1.718288e-03,0.000987,1.885431e-05,gini,2,best,"{'criterion': 'gini', 'max_depth': 2, 'splitte...",0.674806,0.677026,0.675556,0.675195,0.658509,0.672222,0.006893,28
3,0.000999,2.998521e-05,0.000408,5.004153e-04,gini,2,random,"{'criterion': 'gini', 'max_depth': 2, 'splitte...",0.674806,0.674806,0.674444,0.672970,0.674082,0.674222,0.000681,17
4,0.009776,3.966626e-04,0.000199,3.981590e-04,gini,3,best,"{'criterion': 'gini', 'max_depth': 3, 'splitte...",0.680355,0.674806,0.656667,0.654060,0.666296,0.666444,0.010125,45
5,0.000997,2.376544e-06,0.000199,3.981590e-04,gini,3,random,"{'criterion': 'gini', 'max_depth': 3, 'splitte...",0.673696,0.674806,0.674444,0.674082,0.676307,0.674667,0.000899,15
6,0.011769,3.998505e-04,0.000200,3.996849e-04,gini,4,best,"{'criterion': 'gini', 'max_depth': 4, 'splitte...",0.673696,0.694784,0.674444,0.684093,0.660734,0.677556,0.011376,4
7,0.001196,3.954061e-04,0.000398,4.880527e-04,gini,4,random,"{'criterion': 'gini', 'max_depth': 4, 'splitte...",0.674806,0.677026,0.676667,0.670745,0.675195,0.674889,0.002235,14
8,0.016155,1.330263e-03,0.000589,4.814066e-04,gini,5,best,"{'criterion': 'gini', 'max_depth': 5, 'splitte...",0.687014,0.684795,0.673333,0.667408,0.657397,0.674000,0.011004,20
9,0.000996,1.945122e-06,0.000798,3.989960e-04,gini,5,random,"{'criterion': 'gini', 'max_depth': 5, 'splitte...",0.673696,0.668147,0.673333,0.676307,0.672970,0.672889,0.002647,24


In [33]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
57,0.002202,3.949990e-04,0.000200,4.001617e-04,entropy,10,random,"{'criterion': 'entropy', 'max_depth': 10, 'spl...",0.671476,0.679245,0.677778,0.694105,0.684093,0.681333,0.007547,1
44,0.017352,4.706254e-04,0.000400,4.898664e-04,entropy,4,best,"{'criterion': 'entropy', 'max_depth': 4, 'spli...",0.688124,0.693674,0.676667,0.684093,0.664071,0.681333,0.010253,1
10,0.016748,3.960786e-04,0.000797,3.986888e-04,gini,6,best,"{'criterion': 'gini', 'max_depth': 6, 'splitte...",0.680355,0.688124,0.676667,0.690768,0.659622,0.679111,0.010991,3
6,0.011769,3.998505e-04,0.000200,3.996849e-04,gini,4,best,"{'criterion': 'gini', 'max_depth': 4, 'splitte...",0.673696,0.694784,0.674444,0.684093,0.660734,0.677556,0.011376,4
59,0.004187,1.470273e-03,0.000399,4.888124e-04,entropy,11,random,"{'criterion': 'entropy', 'max_depth': 11, 'spl...",0.691454,0.687014,0.663333,0.672970,0.670745,0.677111,0.010503,5
43,0.001197,3.989223e-04,0.000199,3.984451e-04,entropy,3,random,"{'criterion': 'entropy', 'max_depth': 3, 'spli...",0.678135,0.674806,0.674444,0.675195,0.678532,0.676222,0.001744,6
45,0.001197,3.985168e-04,0.000199,3.988266e-04,entropy,4,random,"{'criterion': 'entropy', 'max_depth': 4, 'spli...",0.674806,0.675916,0.674444,0.678532,0.675195,0.675778,0.001460,7
17,0.001796,3.970876e-04,0.000391,4.790024e-04,gini,9,random,"{'criterion': 'gini', 'max_depth': 9, 'splitte...",0.678135,0.677026,0.675556,0.676307,0.671858,0.675778,0.002136,7
49,0.001603,4.933957e-04,0.000391,4.790428e-04,entropy,6,random,"{'criterion': 'entropy', 'max_depth': 6, 'spli...",0.680355,0.674806,0.675556,0.676307,0.670745,0.675556,0.003076,9
19,0.003790,7.463788e-04,0.000798,3.988748e-04,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.663707,0.678135,0.670000,0.679644,0.685206,0.675333,0.007583,10
