In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score 
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('BSTrend_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,XRX,2016-12-01,-7490.0,-2985.0,-4505.0,0.9,-17.27,-4105.0,-16.22,-963.0,-1958.0,0
1,MCK,2012-03-01,2207.0,2596.0,-389.0,-16.7,0.42,-1342.0,-5.48,-24.0,439.0,1
2,CXO,2015-12-01,890.1,-771.7,1661.8,16.2,7.07,1663.3,7.12,-229.5,-458.1,0
3,BKNG,2015-12-01,-2418.4,-1393.7,-1024.6,0.4,-22.35,-2131.2,-43.89,-1010.9,640.6,0
4,NI,2012-12-01,1136.4,579.4,557.0,29.0,0.17,568.0,1.50,149.4,124.6,1
5,TDS,2013-12-01,49.7,-164.8,214.4,0.9,1.47,269.5,2.37,-220.2,-628.4,1
6,CMG,2019-12-01,3058.9,2740.4,318.6,-0.1,11.96,318.6,11.95,2851.5,2479.7,1
7,PHM,2017-12-01,-491.6,13.8,-505.4,-31.3,-0.11,-491.6,-0.13,-43.1,386.3,1
8,WRB,2014-12-01,-14.3,-5.2,-9.1,4.8,-0.73,-10.0,-0.69,270.7,359.8,0
9,CSCO,2019-07-01,10698.0,24386.0,-13688.0,-1137.6,-0.80,-30059.0,-5.64,7845.0,19017.0,1


In [3]:
classes = data['Classification'] 
data = data.drop(['Company', 'Time', 'Classification'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")
    
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Classification', classes)
data

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,0.384391,0.384644,0.319589,0.383287,0.194941,0.361292,0.233831,0.253257,0.221294,0
1,0.397569,0.394002,0.329956,0.383038,0.195027,0.369540,0.233895,0.255122,0.226272,1
2,0.395779,0.388355,0.335122,0.383505,0.195059,0.378512,0.233969,0.254713,0.224409,0
3,0.391283,0.387312,0.328356,0.383280,0.194917,0.367185,0.233668,0.253161,0.226691,0
4,0.396114,0.390621,0.332339,0.383686,0.195026,0.375242,0.233936,0.255466,0.225619,1
5,0.394637,0.389373,0.331476,0.383287,0.195032,0.374351,0.233941,0.254732,0.224055,1
6,0.398727,0.394244,0.331739,0.383273,0.195083,0.374498,0.233998,0.260833,0.230511,1
7,0.393902,0.389672,0.329663,0.382830,0.195024,0.372079,0.233926,0.255084,0.226163,1
8,0.394550,0.389640,0.330913,0.383343,0.195021,0.373517,0.233923,0.255707,0.226108,0
9,0.409108,0.430539,0.296459,0.367122,0.195021,0.283815,0.233894,0.270751,0.264861,1


In [4]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.3843908  0.38464394 0.31958913 ... 0.23383148 0.25325652 0.22129353]
 [0.39756893 0.39400199 0.32995648 ... 0.23389478 0.25512158 0.22627232]
 [0.39577927 0.38835514 0.33512201 ... 0.23396904 0.25471341 0.22440896]
 ...
 [0.39385209 0.39038855 0.32849558 ... 0.23390964 0.25597765 0.2274895 ]
 [0.39479319 0.38969236 0.33128564 ... 0.23393109 0.25542369 0.22562593]
 [0.39366047 0.38997104 0.32876761 ... 0.23392779 0.25617428 0.22299259]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[0. 1. 0. ... 1. 0. 1.]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 0.0
Total number of y values 4500


In [5]:
RF = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=9, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=25,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
CV = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

scores = cross_val_score(RF, X, y, cv = CV) 

print(scores)

print("Mean MAE:", scores.mean())
print("SD of the mean:", scores.std())

[0.70731707 0.6962306  0.72062084 0.66740576 0.73111111 0.70444444
 0.69265033 0.73051225 0.70824053 0.69933185]
Mean MAE: 0.7057864801527141
SD of the mean: 0.018059733769369094
