In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score 

In [8]:
data = pd.read_csv('BSTrend_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,CI,2011-12-01,5015.0,3684.0,1331.0,15.3,3.56,1250.0,3.91,2253.0,-832.0,1
1,RE,2014-12-01,1009.8,198.8,811.0,-2.9,20.18,482.8,20.18,150.1,324.0,1
2,PAYX,2016-05-01,-26.7,-152.9,126.2,-0.7,0.36,-6.5,-0.01,0.0,184.5,1
3,DOV,2018-12-01,-2292.6,-678.1,-1614.5,-9.5,-9.20,-1457.1,-10.32,-403.8,-46.0,0
4,BIP,2016-12-01,3540.0,1072.0,2468.0,18.0,1.79,-819.0,-2.94,956.0,690.0,1
5,EOG,2018-12-01,4101.4,1020.5,3080.9,1.5,5.24,3080.9,5.24,-303.8,-1025.2,0
6,MAS,2011-12-01,-843.0,-3.0,-840.0,-0.8,-2.46,-292.0,-0.84,-73.0,-14.0,0
7,ATNI,2011-12-01,50.8,27.3,23.5,0.1,0.59,11.4,0.71,1.7,-9.8,1
8,PEIX,2016-12-01,33.5,-13.1,46.8,0.9,0.22,16.3,0.21,-29.4,-41.0,1
9,ATVI,2011-12-01,-170.0,-459.0,289.0,-59.3,0.64,391.0,0.44,0.0,-17.0,0


In [9]:
classes = data['Classification'] 
data = data.drop(['Company', 'Time', 'Classification'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")
    
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Classification', classes)
data

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Classification
0,0.574127,0.681912,0.526422,0.206908,0.001079,0.496337,0.156601,0.665760,0.595254,1
1,0.561622,0.668232,0.523127,0.206132,0.001419,0.491149,0.156887,0.650948,0.602601,1
2,0.558386,0.666852,0.518787,0.206226,0.001014,0.487841,0.156532,0.649890,0.601714,1
3,0.551311,0.664790,0.507757,0.205851,0.000819,0.478032,0.156351,0.647046,0.600249,0
4,0.569522,0.671660,0.533627,0.207023,0.001043,0.482346,0.156481,0.656624,0.604927,1
5,0.571274,0.671457,0.537511,0.206320,0.001114,0.508718,0.156625,0.647751,0.594026,0
6,0.555837,0.667440,0.512665,0.206222,0.000956,0.485910,0.156518,0.649376,0.600452,0
7,0.558628,0.667559,0.518137,0.206260,0.001019,0.487962,0.156545,0.649902,0.600479,1
8,0.558574,0.667400,0.518284,0.206294,0.001011,0.487995,0.156536,0.649683,0.600281,1
9,0.557938,0.665650,0.519819,0.203730,0.001020,0.490529,0.156540,0.649890,0.600433,0


In [10]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.57412675 0.68191239 0.52642218 ... 0.15660121 0.66576035 0.59525398]
 [0.56162183 0.66823206 0.52312694 ... 0.15688709 0.65094776 0.60260052]
 [0.5583857  0.66685155 0.51878735 ... 0.15653233 0.64989047 0.60171398]
 ...
 [0.55809846 0.67046279 0.51237429 ... 0.15644957 0.65595174 0.608092  ]
 [0.56398688 0.67446145 0.51787038 ... 0.15653408 0.66283855 0.61190508]
 [0.56259345 0.67198972 0.51903259 ... 0.15619478 0.65718794 0.60745648]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[1. 1. 1. ... 0. 0. 1.]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 1.0
Total number of y values 4500


In [11]:
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=61, p=2,
                     weights='distance')

In [12]:
CV = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

scores = cross_val_score(knn, X, y, cv = CV) 

print(scores)

print("Mean Accuracy:", scores.mean())
print("SD of the mean:", scores.std())

[0.68736142 0.68957871 0.69555556 0.69333333 0.69111111 0.68444444
 0.7        0.71333333 0.68374165 0.69710468]
Mean Accuracy: 0.693556423598251
SD of the mean: 0.008299518216221607
