In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('DeltaBS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,SIRI,2019-12-01,2976.0,1895.0,1081.0,68.1,0.25,-1438.0,-0.30,1458.0,1406.0,1.44
1,NEWT,2015-12-01,50.6,13.1,37.5,4.3,-2.25,34.0,-2.22,7.6,24.1,1.87
2,ADM,2019-12-01,3164.0,2926.0,238.0,-2.2,0.52,-1208.0,-2.08,1512.0,2654.0,5.38
3,PXD,2017-12-01,544.0,-324.0,868.0,0.5,4.94,872.0,4.96,-481.0,-25.0,-7.22
4,TROW,2018-12-01,153.9,106.5,47.4,-8.1,1.96,299.9,1.88,0.0,477.5,-12.61
5,PRU,2013-12-01,22546.0,25777.0,-3231.0,-2.0,-6.62,-3160.0,-6.49,3913.0,10574.0,38.89
6,LEG,2017-12-01,566.7,469.9,96.8,-3.5,0.84,63.8,0.49,291.9,47.7,-1.15
7,ADS,2012-12-01,3019.9,2667.4,352.5,-0.1,7.12,-148.4,-3.15,2387.1,1710.0,40.92
8,PPG,2018-12-01,-523.0,417.0,-940.0,-14.0,-2.49,-982.0,-4.28,870.0,1398.0,-14.59
9,VZ,2015-12-01,11559.0,7393.0,4166.0,-81.6,1.07,-9660.0,-2.82,-3699.0,2384.0,-0.56


In [3]:
prices = data['Price_Change'] 
data = data.drop(['Company', 'Time', 'Price_Change'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [4]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price_Change', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price_Change
0,0.567761,0.674890,0.524838,0.209157,0.001012,0.478161,0.156527,0.660160,0.609477,1.44
1,0.558627,0.667503,0.518225,0.206439,0.000961,0.488115,0.156493,0.649944,0.600695,1.87
2,0.568348,0.678937,0.519496,0.206162,0.001017,0.479716,0.156496,0.660541,0.617408,5.38
3,0.560168,0.666180,0.523488,0.206277,0.001108,0.493781,0.156620,0.646502,0.600383,-7.22
4,0.558950,0.667870,0.518288,0.205911,0.001047,0.489913,0.156566,0.649890,0.603576,-12.61
5,0.628862,0.768633,0.497513,0.206171,0.000871,0.466516,0.156418,0.677453,0.667741,38.89
6,0.560238,0.669296,0.518601,0.206107,0.001024,0.488316,0.156541,0.651947,0.600845,-1.15
7,0.567898,0.677922,0.520221,0.206252,0.001152,0.486881,0.156477,0.666705,0.611409,40.92
8,0.556836,0.669089,0.512031,0.205659,0.000956,0.481244,0.156457,0.656019,0.609426,-14.59
9,0.594558,0.696471,0.544388,0.202780,0.001028,0.422563,0.156483,0.623835,0.615692,-0.56


In [5]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.56776064 0.67489009 0.52483793 ... 0.15652723 0.66016046 0.60947678]
 [0.55862704 0.66750314 0.51822526 ... 0.1564935  0.649944   0.60069462]
 [0.56834761 0.67893704 0.51949583 ... 0.15649596 0.66054083 0.61740799]
 ...
 [0.56725797 0.67263699 0.52745512 ... 0.1565369  0.65582847 0.60690295]
 [0.5811891  0.69430837 0.52074422 ... 0.15652372 0.67007121 0.60685211]
 [0.56018282 0.67036623 0.51676141 ... 0.15630548 0.65148239 0.60218744]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 4500


[  1.44   1.87   5.38 ...   5.83  77.77 334.81]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 1.44
Total number of y values 4500


In [6]:
# Grid Search for Optimal KNN Hyperparameters
# Using GridSearchCV to find best parameters
kvals = range(1,100)
params = {'n_neighbors': kvals,
        'weights' : ['uniform', 'distance'],
         'metric' : ['euclidean', 'manhattan']}

knn = KNeighborsRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

gs_modelKNN = GridSearchCV(knn, params, scoring = mae,  cv=5)
grid_results = gs_modelKNN.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_modelKNN.best_params_)
print("Best score on Test Data", gs_modelKNN.best_score_)
print("Optimal Configuration", gs_modelKNN.best_estimator_)

Best Parameters {'metric': 'manhattan', 'n_neighbors': 99, 'weights': 'distance'}
Best score on Test Data -15.947575300779546
Optimal Configuration KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=99, p=2,
                    weights='distance')


In [9]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-23.26146    -23.26146    -20.77969222 -20.76936987 -19.35353111
 -19.40577775 -18.76155278 -18.78600284 -18.37645467 -18.39274905
 -17.89638815 -17.94524098 -17.72755587 -17.75532685 -17.58501361
 -17.60792708 -17.4416158  -17.45983577 -17.26458689 -17.29168525
 -17.19625636 -17.20585549 -17.12579926 -17.13153612 -17.05237299
 -17.04320397 -16.97594841 -16.97080583 -16.96162178 -16.93985712
 -16.91578431 -16.88965996 -16.84922523 -16.82465323 -16.75165556
 -16.73831915 -16.6899807  -16.67863332 -16.66661033 -16.65804159
 -16.63663767 -16.62127488 -16.59975273 -16.58206666 -16.55093372
 -16.53651058 -16.52177546 -16.50927891 -16.49329636 -16.48466561
 -16.44288949 -16.43922939 -16.43050996 -16.4243805  -16.39957381
 -16.39493376 -16.37958989 -16.37332728 -16.37049141 -16.36313243
 -16.35575971 -16.3465093  -16.33469937 -16.33035268 -16.32705764
 -16.318661   -16.30379235 -16.29443782 -16.30372432 -16.29097235
 -16.29229265 -16.27878389 -16.28708144 -16.26650554 -16.26166216
 -16

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003750,3.920330e-03,0.024185,0.008927,euclidean,1,uniform,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-20.651544,-23.024333,-23.856611,-26.174578,-22.600233,-23.261460,1.796974,395
1,0.002397,1.957542e-03,0.016733,0.002943,euclidean,1,distance,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-20.651544,-23.024333,-23.856611,-26.174578,-22.600233,-23.261460,1.796974,395
2,0.001598,1.957523e-03,0.016409,0.002669,euclidean,2,uniform,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-19.288967,-20.250828,-20.860378,-22.938100,-20.560189,-20.779692,1.201281,392
3,0.001598,1.957639e-03,0.015179,0.001597,euclidean,2,distance,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-19.159247,-20.155604,-21.007705,-23.031200,-20.493094,-20.769370,1.282046,391
4,0.003197,1.598334e-03,0.016777,0.002989,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-18.142196,-18.981337,-19.272722,-21.272222,-19.099178,-19.353531,1.035226,385
5,0.000000,0.000000e+00,0.018375,0.003196,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-18.008108,-19.000321,-19.413364,-21.489255,-19.117840,-19.405778,1.143829,387
6,0.000800,1.599312e-03,0.019174,0.001597,euclidean,4,uniform,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-17.301206,-18.900794,-18.423669,-20.638322,-18.543772,-18.761553,1.080357,383
7,0.002398,1.958361e-03,0.017575,0.001958,euclidean,4,distance,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-17.254232,-18.833527,-18.556771,-20.785723,-18.499761,-18.786003,1.138590,384
8,0.000799,1.598167e-03,0.021585,0.005425,euclidean,5,uniform,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-16.951024,-18.428876,-18.109731,-19.991649,-18.400993,-18.376455,0.971102,379
9,0.000799,1.597118e-03,0.019974,0.002527,euclidean,5,distance,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-16.874429,-18.415024,-18.192295,-20.154208,-18.327788,-18.392749,1.044311,380


In [10]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
395,0.001598,0.001957,0.053309,0.005755,manhattan,99,distance,"{'metric': 'manhattan', 'n_neighbors': 99, 'we...",-14.408138,-16.585719,-15.452631,-17.611458,-15.679930,-15.947575,1.082793,1
393,0.003196,0.001598,0.052815,0.003937,manhattan,98,distance,"{'metric': 'manhattan', 'n_neighbors': 98, 'we...",-14.395653,-16.587913,-15.465195,-17.617046,-15.673394,-15.947840,1.087520,2
391,0.004794,0.001599,0.104669,0.006393,manhattan,97,distance,"{'metric': 'manhattan', 'n_neighbors': 97, 'we...",-14.394010,-16.592448,-15.458673,-17.622269,-15.679258,-15.949331,1.090415,3
389,0.003219,0.001610,0.103854,0.005628,manhattan,96,distance,"{'metric': 'manhattan', 'n_neighbors': 96, 'we...",-14.399543,-16.609643,-15.457130,-17.636729,-15.679862,-15.956581,1.095425,4
387,0.003994,0.000026,0.106250,0.005412,manhattan,95,distance,"{'metric': 'manhattan', 'n_neighbors': 95, 'we...",-14.415374,-16.621454,-15.460326,-17.639518,-15.679218,-15.963178,1.092944,5
381,0.003993,0.000001,0.100661,0.004684,manhattan,92,distance,"{'metric': 'manhattan', 'n_neighbors': 92, 'we...",-14.433601,-16.601793,-15.457299,-17.643672,-15.684881,-15.964249,1.086725,6
383,0.003994,0.000003,0.104669,0.004653,manhattan,93,distance,"{'metric': 'manhattan', 'n_neighbors': 93, 'we...",-14.440740,-16.600806,-15.463672,-17.648687,-15.685959,-15.967973,1.085502,7
379,0.001609,0.001970,0.053889,0.011343,manhattan,91,distance,"{'metric': 'manhattan', 'n_neighbors': 91, 'we...",-14.444386,-16.610521,-15.453986,-17.647020,-15.686595,-15.968502,1.085978,8
195,0.004792,0.001598,0.160580,0.030662,euclidean,98,distance,"{'metric': 'euclidean', 'n_neighbors': 98, 'we...",-14.477741,-16.627004,-15.423428,-17.711133,-15.608533,-15.969568,1.106294,9
193,0.001598,0.001957,0.059919,0.005649,euclidean,97,distance,"{'metric': 'euclidean', 'n_neighbors': 97, 'we...",-14.473050,-16.628170,-15.437799,-17.703653,-15.607919,-15.970118,1.103987,10
