In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [3]:
data = pd.read_csv('BS_data.csv')
data

Unnamed: 0,Company,Time,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,RHI,2013-12-01,1490.3,570.6,919.6,135.3,6.80,718.3,5.31,1.4,-274.3,41.99
1,RCL,2011-12-01,19804.4,11396.6,8407.8,217.5,38.74,7442.4,34.29,8507.2,8245.0,24.77
2,NVDA,2020-01-01,17315.0,5111.0,12204.0,612.0,19.93,11537.0,18.84,2643.0,-8254.0,236.43
3,STC,2012-12-01,1291.2,710.8,580.4,21.1,29.31,263.5,13.58,71.2,-137.4,26.00
4,XRAY,2010-12-01,3258.0,1348.0,1909.9,142.1,12.98,457.6,3.23,786.6,246.6,34.17
5,GWW,2016-12-01,5694.3,3788.5,1905.8,58.8,30.57,684.7,11.64,2247.1,1972.9,232.25
6,SKM,2015-12-01,24288.4,11223.5,13064.9,70.6,183.55,9386.2,132.93,7039.9,5790.9,20.15
7,BAX,2011-12-01,19073.0,12245.0,6828.0,560.3,11.74,3442.0,6.14,5195.0,2290.0,26.88
8,MAR,2012-12-01,6342.0,7627.0,-1285.0,312.3,-4.13,-3274.0,-10.53,2935.0,2847.0,37.27
9,SJM,2019-04-01,16711.3,8740.8,7970.5,113.7,70.08,-5059.2,-44.48,5959.9,5858.6,122.63


In [4]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Assets
Done
Total Liabilities
Done
Total Equity
Done
Total Shares Out. on Filing Date
Done
Book Value / Share
Done
Tangible Book Value
Done
Tangible Book Value Per Share
Done
Total Debt
Done
Net Debt
Done


In [5]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Total Assets,Total Liabilities,Total Equity,Total Shares Out. on Filing Date,Book Value / Share,Tangible Book Value,Tangible Book Value Per Share,Total Debt,Net Debt,Price
0,0.001638,0.000683,0.028169,0.001691,0.000362,0.289974,0.001341,0.000003,0.185569,41.99
1,0.021935,0.013682,0.045186,0.002725,0.000484,0.305348,0.001492,0.017775,0.200646,24.77
2,0.019176,0.006134,0.053813,0.007687,0.000412,0.314710,0.001411,0.005522,0.171446,236.43
3,0.001418,0.000851,0.027398,0.000254,0.000448,0.288934,0.001384,0.000149,0.185811,26.00
4,0.003597,0.001616,0.030420,0.001776,0.000385,0.289378,0.001330,0.001644,0.186491,34.17
5,0.006298,0.004546,0.030410,0.000728,0.000453,0.289897,0.001374,0.004695,0.189546,232.25
6,0.026904,0.013474,0.055770,0.000877,0.001038,0.309792,0.002006,0.014709,0.196303,20.15
7,0.021124,0.014700,0.041596,0.007037,0.000380,0.296202,0.001345,0.010855,0.190107,26.88
8,0.007015,0.009155,0.023159,0.003917,0.000320,0.280846,0.001258,0.006132,0.191093,37.27
9,0.018507,0.010493,0.044193,0.001419,0.000604,0.276765,0.001081,0.012453,0.196423,122.63


In [6]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:9] # Parameters
y = NpMatrix[:,9] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[1.63843386e-03 6.82611560e-04 2.81693214e-02 ... 1.34060792e-03
  2.92521072e-06 1.85568632e-01]
 [2.19349988e-02 1.36816489e-02 4.51864494e-02 ... 1.49174313e-03
  1.77752519e-02 2.00645972e-01]
 [1.91761262e-02 6.13437854e-03 5.38134120e-02 ... 1.41116898e-03
  5.52237995e-03 1.71446269e-01]
 ...
 [6.79067888e-03 4.97604015e-03 3.06086506e-02 ... 1.34050362e-03
  4.01380699e-03 1.89340224e-01]
 [2.35835177e-03 1.36510305e-03 2.83540777e-02 ... 1.34947368e-03
  1.52340795e-03 1.86742708e-01]
 [7.21358632e-04 8.08087212e-04 2.60515545e-02 ... 1.31145520e-03
  1.21500717e-03 1.86722887e-01]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 9
Total Number of Training instances: 5000


[ 41.99  24.77 236.43 ...  49.01   8.07  34.69]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 41.99
Total number of y values 5000


In [7]:
# Grid Search for Optimal KNN Hyperparameters
# Using GridSearchCV to find best parameters
kvals = range(1,100)
params = {'n_neighbors': kvals,
        'weights' : ['uniform', 'distance'],
         'metric' : ['euclidean', 'manhattan']}

knn = KNeighborsRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

gs_modelKNN = GridSearchCV(knn, params, scoring = mae,  cv=5)
grid_results = gs_modelKNN.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_modelKNN.best_params_)
print("Best score on Test Data", gs_modelKNN.best_score_)
print("Optimal Configuration", gs_modelKNN.best_estimator_)

Best Parameters {'metric': 'euclidean', 'n_neighbors': 2, 'weights': 'distance'}
Best score on Test Data -27.606501907990744
Optimal Configuration KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='euclidean',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='distance')


In [8]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-28.375058   -28.375058   -28.823881   -27.60650191 -31.19653467
 -28.97231955 -33.2361585  -30.12428803 -34.4839032  -30.94473749
 -35.60414433 -31.69213477 -36.27384086 -32.20339023 -36.861952
 -32.71520211 -37.32342333 -33.14732479 -37.9650764  -33.68641337
 -38.52641836 -34.14341749 -38.898403   -34.5507632  -39.31520985
 -34.96887427 -39.74450271 -35.37865467 -40.15308707 -35.74966757
 -40.63047475 -36.16702223 -40.83746024 -36.40866813 -41.11764022
 -36.70622072 -41.30341579 -36.9160692  -41.5510616  -37.15423674
 -41.82785362 -37.43267113 -42.110837   -37.70973366 -42.30100539
 -37.93767163 -42.50292575 -38.1568125  -42.68035328 -38.36586112
 -42.72387    -38.48475096 -42.75525867 -38.57936679 -42.930935
 -38.77674986 -43.05831524 -38.92644444 -43.28665633 -39.14993455
 -43.30402735 -39.23685339 -43.43660606 -39.39910772 -43.559402
 -39.55403109 -43.70002153 -39.70880164 -43.79147046 -39.82895164
 -43.88284289 -39.94271049 -43.97482881 -40.05645809 -44.01243547
 -40.14401

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011443,3.407104e-03,0.032184,0.004400,euclidean,1,uniform,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-30.664620,-28.884240,-29.594200,-28.492230,-24.240000,-28.375058,2.195454,3
1,0.005611,1.964081e-03,0.015806,0.010717,euclidean,1,distance,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-30.664620,-28.884240,-29.594200,-28.492230,-24.240000,-28.375058,2.195454,3
2,0.003195,1.597524e-03,0.011437,0.001791,euclidean,2,uniform,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-32.334535,-27.628270,-30.095055,-29.582395,-24.479150,-28.823881,2.639018,7
3,0.000799,1.597691e-03,0.011979,0.000028,euclidean,2,distance,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-31.056613,-26.750111,-28.788355,-28.150864,-23.286566,-27.606502,2.568720,1
4,0.004005,2.034626e-05,0.011977,0.000023,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-34.869707,-29.259837,-31.366943,-32.254377,-28.231810,-31.196535,2.331551,15
5,0.003972,2.586788e-05,0.012009,0.000028,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-32.730845,-27.720899,-29.239795,-29.651227,-25.518832,-28.972320,2.372830,9
6,0.002387,1.949041e-03,0.014378,0.001957,euclidean,4,uniform,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-36.401320,-30.830578,-33.445755,-34.795752,-30.707387,-33.236159,2.221532,26
7,0.003196,1.598192e-03,0.019987,0.002526,euclidean,4,distance,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-33.484713,-28.536294,-30.157837,-31.108990,-27.333606,-30.124288,2.124752,12
8,0.003195,1.597459e-03,0.015013,0.001549,euclidean,5,uniform,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-37.808964,-32.219126,-34.770454,-35.978340,-31.642632,-34.483903,2.305364,33
9,0.003186,1.593096e-03,0.016054,0.002641,euclidean,5,distance,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-34.342694,-29.422964,-30.902103,-32.058304,-27.997622,-30.944737,2.181249,14


In [9]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.000799,0.001598,0.011979,0.000028,euclidean,2,distance,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-31.056613,-26.750111,-28.788355,-28.150864,-23.286566,-27.606502,2.568720,1
201,0.002400,0.001959,0.008785,0.001605,manhattan,2,distance,"{'metric': 'manhattan', 'n_neighbors': 2, 'wei...",-31.206236,-27.883596,-27.936440,-30.432060,-23.456740,-28.183015,2.708734,2
0,0.011443,0.003407,0.032184,0.004400,euclidean,1,uniform,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-30.664620,-28.884240,-29.594200,-28.492230,-24.240000,-28.375058,2.195454,3
1,0.005611,0.001964,0.015806,0.010717,euclidean,1,distance,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-30.664620,-28.884240,-29.594200,-28.492230,-24.240000,-28.375058,2.195454,3
199,0.002404,0.001963,0.011994,0.003573,manhattan,1,distance,"{'metric': 'manhattan', 'n_neighbors': 1, 'wei...",-30.725960,-28.362010,-29.484420,-31.085340,-23.567490,-28.645044,2.714829,5
198,0.002229,0.001842,0.007146,0.002358,manhattan,1,uniform,"{'metric': 'manhattan', 'n_neighbors': 1, 'wei...",-30.725960,-28.362010,-29.484420,-31.085340,-23.567490,-28.645044,2.714829,5
2,0.003195,0.001598,0.011437,0.001791,euclidean,2,uniform,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-32.334535,-27.628270,-30.095055,-29.582395,-24.479150,-28.823881,2.639018,7
203,0.003202,0.001601,0.014368,0.006957,manhattan,3,distance,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",-31.776651,-27.930450,-28.606362,-30.794680,-25.433173,-28.908263,2.232028,8
5,0.003972,0.000026,0.012009,0.000028,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-32.730845,-27.720899,-29.239795,-29.651227,-25.518832,-28.972320,2.372830,9
200,0.004002,0.000018,0.010596,0.003058,manhattan,2,uniform,"{'metric': 'manhattan', 'n_neighbors': 2, 'wei...",-32.450640,-28.666575,-29.013025,-32.135710,-24.541200,-29.361430,2.866310,10
