In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [6]:
data = pd.read_csv('IS_data.csv')
data

Unnamed: 0,Company,Time,Total Revenues,Operating Income,Net Income,Revenue Per Share,Basic EPS,Normalized Basic EPS,Dividend Per Share,EBITA,EBIT,Normalized Net Income,Price
0,CTAS,2019-05-01,6892.3,1147.9,885.0,64.97,8.25,6.17,2.05,1284.4,1147.9,654.6,221.83
1,PGR,2014-12-01,19377.3,2029.1,1281.0,32.81,2.17,2.02,0.69,2029.1,2029.1,1195.1,26.99
2,ELP,2018-12-01,3857.8,583.3,363.5,14.10,1.33,1.12,0.37,676.6,583.3,306.2,7.83
3,PHM,2016-12-01,7676.5,980.2,602.7,22.59,1.76,1.82,0.36,994.0,980.2,619.1,18.38
4,F,2019-12-01,155900.0,2658.0,47.0,39.25,0.01,0.40,0.60,2658.0,2658.0,1600.5,9.30
5,WM,2011-12-01,13378.0,2081.0,961.0,28.48,2.05,1.98,1.36,2132.0,2081.0,929.5,32.71
6,SIRI,2019-12-01,7794.0,1756.0,914.0,1.73,0.20,0.19,0.05,1897.0,1756.0,851.9,7.15
7,SCCO,2017-12-01,6654.5,2618.9,728.5,8.61,0.94,1.88,0.59,2624.8,2618.9,1449.7,46.81
8,BAX,2017-12-01,10584.0,1510.0,602.0,19.49,1.11,1.59,0.61,1664.0,1510.0,865.6,64.64
9,CTSH,2014-12-01,10263.0,1885.0,1439.0,16.88,2.37,1.98,0.00,1921.0,1885.0,1201.3,52.66


In [7]:
prices = data['Price'] 
data = data.drop(['Company', 'Time', 'Price'], axis = 1)

col_list = []
for col in data.columns:
    col_list.append(col)
    
for col in col_list:
    print(col)
    data[col] = data[col].astype(float) # Converting columns to floats
    print("Done")

Total Revenues
Done
Operating Income
Done
Net Income
Done
Revenue Per Share
Done
Basic EPS
Done
Normalized Basic EPS
Done
Dividend Per Share
Done
EBITA
Done
EBIT
Done
Normalized Net Income
Done


In [8]:
# Normalising the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() 
scaled_values = scaler.fit_transform(data) 
data.loc[:,:] = scaled_values

data.insert(data.shape[1], 'Price', prices)
data

Unnamed: 0,Total Revenues,Operating Income,Net Income,Revenue Per Share,Basic EPS,Normalized Basic EPS,Dividend Per Share,EBITA,EBIT,Normalized Net Income,Price
0,0.013132,0.152721,0.191876,0.000414,0.000911,0.000826,0.155303,0.151308,0.152721,0.151562,221.83
1,0.036961,0.159877,0.195145,0.000208,0.000790,0.000720,0.052273,0.157298,0.159877,0.158824,26.99
2,0.007341,0.148136,0.187570,0.000089,0.000773,0.000697,0.028030,0.146419,0.148136,0.146882,7.83
3,0.014629,0.151359,0.189545,0.000143,0.000781,0.000715,0.027273,0.148972,0.151359,0.151086,18.38
4,0.297524,0.164984,0.184957,0.000249,0.000746,0.000678,0.045455,0.162357,0.164984,0.164270,9.30
5,0.025511,0.160299,0.192503,0.000181,0.000787,0.000719,0.103030,0.158126,0.160299,0.155255,32.71
6,0.014853,0.157659,0.192115,0.000010,0.000750,0.000673,0.003788,0.156236,0.157659,0.154213,7.15
7,0.012679,0.164667,0.190584,0.000054,0.000765,0.000716,0.044697,0.162090,0.164667,0.162244,46.81
8,0.020178,0.155661,0.189539,0.000123,0.000768,0.000709,0.046212,0.154362,0.155661,0.154397,64.64
9,0.019566,0.158707,0.196450,0.000107,0.000794,0.000719,0.000000,0.156429,0.158707,0.158907,52.66


In [9]:
# Converting Data to Numpy Arrays
NpMatrix = data.to_numpy(dtype = None, copy = False)
X = NpMatrix[:,0:10] # Parameters
y = NpMatrix[:,10] # Price
print("X:", X)
print("\n")
print("Type X:", type(X))
print("Length of Individual X_train Vector:", len(X[1]))
print("Total Number of Training instances:", len(X))
print("\n")
print(y)
print("\n")
print("Type y:", type(y))
print("Length of Individual y_train vector", y[0])
print("Total number of y values", len(y))

X: [[0.01313249 0.15272091 0.19187598 ... 0.15130826 0.15272091 0.15156246]
 [0.03696098 0.15987705 0.19514547 ... 0.15729821 0.15987705 0.1588236 ]
 [0.00734093 0.14813585 0.18757033 ... 0.14641947 0.14813585 0.14688201]
 ...
 [0.31257509 0.34565004 0.29170268 ... 0.38301227 0.34565004 0.30593264]
 [0.00518902 0.15225639 0.19026601 ... 0.14998432 0.15225639 0.15134751]
 [0.00816982 0.15014171 0.18915389 ... 0.147815   0.15014171 0.14926791]]


Type X: <class 'numpy.ndarray'>
Length of Individual X_train Vector: 10
Total Number of Training instances: 5000


[221.83  26.99   7.83 ...  42.53  50.32  32.35]


Type y: <class 'numpy.ndarray'>
Length of Individual y_train vector 221.83
Total number of y values 5000


In [10]:
# Grid Search for Optimal KNN Hyperparameters
# Using GridSearchCV to find best parameters
kvals = range(1,100)
params = {'n_neighbors': kvals,
        'weights' : ['uniform', 'distance'],
         'metric' : ['euclidean', 'manhattan']}

knn = KNeighborsRegressor()
#mse = make_scorer(mean_squared_error, greater_is_better= False)
mae = make_scorer(mean_absolute_error, greater_is_better= False)

gs_modelKNN = GridSearchCV(knn, params, scoring = mae,  cv=5)
grid_results = gs_modelKNN.fit(X , y)

# Best parameters, best test score using these parameters, optimal KNN Configuration
print("Best Parameters", gs_modelKNN.best_params_)
print("Best score on Test Data", gs_modelKNN.best_score_)
print("Optimal Configuration", gs_modelKNN.best_estimator_)

Best Parameters {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best score on Test Data -37.430815964160146
Optimal Configuration KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='manhattan',
                    metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                    weights='distance')


In [13]:
# Summarize the results in a readable format
print("Best: {0}, using {1}".format(grid_results.cv_results_['mean_test_score'], grid_results.best_params_))
results_df = pd.DataFrame(grid_results.cv_results_)
results_df

Best: [-42.48185    -42.48185    -40.900799   -40.23324274 -40.72819267
 -39.78617392 -40.6592345  -39.51440263 -41.0276     -39.6885985
 -41.68243667 -40.11105157 -41.75045657 -40.14699774 -41.7285915
 -40.1423176  -42.07119178 -40.42240337 -42.4075774  -40.65441814
 -42.41287455 -40.68351382 -42.48807483 -40.74190165 -42.64927015
 -40.8306175  -42.83254871 -40.98284794 -43.0273988  -41.13026415
 -43.20628438 -41.2952593  -43.26890635 -41.38403336 -43.421183
 -41.49656674 -43.45877284 -41.55784055 -43.5126611  -41.64075847
 -43.46565629 -41.65779193 -43.55286909 -41.73881516 -43.60729939
 -41.80957201 -43.63458275 -41.84702406 -43.73178792 -41.92960397
 -43.70932231 -41.94004337 -43.77751407 -42.0055958  -43.78979657
 -42.04587684 -43.80148241 -42.07580524 -43.7812496  -42.08284028
 -43.81231148 -42.13980812 -43.80313531 -42.14898562 -43.82618212
 -42.18028269 -43.76739188 -42.16366608 -43.76662074 -42.1753105
 -43.75849106 -42.1923414  -43.79486005 -42.23412821 -43.80559247
 -42.2555

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004183,0.003421,0.011182,0.005202,euclidean,1,uniform,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-44.670200,-55.268620,-36.360640,-38.367900,-37.741890,-42.481850,7.002162,161
1,0.002385,0.000495,0.010779,0.003615,euclidean,1,distance,"{'metric': 'euclidean', 'n_neighbors': 1, 'wei...",-44.670200,-55.268620,-36.360640,-38.367900,-37.741890,-42.481850,7.002162,161
2,0.002393,0.000489,0.011580,0.004306,euclidean,2,uniform,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-42.477095,-51.418660,-35.480210,-38.045655,-37.082375,-40.900799,5.748480,48
3,0.002594,0.000488,0.011955,0.004065,euclidean,2,distance,"{'metric': 'euclidean', 'n_neighbors': 2, 'wei...",-41.881488,-50.913267,-34.631130,-37.573852,-36.166478,-40.233243,5.861301,30
4,0.002797,0.000392,0.012566,0.004751,euclidean,3,uniform,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-42.877167,-49.720153,-34.954567,-37.722057,-38.367020,-40.728193,5.165660,43
5,0.002392,0.000490,0.012954,0.004642,euclidean,3,distance,"{'metric': 'euclidean', 'n_neighbors': 3, 'wei...",-42.179329,-49.131670,-33.925449,-36.721736,-36.972685,-39.786174,5.379964,20
6,0.002392,0.000488,0.013892,0.004610,euclidean,4,uniform,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-41.685268,-50.493775,-34.402635,-37.712798,-39.001697,-40.659235,5.446924,40
7,0.002406,0.000502,0.013740,0.004815,euclidean,4,distance,"{'metric': 'euclidean', 'n_neighbors': 4, 'wei...",-40.972438,-49.614913,-33.362089,-36.343462,-37.279112,-39.514403,5.604436,15
8,0.002792,0.000400,0.014570,0.004843,euclidean,5,uniform,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-42.111580,-51.024842,-34.473346,-38.154400,-39.373832,-41.027600,5.568992,53
9,0.002786,0.000396,0.016158,0.005078,euclidean,5,distance,"{'metric': 'euclidean', 'n_neighbors': 5, 'wei...",-41.278650,-49.780624,-33.158236,-36.588193,-37.637290,-39.688598,5.671741,17


In [14]:
results_df = results_df.sort_values(['rank_test_score'])
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
203,0.002385,4.774543e-04,0.011420,0.002318,manhattan,3,distance,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",-36.862529,-50.201780,-31.506582,-35.170511,-33.412677,-37.430816,6.629803,1
201,0.003199,3.954163e-04,0.010971,0.003002,manhattan,2,distance,"{'metric': 'manhattan', 'n_neighbors': 2, 'wei...",-36.898811,-49.755115,-32.295692,-35.133963,-33.291529,-37.475022,6.339406,2
205,0.003180,4.049206e-04,0.013572,0.003707,manhattan,4,distance,"{'metric': 'manhattan', 'n_neighbors': 4, 'wei...",-37.297189,-50.011769,-32.196702,-35.141979,-34.932721,-37.916072,6.260927,3
207,0.002792,3.988989e-04,0.012766,0.003179,manhattan,5,distance,"{'metric': 'manhattan', 'n_neighbors': 5, 'wei...",-37.552430,-50.234851,-31.665158,-35.597144,-35.137165,-38.037350,6.387248,4
200,0.002998,1.157251e-05,0.011763,0.002633,manhattan,2,uniform,"{'metric': 'manhattan', 'n_neighbors': 2, 'wei...",-37.519030,-50.306145,-32.944055,-36.056190,-33.985230,-38.162130,6.276949,5
202,0.002998,1.493452e-05,0.011763,0.002707,manhattan,3,uniform,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",-37.529633,-50.947620,-32.420863,-36.745727,-34.407600,-38.410289,6.521131,6
209,0.002393,4.891819e-04,0.013671,0.003141,manhattan,6,distance,"{'metric': 'manhattan', 'n_neighbors': 6, 'wei...",-37.494768,-50.284659,-31.851322,-36.152084,-36.354194,-38.427405,6.231925,7
211,0.002200,3.962626e-04,0.013756,0.002636,manhattan,7,distance,"{'metric': 'manhattan', 'n_neighbors': 7, 'wei...",-37.730359,-50.710556,-31.888652,-36.503582,-36.820020,-38.730634,6.323638,8
213,0.003205,3.893315e-04,0.016343,0.004943,manhattan,8,distance,"{'metric': 'manhattan', 'n_neighbors': 8, 'wei...",-37.279475,-50.921533,-31.993196,-36.523393,-37.033188,-38.750157,6.385425,9
215,0.002392,4.743728e-04,0.013764,0.003173,manhattan,9,distance,"{'metric': 'manhattan', 'n_neighbors': 9, 'wei...",-37.572441,-51.098259,-32.020632,-36.883301,-37.070230,-38.928972,6.407754,10
