In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import pickle

In [2]:
#Load the credit-score dataset from the working directory into a DataFrame
DATA_PATH = 'credit_score.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
#Feature columns that are fed to the model
hypothesis_variables = ['Amount_invested_monthly','Total_EMI_per_month', 'Interest_Rate','Outstanding_Debt', 'Annual_Income', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Num_of_Loan']

x = df[hypothesis_variables] #feature matrix (all rows, before the train/test split)
y = df['Credit_Score'] #target labels

#KNN hyper-parameters to choose from using grid search.
#The grid-search cell reads this dict as `param_grid_knn`; it was previously
#defined only under the name `param_grid_SVR`, which raised a NameError on a fresh kernel.
param_grid_knn = {
    'n_neighbors': [2, 3, 4, 5, 7, 8, 10, 15],
    'algorithm': ['ball_tree', 'kd_tree', 'brute', 'auto'],
    'metric': ['minkowski', 'euclidean', 'manhattan', 'chebyshev']
}
#Backward-compatible alias for any code still using the old name
param_grid_SVR = param_grid_knn

#Splitting the data into train (70%) and test (30%) sets; random_state fixes the shuffle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)


In [8]:
#Build a 4-neighbour KNN classifier that weights neighbours by distance,
#and fit it on the training split (fit returns the estimator itself)
model = KNeighborsClassifier(weights='distance', n_neighbors=4).fit(x_train, y_train)

#Mean accuracy on the held-out test split (displayed as the cell output)
model.score(x_test, y_test)

0.7916666666666666

In [63]:
#Attempt to improve accuracy with an exhaustive grid search (10-fold CV, all cores).
#The author reports that the hand-picked model above scored higher, so that one is kept.
#To compare on the held-out set, evaluate the grid's score on (x_test, y_test).
kNNModel_grid = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=10,
    n_jobs=-1,
    verbose=1,
)
kNNModel_grid.fit(x_train, y_train)
print(kNNModel_grid.best_estimator_)


Fitting 10 folds for each of 128 candidates, totalling 1280 fits
KNeighborsClassifier(algorithm='ball_tree', metric='manhattan', n_neighbors=2)


In [6]:
#Saving the fitted model to the current directory.
#The context manager closes the file handle (flushing the pickle to disk)
#even if pickling fails, instead of leaving an open handle behind.
with open('final_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

#Predicted labels for the held-out test split
y_pred = model.predict(x_test)

In [7]:
#Evaluation of the test-set predictions: confusion matrix first,
#then the per-class precision / recall / F1 report
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)
print('--------------------')
print(test_report)

[[ 4259     4  1064]
 [   13  7346  1324]
 [ 1607  2238 12145]]
--------------------
              precision    recall  f1-score   support

        Good       0.72      0.80      0.76      5327
        Poor       0.77      0.85      0.80      8683
    Standard       0.84      0.76      0.80     15990

    accuracy                           0.79     30000
   macro avg       0.78      0.80      0.79     30000
weighted avg       0.80      0.79      0.79     30000

