In [3]:
import pandas as pd
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# Load heart.csv; the "target" column is the label, every other column is a feature.
heart_df = pd.read_csv("heart.csv")
Y = heart_df["target"]
X = heart_df.drop(columns="target")

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:

# k = 3: fit on the scaled training features, then predict the scaled test features.
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train_scaled, Y_train)
Y_pred = knn_classifier.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

recall score:  0.78125
accuracy score:  0.8524590163934426
precision score:  0.9259259259259259


In [9]:
# k = 5: same experiment as the other fixed-k cells, with five neighbours.
knn_classifier = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, Y_train)
Y_pred = knn_classifier.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

recall score:  0.875
accuracy score:  0.9016393442622951
precision score:  0.9333333333333333


In [12]:
# k = 7: the best scores so far among the fixed-k runs on this data.
# Author's note: this also beats their logistic-regression result (not shown
# in this notebook), but the comparison is based on the test split only.
knn_classifier = KNeighborsClassifier(n_neighbors=7).fit(X_train_scaled, Y_train)
Y_pred = knn_classifier.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

recall score:  0.90625
accuracy score:  0.9180327868852459
precision score:  0.9354838709677419


In [13]:
# k = 9: raising k does not improve the scores every time.
knn_classifier = KNeighborsClassifier(n_neighbors=9).fit(X_train_scaled, Y_train)
Y_pred = knn_classifier.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

recall score:  0.875
accuracy score:  0.9016393442622951
precision score:  0.9333333333333333


In [20]:
# Cross-validation for hyperparameter tuning using GridSearchCV.

from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts, each evaluated with 5-fold cross-validation.
# No `scoring` argument is passed, so the estimator's own default score is used.
param_grid = {"n_neighbors": [3, 5, 7, 9]}
classifier = KNeighborsClassifier()
classifierCV = GridSearchCV(classifier, param_grid, cv=5)

# NOTE(review): the search runs on features that were already scaled with a
# scaler fit on the whole training split, so each validation fold has
# influenced the scaling statistics.
classifierCV.fit(X_train_scaled, Y_train)
Y_pred = classifierCV.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

# Full per-candidate results table, then just the two columns of interest.
res = pd.DataFrame(classifierCV.cv_results_)
print(res)
print(res[["param_n_neighbors", "mean_test_score"]])

# Author's note: this search picked k = 5, and that choice is more trustworthy
# than the fixed-k comparison because it is based on much more data (the
# cross-validation folds), not only on the test split.
print(classifierCV.best_params_)

recall score:  0.875
accuracy score:  0.9016393442622951
precision score:  0.9333333333333333
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.002722      0.000450         0.005320        0.000399   
1       0.007417      0.002505         0.008236        0.001970   
2       0.002029      0.000638         0.007810        0.008298   
3       0.001920      0.000596         0.003399        0.000493   

   param_n_neighbors              params  split0_test_score  \
0                  3  {'n_neighbors': 3}           0.816327   
1                  5  {'n_neighbors': 5}           0.775510   
2                  7  {'n_neighbors': 7}           0.755102   
3                  9  {'n_neighbors': 9}           0.734694   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.795918           0.854167           0.750000           0.812500   
1           0.836735           0.833333           0.770833           0.854167   
2          

In [26]:
# Author's note: with recall as the scoring metric the best k is 7, so the
# choice of scoring metric matters as well.
from sklearn.model_selection import GridSearchCV

# Same candidate grid and 5-fold cross-validation, but candidates are ranked by recall.
param_grid = {"n_neighbors": [3, 5, 7, 9]}
classifier = KNeighborsClassifier()
classifierCV = GridSearchCV(classifier, param_grid, cv=5, scoring="recall")

classifierCV.fit(X_train_scaled, Y_train)
Y_pred = classifierCV.predict(X_test_scaled)

# Report recall, accuracy and precision on the test labels, in that order.
for metric_label, metric_fn in (
    ("recall score: ", recall_score),
    ("accuracy score: ", accuracy_score),
    ("precision score: ", precision_score),
):
    print(metric_label, metric_fn(Y_test, Y_pred))

# Full per-candidate results table, then just the two columns of interest.
res = pd.DataFrame(classifierCV.cv_results_)
print(res)
print(res[["param_n_neighbors", "mean_test_score"]])
print(classifierCV.best_params_)

recall score:  0.90625
accuracy score:  0.9180327868852459
precision score:  0.9354838709677419
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.002252      0.000568         0.007857        0.002221   
1       0.003752      0.002064         0.010815        0.003577   
2       0.002258      0.000525         0.007551        0.001084   
3       0.001209      0.000402         0.005946        0.000986   

   param_n_neighbors              params  split0_test_score  \
0                  3  {'n_neighbors': 3}           0.851852   
1                  5  {'n_neighbors': 5}           0.777778   
2                  7  {'n_neighbors': 7}           0.814815   
3                  9  {'n_neighbors': 9}           0.777778   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.814815           0.962963           0.884615           0.807692   
1           0.814815           0.925926           0.923077           0.846154   
2        

In [31]:
# Pipeline: bundle scaling and KNN into one estimator so the grid search
# handles the scaling itself instead of receiving pre-scaled features.
from sklearn.pipeline import Pipeline

# Re-split the raw (unscaled) features. The training labels are bound to
# Y_train -- the name the fit call below actually uses -- so the labels always
# come from THIS split rather than from a variable left over from an earlier cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = 42
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# The double underscore addresses a parameter hierarchically:
# "knn__n_neighbors" is the n_neighbors hyperparameter of the step named "knn".
param_grid = {"knn__n_neighbors": [3, 5, 7, 9]}

classifierCV = GridSearchCV(
    pipeline,
    param_grid,
    cv = 5,
    scoring="recall"
)

# Pass the raw features for both fitting and prediction: the pipeline scales
# internally, so feeding it pre-scaled data would scale the data twice.
classifierCV.fit(X_train, Y_train)
Y_pred = classifierCV.predict(X_test)

print("recall score: ", recall_score(Y_test, Y_pred))
print("accuracy score: ", accuracy_score(Y_test, Y_pred))
print("precision score: ", precision_score(Y_test, Y_pred))

print(classifierCV.best_params_)

recall score:  0.90625
accuracy score:  0.9180327868852459
precision score:  0.9354838709677419
{'knn__n_neighbors': 7}
