$\color{blue}{\text{Import Libraries}}$

In [None]:
# Always write your Name and Matriculation number here before submitting any task on elearning.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np
%matplotlib inline

$\color{blue}{\text{Get the Data}}$

Set index_col=0 to use the first column as the index.

In [None]:
# Path to the MUT feature table.
DATA_PATH = "/content/MUT_features.csv"

# index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv(DATA_PATH, index_col=0)

In [None]:
# Show the dtype of every column. An `object` dtype in pandas means the column
# holds text or a mix of numeric and non-numeric values.
df.dtypes

In [None]:
# Preview the first rows. Note: .head() is a pandas method and is not available on NumPy arrays.
print(df.head())
# Confirm the type of the object returned by read_csv.
print(type(df))
# Note: each row of the dataset is a single data point for our model.

In [None]:
# Access a single column of the DataFrame by its label.
a = df['Experimental value']
# Access rows with a slice. The slice 1:2 is presumably positional and
# end-exclusive (i.e. one row); widen it to get several rows — TODO confirm for this index.
b = df[1:2]

In [None]:
# Access multiple rows at once with a row slice and print them.
print(df[0:2])

In [None]:
# Boolean mask: True for rows whose 'Experimental value' equals 1.
is_value_one = df['Experimental value'] == 1
# Keep only the rows of the MUT_features dataset selected by the mask.
a1 = df[is_value_one]

In [None]:
# Boolean mask: True for rows whose 'qed' value is below 0.5.
has_low_qed = df['qed'] < 0.5
# Keep only the rows of the MUT_features dataset selected by the mask.
a2 = df[has_low_qed]

In [None]:
# Columns to leave out of the feature table.
non_feature_columns = ['Id', 'CAS', 'SMILES', 'Status', 'Experimental value', 'Predicted value']
# drop() returns a new frame; `df` itself is left untouched.
c = df.drop(columns=non_feature_columns)
print(c)
# Sometimes we also need to add a new column to an existing DataFrame.

$\color{blue}{\text{Standardize the Input Features}}$

The prediction performed by KNN is based on the distance between the point to be predicted and its k nearest neighbours. This distance is naturally affected by the scale of the input features. Therefore, we need to scale all input features so that each one carries the same importance regardless of its original scale.

Here, we will use the method known as standardization.

$\hat{x} = \frac{x-\mu_x}{\sigma_x}$

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the scaler object; it holds no statistics until fit() is called on it.
scaler = StandardScaler()

In [None]:
# Fit the scaler on every column except the six listed here.
excluded_columns = ['Id', 'CAS', 'SMILES', 'Status', 'Experimental value', 'Predicted value']
features_to_fit = df.drop(columns=excluded_columns)
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in this notebook, so test rows influence the scaling statistics.
scaler.fit(features_to_fit)

StandardScaler()

In [None]:
# Apply the fitted scaler to the same column subset it was fitted on.
excluded_columns = ['Id', 'CAS', 'SMILES', 'Status', 'Experimental value', 'Predicted value']
features_to_scale = df.drop(columns=excluded_columns)
# Note: transform() returns an array, so the result has to be converted back
# into a DataFrame if labelled columns are needed.
scaled_features = scaler.transform(features_to_scale)

In [None]:
# Re-attach column names to the scaled array. The names are taken from the same
# column drop that produced the scaler input, rather than from the positional
# slice df.columns[6:], which is only correct when the six dropped columns
# happen to be the first six columns of the CSV.
feature_columns = df.drop(
    ['Id', 'CAS', 'SMILES', 'Status', 'Experimental value', 'Predicted value'],
    axis=1,
).columns
df_feat = pd.DataFrame(scaled_features, columns=feature_columns)

In [None]:
# Preview the first rows of the scaled feature table.
df_feat.head()

$\color{blue}{\text{Train Test Split}}$

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so the metrics and plots below do not change on every re-run.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_features,
    df['Experimental value'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the number of samples in the training split, then in the test split.
for split in (X_train, X_test):
    print(len(split))

$\color{blue}{\text{Using KNN}}$

In [None]:
from sklearn.neighbors import KNeighborsClassifier           # to import regression model replace Classifier by Regressor in this line

In [None]:
# KNN classifier configured with k = 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the classifier on the training split only.
knn.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test features.
pred = knn.predict(X_test)

$\color{blue}{\text{Predictions and Evaluations}}$

Let's evaluate our KNN model!

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error        # to be used for regression model

# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2, where a
# plain import raises ImportError and aborts this whole cell. Fall back to its
# replacement so the name stays usable on both old and new versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Confusion matrix of the true test labels against the k=3 predictions.
print(confusion_matrix(y_test,pred))

In [None]:
# Accuracy of the predictions on the test split.
print(accuracy_score(y_test,pred))

In [None]:
# Precision, then recall, of the predictions on the test split.
print(precision_score(y_test,pred))
print(recall_score(y_test,pred))

In [None]:
# F1 score of the predictions on the test split.
print(f1_score(y_test,pred))


$\color{blue}{\text{The impact of k (Finding optimum value of k)}}$

Let's now study the impact of the chosen number of neighbours on the accuracy of the model.

In [None]:
# Candidate neighbourhood sizes. Only odd values of k are used in this
# classification problem so that the model can always settle on either 0 or 1.
num_ks = [1, 3, 9, 11, 21, 41, 61, 81, 101, 121, 201, 301]

# One entry per k. Note that the *_accuracy lists store the error rate
# (1 - accuracy), not the accuracy itself.
train_accuracy, test_accuracy = [], []
train_precision, test_precision = [], []
train_recall, test_recall = [], []

for k in num_ks:
    # Refit a fresh classifier for this neighbourhood size.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Predictions on the data the model was fitted on and on the held-out data.
    pred_train = knn.predict(X_train)
    pred_test = knn.predict(X_test)

    train_accuracy.append(1 - accuracy_score(y_train, pred_train))
    test_accuracy.append(1 - accuracy_score(y_test, pred_test))
    train_precision.append(precision_score(y_train, pred_train))
    test_precision.append(precision_score(y_test, pred_test))
    train_recall.append(recall_score(y_train, pred_train))
    test_recall.append(recall_score(y_test, pred_test))


In [None]:
# Test-split precision against recall, one marker per value of k.
fig, ax = plt.subplots()
ax.plot(test_precision, test_recall, 'bo--', label='Precision vs Recall trade off')
# Both axes are fixed to [0.6, 0.8]; points outside this window are not shown.
ax.set_xlim([0.6, 0.8])
ax.set_ylim([0.6, 0.8])
ax.set_xlabel('Precision')
ax.set_ylabel('Recall')
ax.legend()
plt.show()

In [None]:
# Error rate (1 - accuracy) on the training and test splits as a function of k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(num_ks, train_accuracy, 'bo--', label='Train')
ax.plot(num_ks, test_accuracy, 'ro--', label='Test')
ax.set_xlabel('K')
ax.set_ylabel('1 - Accuracy')
ax.legend()
plt.show()

This graph is pretty similar to the one that we saw on slide 9 of Lecture 2. Here, we can see the expected general trend of the performance curves. 

Which k do you think is the best?

$\color{blue}{\text{Cross Validation}}$

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated accuracy for one neighbourhood size. The size lives in
# its own variable: assigning it to `num_ks` would replace the list of candidate
# k values with a scalar and break the earlier "vs K" plot cells when re-run.
single_k = 3
knn = KNeighborsClassifier(n_neighbors=single_k)
scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
mean_score = scores.mean()
print(scores)       # accuracy on each of the 5 folds
print(mean_score)   # average over the folds

In [None]:
# Local import keeps this cell runnable on its own.
from sklearn.model_selection import cross_validate

# Candidate neighbourhood sizes. Only odd values of k are used in this
# classification problem so that the model can always settle on either 0 or 1.
num_ks = [1, 3, 9, 11, 21, 41, 61, 81, 101, 121, 201, 301]

# One entry per k. validation_accuracy stores the error rate (1 - mean accuracy);
# the other lists store the mean of the named metric over the 5 folds.
validation_accuracy = []
validation_precision = []
validation_recall = []
validation_f1 = []

for k in num_ks:
    knn = KNeighborsClassifier(n_neighbors=k)
    # A single cross_validate call scores all four metrics on the same fitted
    # folds, instead of refitting the model once per metric with four separate
    # cross_val_score calls.
    cv_results = cross_validate(
        knn,
        X_train,
        y_train,
        cv=5,
        scoring=['accuracy', 'precision', 'recall', 'f1'],
    )
    validation_accuracy.append(1 - cv_results['test_accuracy'].mean())
    validation_precision.append(cv_results['test_precision'].mean())
    validation_recall.append(cv_results['test_recall'].mean())
    validation_f1.append(cv_results['test_f1'].mean())

In [None]:
# Display the four per-k result lists as one tuple: error rate (1 - accuracy),
# precision, recall and F1.
validation_accuracy, validation_precision,validation_recall, validation_f1

$\color{blue}{\text{Finding optimum k using 'cross validation mean accuracy score vs k' diagram}}$


In [None]:
# One figure per cross-validated metric, each plotted against k:
# (values, line format, y-axis label).
metric_panels = [
    (validation_accuracy, 'ro--', '1 - Accuracy'),
    (validation_precision, 'bo--', 'precision'),
    (validation_recall, 'go--', 'recall'),
]

for values, line_format, y_label in metric_panels:
    plt.plot(num_ks, values, line_format, label='Validation')
    plt.xlabel('K')
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Cross-validated precision against recall, one marker per value of k.
plt.figure(figsize=(10,6))
plt.plot(validation_precision, validation_recall, 'ro--', label='Validation')
plt.xlabel('Precision')   # fixed axis-label typo ('Precison')
plt.ylabel('Recall')
plt.title('Precision_recall trade off')
plt.legend()
plt.show()

# Task 1: Description

Please note that every task submission is optional.
1. Follow the same guidelines as discussed today to build a KNN regression model using the dataset uploaded on GitHub under the name 'BCF_features_training.csv'.
2. Include the MSE (mean squared error) as the performance metric and report the optimal k value on the very last line of the code.
3. After finishing your code download it in .ipynb format and upload your .ipynb file on elearning platform
 (Link to upload the files is already active on elearning)
 4. I will evaluate your model on secret (unseen) data that I have kept aside and compute the performance metric on it;
  the best 5 results (optimum k value and MSE) will be given 2% extra marks for the exam.