# Applying Knn on our dataset.

* Step 1: Split the data into training and test data.
* Step 2: Converting the text into vectors(using all four techniques BoW, tfidf, avg word-2 vec and tfidf word-2 vec.
* Step 3: Applying Knn on the training data.
* Step 4: Performing hyper parameter tuning to find the best value of hyperparameter(k).
* Step 5: Now plot Acurracy vs hyperparameters for different value of hyperparamter for crossvalidation and training data.
* Step 6: Now train your model with this value of hyperparameter(k).
* Step 7: Now plot the ROC curve for trainig and test data, for given value of hyperparameter.
* Step 8: Now create confusion matrix.

**Note: Repeat these all Steps for all word embedding technique and then also using kd-tree**

In [35]:
# importing necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
from sklearn.model_selection import KFold 
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix

In [36]:
#reading dataframe
df=pd.read_csv(r'D:\study material\ML\Datasets\amazon-fine-food-reviews\Reviews_cleaned.csv')
df.head()

Unnamed: 0,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Clean_Text,clean_summary
0,B001E4KFG0,A3SGXH7AUHU8GW,1,1,1,1303862400,bought several vitality canned dog food produc...,good quality dog food
1,B00813GRG4,A1D87F6ZCVE5NK,0,0,0,1346976000,product arrived labeled jumbo salted peanut pe...,not advertised
2,B000LQOCH0,ABXLMWJIXXAIN,1,1,1,1219017600,confection around century light pillowy citrus...,delight say
3,B000UA0QIQ,A395BORC6FGVXV,3,3,0,1307923200,looking secret ingredient robitussin believe f...,cough medicine
4,B006K2ZZ7K,A1UQRSCLF8GW1T,0,0,1,1350777600,great taffy great price wide assortment yummy ...,great taffy


In [37]:
#changing the name of column 
df.rename(columns={'Clean_Text':'Text','clean_summary':'Summary'},inplace=True)

In [38]:
df.fillna('',inplace=True)

In [39]:
# Spliting the dataframe into train,and test dataset.
X_train,X_test,y_train,y_test=train_test_split(df['Text'],df['Score'],test_size=0.2,random_state=42)

In [40]:
#Shape of dataframes
print("The shape of the train and test datasets :",X_train.shape,X_test.shape,y_train.shape,y_test.shape)

The shape of the train and test datasets : (291346,) (72837,) (291346,) (72837,)


# 1. Applying Knn with Bow

In [41]:
# applying Bag of words on both test and training data
count_vect=CountVectorizer(ngram_range=(1,2),min_df=10,max_features=10000)
X_train_vect=count_vect.fit_transform(X_train)
X_test_vect=count_vect.transform(X_test)

In [42]:
X_train_vect.shape

(291346, 10000)

In [43]:
X_test_vect.shape

(72837, 10000)

In [44]:
y_train=np.array(y_train).reshape(-1,1)
y_train.shape

(291346, 1)

In [45]:
y_test=np.array(y_test).reshape(-1,1)
y_test.shape

(72837, 1)

In [46]:
X_test_vect.shape

(72837, 10000)

In [47]:
#Applying knn on the data
knn=KNeighborsClassifier(n_neighbors=10,algorithm='brute')

# 2. K-Fold Cross Validation :
* Performing 10-fold cross validation on the training data.

In [48]:
#using cross validation to find the best hyperparameters
acc=[]
kfold=KFold(n_splits=10)
param_k=list(range(1,50,2))
for i in tqdm(param_k):
    knn.n_neighbors=i
    sco=cross_val_score(knn,X_train_vect,y_train.ravel(),cv=kfold,scoring='accuracy')
    acc.append(np.mean(sco))



  0%|                                                                                           | 0/25 [00:00<?, ?it/s]
  4%|███                                                                         | 1/25 [1:08:24<27:21:40, 4104.21s/it]
  8%|██████                                                                      | 2/25 [2:23:39<27:00:33, 4227.56s/it]
 12%|█████████                                                                   | 3/25 [3:46:49<27:13:59, 4456.32s/it]
 16%|████████████▏                                                               | 4/25 [5:02:27<26:08:16, 4480.78s/it]
 20%|███████████████▏                                                            | 5/25 [6:24:21<25:36:53, 4610.67s/it]
 24%|██████████████████▏                                                         | 6/25 [7:49:46<25:08:55, 4765.04s/it]

KeyboardInterrupt: 

In [None]:
#ploting graph between hyperparameter-k and accuracy for each vaue of k.
for i in range(len(acc)):
    acc[i]=(1-acc[i])
    
sns.lineplot(param_k,acc)
plt.xlabel("Hyperparamter value")
plt.ylabel("Error")
plt.grid()
plt.show()


# 3.Grid-Search CV :
* Using Grid-Search CV to find the best hyperparams 

In [None]:
kfold=KFold(n_splits=10)
params={'n_neighbors':list(range(1,50))}
clf=GridSearchCV(knn,params,cv=10)
clf.fit(X_train_vect,y_train.ravel())

In [None]:
clf.best_params_

In [None]:
clf.score(X_test_vect,y_test)

# Conclusion (Hyperparameter Tuning):
* so when use K-Fold Cross validation for finding the best value of hyper_paramter k , we saw that the error is minimum at k=10 or we could say accuracy is maximum at k=10.
* Same way we performed grid search CV on our dataset and it automatically returns the best hyperparamter between the range we have entered.

In [None]:
# now using k=10 to train our model and 
knn.n_neighbors=10
knn.fit(X_train_vect,y_train.ravel())
knn.score(X_test_vect,y_test.ravel())

# ROC curve and AUC :

In [None]:
# Drawing Roc curve for both training and test data 
probs1 = knn.predict_proba(X_test_vect)[:,1]
probs2 = knn.predict_proba(X_train_vect)[:,1]

fpr1, tpr1, threshold = metrics.roc_curve(y_test, probs1)
fpr2, tpr2, threshold = metrics.roc_curve(y_train, probs2)

plt.title('Receiver Operating Characteristic Curve')
plt.plot(fpr1, tpr1, 'b', label = 'Test_data = %0.2f' % roc_auc)
plt.plot(fpr2, tpr2, 'b', label = 'Train_data = %0.2f' % roc_auc,c='r')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'g--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


# Printing Confusion Matrix

In [None]:
y_pred=knn.predict(X_test_vect)
con=confusion_matrix(y_test,y_pred)
con=pd.DataFrame(con,index=['Positive','Negative'],col=['Positive','Negative'])

In [None]:
sns.heatmap(con,annot=True)
plt.title('Confusion Matrix')
plt.xlabel("predicted label")
plt.ylabel("true label")
plt.show()