<a href="https://colab.research.google.com/github/elooo3/Masters-NLP---B620035/blob/main/KNN_ISEAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing dependencies

In [None]:
# importing libraries to be used for performing set tasks
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re # for simplifying the sentences
import nltk # for downloading ensemble of stop words
nltk.download('wordnet')
nltk.download('stopwords') # now, download stop words
from nltk.corpus import stopwords # import stop words into notebook
from nltk.stem.porter import PorterStemmer # import class to be used in performing stemming 
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\obemb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\obemb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the 'isear' worksheet of the workbook into a DataFrame.
# The later cells read its 'Review' (text) and 'Label' (target) columns.
dataset = pd.read_excel(
    'Dataset.xlsx',
    sheet_name='isear',
)

# Text Preprocessing


In [None]:
from nltk.corpus import stopwords

# English stop-word list provided by NLTK.
# NOTE(review): the cleaning cell below builds its own `all_stopwords` list and
# does not read `stop_words`, so this variable looks unused — confirm before removing.
stop_words = stopwords.words('english')

In [None]:
# Words that NLTK lists as stop words but that are deliberately kept in the text
# (negations, auxiliaries, question words) — presumably because they change the
# emotional meaning of a sentence.
unwanted_num = {'not','is','but','why','before','again','how','more','most','no','don','will','wouldn','against','aren','couldn','didn','doesn','hadn','hasn','haven','isn','wasn','weren'}

# Stop words that are actually removed. Built once, outside the loop, instead of
# once per sentence; the list keeps its original name and a set gives O(1) lookups.
all_stopwords = [ele for ele in stopwords.words('english') if ele not in unwanted_num]
stopword_lookup = set(all_stopwords)

# One lemmatizer reused for every sentence. The PorterStemmer that used to be
# created here was never applied to the text, so it has been dropped.
lemma = WordNetLemmatizer()

corpus = []  # cleaned sentences, in the same order as the rows of `dataset`

# Iterate over every row of the 'Review' column instead of a hard-coded count (7511),
# so the corpus always has exactly one entry per row and stays aligned with the labels.
for raw_text in dataset['Review']:
    # replace every character that is not an ASCII letter with a space,
    # lowercase the result and split it into words on whitespace
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()

    # drop the stop words and lemmatize what is left
    # (lemmatize() is called without a POS tag, i.e. with the library default)
    kept_words = [lemma.lemmatize(word) for word in tokens if word not in stopword_lookup]

    # join the words back into a single space-separated sentence
    corpus.append(' '.join(kept_words))

# Feature Engineering/Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer #import tfidf vectorizer for tokenization

# TF-IDF vectoriser; smooth_idf=False turns off the add-one smoothing of document frequencies
tfid = TfidfVectorizer(smooth_idf=False)

# Learn the vocabulary/IDF weights from the cleaned corpus and build a dense
# document-term matrix (one row per sentence, one column per vocabulary term).
# NOTE(review): the vectoriser is fitted on the whole corpus before the train/test
# split, so IDF statistics from the test rows leak into the features — consider
# fitting on the training sentences only.
X = tfid.fit_transform(corpus).toarray()

# Target variable (emotion label) as a 1-D array of shape (n_samples,).
# Selecting the column by name instead of with a one-element list avoids the
# (n_samples, 1) column vector that scikit-learn estimators warn about on fit.
y = dataset['Label'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows as the test set (75:25 split);
# the fixed random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Model Fitting and Tuning

In [None]:
# importing necessary libraries
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# First tuning pass for k-NN: search over the neighbour-search algorithm and k.
knn = KNeighborsClassifier()

# NOTE(review): 'ball_tree', 'kd_tree' and 'brute' are different ways of finding the
# same nearest neighbours, so this axis is expected to change runtime rather than
# accuracy — confirm before spending three times the fits on it.
param_grid = dict(
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    n_neighbors=[3, 5, 7, 9, 10, 11, 12, 13],
)

# Cross-validated search over every combination in the grid, run on 2 workers
gridsearch = GridSearchCV(knn, param_grid, verbose=3, n_jobs=2)
gridsearch.fit(X_train, y_train)

# Best parameter combination found by the search (displayed as the cell output)
gridsearch.best_params_

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  return self._fit(X, y)


{'algorithm': 'ball_tree', 'n_neighbors': 13}

In [None]:
# Fit k-NN with the parameters chosen by the first grid search
# (ball_tree, 13 neighbours) on the training split ...
knn = KNeighborsClassifier(n_neighbors=13, algorithm='ball_tree', n_jobs=3).fit(X_train, y_train)

# ... and report its mean accuracy on that same training split
knn.score(X_train, y_train)

  return self._fit(X, y)


0.5625776673175927

In [None]:
# Second tuning pass: keep ball_tree fixed, search over the vote weighting and larger k.
knn = KNeighborsClassifier(algorithm='ball_tree')

# NOTE(review): if the best n_neighbors comes out as the largest value in this list
# (45), the optimum may lie beyond the grid — consider extending the range.
param_grid = dict(
    weights=['uniform', 'distance'],
    n_neighbors=[13, 15, 20, 25, 30, 45],
)

# Cross-validated search over every combination in the grid, run on 3 workers
gridsearch = GridSearchCV(knn, param_grid, verbose=3, n_jobs=3)
gridsearch.fit(X_train, y_train)

# Best parameter combination found by the search (displayed as the cell output)
gridsearch.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  return self._fit(X, y)


{'n_neighbors': 45, 'weights': 'distance'}

In [None]:
# Refit k-NN on the training split with the tuned settings from the second search:
# 45 neighbours, distance-weighted votes, ball_tree search.
knn = KNeighborsClassifier(
    n_neighbors=45,
    weights='distance',
    algorithm='ball_tree',
    n_jobs=3,
)
knn.fit(X_train, y_train)


  return self._fit(X, y)


KNeighborsClassifier(algorithm='ball_tree', n_jobs=3, n_neighbors=45,
                     weights='distance')

In [None]:
# Predict labels for the held-out test split (not the training split) and
# report the accuracy of those predictions against the true test labels.
y_pred = knn.predict(X_test)
print("The accuracy score is : ", accuracy_score(y_test,y_pred))

In [None]:
# Per-class precision, recall, F1 and support for the test-set predictions
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           1       0.50      0.76      0.60       250
           2       0.61      0.63      0.62       280
           3       0.41      0.27      0.32       272
           4       0.44      0.65      0.53       287
           5       0.64      0.41      0.50       280
           6       0.42      0.30      0.35       256
           7       0.41      0.40      0.41       253

    accuracy                           0.49      1878
   macro avg       0.49      0.49      0.48      1878
weighted avg       0.49      0.49      0.48      1878



In [None]:
# Area under the ROC curve on the test split.
# Class-membership probabilities for every test sentence (one column per class):
y_prob_pred = knn.predict_proba(X_test)

# Multi-class AUC computed one-vs-rest and averaged with weights
# proportional to each class's support (displayed as the cell output)
roc_auc_score(
    y_test,
    y_prob_pred,
    multi_class='ovr',
    average='weighted',
)

0.8232898994142814