<a href="https://colab.research.google.com/github/elooo3/Masters-NLP---B620035/blob/main/SVC_ISEAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Importing dependencies**

In [None]:
# importing the libraries used throughout the notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re # regular expressions, used below to strip non-letter characters from the reviews
import nltk # natural language toolkit; provides the stop-word list and the lemmatizer
nltk.download('wordnet')
nltk.download('stopwords') # download the stop-word corpus
from nltk.corpus import stopwords # import stop words into notebook
from nltk.stem.porter import PorterStemmer # class for performing stemming
from nltk.stem import WordNetLemmatizer # class for performing lemmatization

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\obemb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\obemb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# read the 'isear' sheet of the workbook into a DataFrame
dataset = pd.read_excel(
    'Dataset.xlsx',
    sheet_name='isear',
)

In [None]:
from nltk.corpus import stopwords

# list holding every English stop word shipped with NLTK
stop_words = stopwords.words(fileids='english')

# Text Preprocessing

In [None]:
# Clean every review: keep letters only, lowercase, drop stop words, lemmatize.
# The result is `corpus`, a list holding one cleaned string per row of dataset['Review'].
# Only lemmatization is applied; no stemming is performed.

lemma = WordNetLemmatizer() # built once; it does not change between reviews

# words to be removed from the stop-word list so that they stay in the cleaned text
unwanted_num = {'not','is','but','why','before','again','how','more','most','no','don','will','wouldn','against','aren','couldn','didn','doesn','hadn','hasn','haven','isn','wasn','weren'}

# built once as a set: constant-time membership tests instead of rebuilding the set for every word
all_stopwords = set(stopwords.words('english')) - unwanted_num

corpus = [] # will contain all cleaned review data
for raw_review in dataset['Review']: # iterate over the column itself rather than a hard-coded row count
  review = re.sub('[^a-zA-Z]', ' ', raw_review) # replace every character that is not a letter with a space
  review = review.lower().split() # lowercase, then split into individual words
  review = [lemma.lemmatize(word) for word in review if word not in all_stopwords] # drop stop words, lemmatize the rest
  corpus.append(' '.join(review)) # store the cleaned review as a single space-separated string

# Feature Engineering/Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # TF-IDF vectorizer for turning text into numeric features

# create an instance of the TF-IDF vectorizer class (idf smoothing disabled)
tfid = TfidfVectorizer(smooth_idf=False)

# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split,
# so the vocabulary and idf weights also see the rows that later become the test set.
X = tfid.fit_transform(corpus).toarray() # dense document-term matrix, one row per review

# 1-D label vector; selecting with ['Label'] in a list yields an (n, 1) column, which makes
# scikit-learn emit a DataConversionWarning when the classifier is fitted
y = dataset['Label'].values # set target variable as the emotion state

In [None]:
from sklearn.model_selection import train_test_split

# hold out 25% of the rows as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Model fitting

In [None]:
# modelling, evaluation and plotting imports used from this cell onward
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import SVC

sns.set()

# baseline: support vector classifier with default hyperparameters
model = SVC()
model.fit(X_train, y_train)

# accuracy on the data the model was trained on
model.score(X_train, y_train)

  return f(*args, **kwargs)


0.9778093378306408

In [None]:
# predictions for the training rows, then their accuracy against the true labels
y_hat = model.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_hat)

0.9778093378306408

In [None]:
# per-class precision / recall / f1 on the training set
print(classification_report(y_true=y_train, y_pred=y_hat))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99       838
           2       0.98      0.99      0.98       803
           3       0.96      0.98      0.97       812
           4       0.97      0.96      0.97       793
           5       0.99      0.98      0.99       787
           6       0.98      0.97      0.97       793
           7       0.98      0.97      0.97       807

    accuracy                           0.98      5633
   macro avg       0.98      0.98      0.98      5633
weighted avg       0.98      0.98      0.98      5633



In [None]:
# evaluate the fitted model on the held-out test rows
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.63      0.73      0.68       250
           2       0.72      0.69      0.70       280
           3       0.42      0.52      0.47       272
           4       0.75      0.53      0.62       287
           5       0.57      0.61      0.59       280
           6       0.49      0.41      0.45       256
           7       0.47      0.50      0.49       253

    accuracy                           0.57      1878
   macro avg       0.58      0.57      0.57      1878
weighted avg       0.58      0.57      0.57      1878



# **Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# candidate values for the SVC regularisation parameter C
model = SVC()
param_grid = {'C': [0.1, 1, 10, 50, 100, 500]}

# cross-validated search over the grid, run on the training split only
gridsearch = GridSearchCV(
    model,
    param_grid,
    verbose=3,
    n_jobs=3,
)
gridsearch.fit(X_train, y_train)

# best parameters according to the grid search
gridsearch.best_params_

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  return f(*args, **kwargs)


{'C': 50}

In [None]:
# refit with C = 50 and report accuracy on the training set
model = SVC(C=50)
model.fit(X_train, y_train)
model.score(X_train, y_train)

  return f(*args, **kwargs)


0.9987573229185159

In [None]:
# per-class report on the test set for the model fitted in the previous cell
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.66      0.72      0.69       250
           2       0.71      0.71      0.71       280
           3       0.44      0.46      0.45       272
           4       0.67      0.55      0.61       287
           5       0.57      0.62      0.59       280
           6       0.46      0.41      0.44       256
           7       0.47      0.49      0.48       253

    accuracy                           0.57      1878
   macro avg       0.57      0.57      0.57      1878
weighted avg       0.57      0.57      0.57      1878



In [None]:
# refit with a smaller C (0.5, a value that is not part of the searched grid)
# and report accuracy on the training set
model = SVC(C=0.5)
model.fit(X_train, y_train)
model.score(X_train, y_train)

  return f(*args, **kwargs)


0.9039588141310136

In [None]:
# per-class report on the test set for the model fitted in the previous cell
y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.64      0.72      0.68       250
           2       0.74      0.65      0.69       280
           3       0.35      0.68      0.46       272
           4       0.81      0.43      0.56       287
           5       0.57      0.61      0.59       280
           6       0.57      0.35      0.43       256
           7       0.49      0.42      0.45       253

    accuracy                           0.55      1878
   macro avg       0.60      0.55      0.55      1878
weighted avg       0.60      0.55      0.55      1878

