**Importing libraries**

In [11]:
import pandas as pd
import nltk
import re
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
df_train = pd.read_excel('P1_training.xlsx')
df_test = pd.read_excel('P1_testing.xlsx')
sentence=df_train['sentence']
label=df_train['label']
sentence_test=df_test['sentence']
label_test=df_test['label']
print("Training Data")
print(label.value_counts())
print("Testing Data")
print(label_test.value_counts())

Training Data
1    736
2    661
0    263
Name: label, dtype: int64
Testing Data
1    303
2    298
0     82
Name: label, dtype: int64


**Data Preprocessing**

In [14]:
def preprocessor(text):
  review=re.sub('\[[^]]*\]', ' ', text)
  review=re.sub('[^a-zA-z]', ' ', text)
  review=review.lower().split()
  review=[i for i in review if not i in set(stopwords.words('english'))]
  review= ' '.join(review)
  return review
sentence=sentence.apply(preprocessor)
sentence_test=sentence_test.apply(preprocessor)

**Downloading Spacy model**

In [15]:
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 87.5MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=bb4d902c27a4a868f48239a93939f3de6bcea4014b0c5bcf7054db436dc764ba
  Stored in directory: /tmp/pip-ephem-wheel-cache-umd6nwbf/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [16]:
import spacy
import en_core_web_lg
nlp=en_core_web_lg.load() 

**Spacy Word2Vec(with preprocessing)**

In [17]:
def get_vector(x):
    doc=nlp(x)
    vec=doc.vector
    return vec

train_df,test_df=df_train,df_test
train_df['vec']=sentence.apply(lambda x: get_vector(x))
test_df['vec']=sentence_test.apply(lambda x: get_vector(x))
train_X,test_X=train_df['vec'].to_numpy(),test_df['vec'].to_numpy()
train_X,test_X=train_X.reshape(-1,1),test_X.reshape(-1,1)
train_X=np.concatenate(np.concatenate(train_X,axis=0),axis=0).reshape(-1,300)
test_X=np.concatenate(np.concatenate(test_X,axis=0),axis=0).reshape(-1,300)

In [18]:
clf = LogisticRegression(random_state=0,max_iter=500)
clf.fit(train_X,label)
predictions1=clf.predict(test_X)
print (classification_report(label_test,predictions1))
print("Accuracy is",accuracy_score(predictions1,label_test))

              precision    recall  f1-score   support

           0       0.29      0.15      0.19        82
           1       0.60      0.69      0.64       303
           2       0.65      0.63      0.64       298

    accuracy                           0.60       683
   macro avg       0.51      0.49      0.49       683
weighted avg       0.58      0.60      0.59       683

Accuracy is 0.6002928257686676


In [19]:
clf=LinearSVC()
clf.fit(train_X,label)
predictions1=clf.predict(test_X)
print (classification_report(label_test,predictions1))
print("Accuracy is",accuracy_score(predictions1,label_test))

              precision    recall  f1-score   support

           0       0.28      0.15      0.19        82
           1       0.58      0.67      0.62       303
           2       0.64      0.63      0.64       298

    accuracy                           0.59       683
   macro avg       0.50      0.48      0.48       683
weighted avg       0.57      0.59      0.58       683

Accuracy is 0.5885797950219619


**Spacy Word2Vec(without preprocessing)**

In [20]:
def get_vector(x):
    doc=nlp(x)
    vec=doc.vector
    return vec

train_df,test_df=df_train,df_test
train_df['vec']=train_df['sentence'].apply(lambda x: get_vector(x))
test_df['vec']=test_df['sentence'].apply(lambda x: get_vector(x))
train_X,test_X=train_df['vec'].to_numpy(),test_df['vec'].to_numpy()
train_X,test_X=train_X.reshape(-1,1),test_X.reshape(-1,1)
train_X=np.concatenate(np.concatenate(train_X,axis=0),axis=0).reshape(-1,300)
test_X=np.concatenate(np.concatenate(test_X,axis=0),axis=0).reshape(-1,300)

In [21]:
clf=LinearSVC()
clf.fit(train_X,label)
predictions=clf.predict(test_X)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.18      0.09      0.12        82
           1       0.61      0.68      0.64       303
           2       0.66      0.67      0.66       298

    accuracy                           0.61       683
   macro avg       0.48      0.48      0.47       683
weighted avg       0.58      0.61      0.59       683

Accuracy is 0.6061493411420205


In [22]:
clf = LogisticRegression(random_state=0,max_iter=500)
clf.fit(train_X,label)
predictions=clf.predict(test_X)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.23      0.09      0.12        82
           1       0.60      0.73      0.66       303
           2       0.68      0.66      0.67       298

    accuracy                           0.62       683
   macro avg       0.51      0.49      0.48       683
weighted avg       0.59      0.62      0.60       683

Accuracy is 0.6193265007320644


In [23]:
df_final = pd.read_excel('P1_testing.xlsx')
df_final['predicted_label']=predictions
df_final=df_final.rename(columns={"label": "gold_label"})
df_final

Unnamed: 0,sentence,gold_label,predicted_label
0,even if the whole thing proves to be a creativ...,2,2
1,", but isn't quite sure how to handle "" sam dee...",1,1
2,ruby's close friend gretchen ( cuz ya can't ha...,2,1
3,happy accidents is a romantic comedy filtered ...,2,2
4,"the film stars thandie newton , who was robbed...",2,2
...,...,...,...
678,"somehow , with a considerable suspension of di...",1,2
679,"occasionally , the violence is slightly uncomf...",0,0
680,what is perhaps most sensational about gods an...,2,2
681,he earned that nomination with his touching pe...,2,2


**Converting DF to CSV**

In [24]:
df_final.to_csv('test_output_proposedsolution.csv')
files.download('test_output_proposedsolution.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>