**Importing Libraries**

In [19]:
import pandas as pd
import nltk
import re
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Reading Data**

In [7]:
df_train = pd.read_excel('P1_training.xlsx')
df_test = pd.read_excel('P1_testing.xlsx')
sentence=df_train['sentence']
label=df_train['label']
sentence_test=df_test['sentence']
label_test=df_test['label']
print("Training Data")
print(label.value_counts())
print("Testing Data")
print(label_test.value_counts())

Training Data
1    736
2    661
0    263
Name: label, dtype: int64
Testing Data
1    303
2    298
0     82
Name: label, dtype: int64


**Preprocessing Data**

In [8]:
def preprocessor(text):
  review=re.sub('\[[^]]*\]', ' ', text)
  review=re.sub('[^a-zA-z]', ' ', text)
  review=review.lower().split()
  review=[i for i in review if not i in set(stopwords.words('english'))]
  review= ' '.join(review)
  return review

In [9]:
sentence=sentence.apply(preprocessor)
sentence_test=sentence_test.apply(preprocessor)

**Tfidf Vectorizer with Classifiers**

In [10]:
# Create feature vectors
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(sentence)
train_vectors_test=vectorizer.transform(sentence_test)



In [11]:
#Linear SVC
clf=LinearSVC()
clf.fit(train_vectors,label)
predictions=clf.predict(train_vectors_test)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.27      0.09      0.13        82
           1       0.57      0.66      0.61       303
           2       0.58      0.59      0.58       298

    accuracy                           0.56       683
   macro avg       0.47      0.44      0.44       683
weighted avg       0.53      0.56      0.54       683

Accuracy is 0.5592972181551976


In [12]:
#Random Forest Classifiers
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(train_vectors,label)
predictions=clf.predict(train_vectors_test)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.25      0.05      0.08        82
           1       0.55      0.71      0.62       303
           2       0.56      0.53      0.55       298

    accuracy                           0.55       683
   macro avg       0.46      0.43      0.42       683
weighted avg       0.52      0.55      0.52       683

Accuracy is 0.5505124450951684


**Count Vectorizer**

In [13]:
vectorizer1 = CountVectorizer()
train_vectors_1 = vectorizer1.fit_transform(sentence)
train_vectors_test_1=vectorizer1.transform(sentence_test)

In [14]:
#Linear SVC
clf=LinearSVC()
clf.fit(train_vectors_1,label)
predictions=clf.predict(train_vectors_test_1)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.22      0.18      0.20        82
           1       0.55      0.60      0.58       303
           2       0.57      0.54      0.56       298

    accuracy                           0.53       683
   macro avg       0.45      0.44      0.44       683
weighted avg       0.52      0.53      0.52       683

Accuracy is 0.527086383601757


In [15]:
#Random Forest Classifiers
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf.fit(train_vectors_1,label)
predictions=clf.predict(train_vectors_test_1)
print (classification_report(label_test,predictions))
print("Accuracy is",accuracy_score(predictions,label_test))

              precision    recall  f1-score   support

           0       0.67      0.05      0.09        82
           1       0.51      0.79      0.62       303
           2       0.59      0.41      0.48       298

    accuracy                           0.53       683
   macro avg       0.59      0.42      0.40       683
weighted avg       0.56      0.53      0.50       683

Accuracy is 0.5344070278184481


**Word2Vec**

In [23]:
porter=LancasterStemmer()
res=[]
for i in sentence:
  res.append(word_tokenize(i))
res_test=[]
for i in sentence_test:
  res_test.append(word_tokenize(i))
for doc in res:
  for i in doc:
    i=porter.stem(i)
for doc in res_test:
  for i in doc:
    i=porter.stem(i)


In [24]:
from gensim.models import word2vec
import warnings
warnings.filterwarnings('ignore')
model=word2vec.Word2Vec(res, min_count=1)
model.train(sentences=res, total_examples=len(res), epochs=100)

answer=[]
for doc in res:
  res1=[]  
  for i in doc:
    word_vec = model[i] 
    doc_vector=word_vec.mean()
    res1.append(doc_vector)
  a=np.array(res1).mean()
  answer.append([a])
data=np.array(answer)


model1=word2vec.Word2Vec(res_test,min_count=1)
model1.train(sentences=res_test, total_examples=len(res_test), epochs=100)
answer=[]
for doc in res_test:
  res1_test=[]  
  for i in doc:
    word_vec = model1[i] 
    doc_vector=word_vec.mean()
    res1_test.append(doc_vector)
  a=np.array(res1_test).mean()
  answer.append([a])
data_test=np.array(answer)


In [25]:
clf=LinearSVC()
clf.fit(data,label)
predictions=clf.predict(data_test)
print("Accuracy is",accuracy_score(predictions,label_test))
print("F1 Score is",f1_score(predictions,label_test,average='weighted'))

Accuracy is 0.49633967789165445
F1 Score is 0.5701196381060829


In [26]:
clf = LogisticRegression(random_state=0,max_iter=500)
clf.fit(data,label)
predictions=clf.predict(data_test)
print("Accuracy is",accuracy_score(predictions,label_test))
print("F1 Score is",f1_score(predictions,label_test,average='weighted'))

Accuracy is 0.4670571010248902
F1 Score is 0.5763144106514149
