# Import library

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

# Read data

In [2]:
data = pd.read_csv('labeled_data.csv', engine='python')
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finished watching Francis Ford Coppo...,"['past', 'PM', 'finished', 'watching', 'Franci...",negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,"['probably', 'go', 'bed']",neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone writing line,"['early', 'enough', 'postpone', 'writing', 'li...",positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",seen three time opportunity sharing thought ...,"['seen', 'three', 'time', 'opportunity', 'shar...",positive


# Extract data

In [3]:
data = data[['lemma','lemma_words','polarity']]
data

Unnamed: 0,lemma,lemma_words,polarity
0,past PM finished watching Francis Ford Coppo...,"['past', 'PM', 'finished', 'watching', 'Franci...",negative
1,probably go bed,"['probably', 'go', 'bed']",neutral
2,late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,early enough postpone writing line,"['early', 'enough', 'postpone', 'writing', 'li...",positive
4,seen three time opportunity sharing thought ...,"['seen', 'three', 'time', 'opportunity', 'shar...",positive
...,...,...,...
172697,watched film wanted see bad film expentance ...,"['watched', 'film', 'wanted', 'see', 'bad', 'f...",negative
172698,everyone want really good film good actor hi...,"['everyone', 'want', 'really', 'good', 'film',...",positive
172699,troll film look really funny scene remembere...,"['troll', 'film', 'look', 'really', 'funny', '...",positive
172700,horror movie never scared film,"['horror', 'movie', 'never', 'scared', 'film']",negative


# Sampling data

In [4]:
data_each_polarity = 40000

In [5]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,lemma_words,polarity
92477,want damn good touching movie go see America...,"['want', 'damn', 'good', 'touching', 'movie', ...",positive
48381,unique expressive direction work perfectly p...,"['unique', 'expressive', 'direction', 'work', ...",positive
134557,disappointed guess well DBZ fan see made her...,"['disappointed', 'guess', 'well', 'DBZ', 'fan'...",positive
37849,music intense worked great amazing totally a...,"['music', 'intense', 'worked', 'great', 'amazi...",positive
6665,friend kept telling watch thought brilliant,"['friend', 'kept', 'telling', 'watch', 'though...",positive
...,...,...,...
2486,movie love admire say Godfather favourite mo...,"['movie', 'love', 'admire', 'say', 'Godfather'...",positive
27866,hacker awakens reality join rebellion machin...,"['hacker', 'awakens', 'reality', 'join', 'rebe...",positive
60857,Scorsese brings list cast comprising Damon D...,"['Scorsese', 'brings', 'list', 'cast', 'compri...",positive
114560,well expected,"['well', 'expected']",positive


In [6]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,lemma_words,polarity
162042,however high budget film singer acting exper...,"['however', 'high', 'budget', 'film', 'singer'...",negative
86783,bit light hearted relief every often especia...,"['bit', 'light', 'hearted', 'relief', 'every',...",negative
121951,Everyone else worked movie bow head Director...,"['Everyone', 'else', 'worked', 'movie', 'bow',...",negative
1630,irony American Dream man parodied capitalism...,"['irony', 'American', 'Dream', 'man', 'parodie...",negative
147974,usually ill laugh pretty much anything movie...,"['usually', 'ill', 'laugh', 'pretty', 'much', ...",negative
...,...,...,...
38114,Although constantly mope depression tried in...,"['Although', 'constantly', 'mope', 'depression...",negative
41619,dream movie orderly,"['dream', 'movie', 'orderly']",negative
86623,Gandalf gang still trying defeat wrath Sauro...,"['Gandalf', 'gang', 'still', 'trying', 'defeat...",negative
86534,something strangely knowing final scene One ...,"['something', 'strangely', 'knowing', 'final',...",negative


# Label encoding

In [7]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data = data.sample(frac=1)
data

Unnamed: 0,lemma,lemma_words,polarity
70045,Sadly cat smart enough split halfway nightma...,"['Sadly', 'cat', 'smart', 'enough', 'split', '...",0
55273,Walker forced slimy squirming centipede like...,"['Walker', 'forced', 'slimy', 'squirming', 'ce...",0
65720,felt sickened everything rarely hated charac...,"['felt', 'sickened', 'everything', 'rarely', '...",0
5770,waste time,"['waste', 'time']",1
68663,evidence gathered point think reach guilty v...,"['evidence', 'gathered', 'point', 'think', 're...",0
...,...,...,...
10250,Forrest gump one best movie ever seen Tom Ha...,"['Forrest', 'gump', 'one', 'best', 'movie', 'e...",1
24926,say film solely grown men punching one anoth...,"['say', 'film', 'solely', 'grown', 'men', 'pun...",1
70331,regret watching Radhe Superb Movie Excellent...,"['regret', 'watching', 'Radhe', 'Superb', 'Mov...",0
41180,Anthony Hopkin performance movie value whats...,"['Anthony', 'Hopkin', 'performance', 'movie', ...",0


# Train data and benchmark

In [8]:
def fit_and_benchmark(model, X_train,X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(classification_report(y_pred,y_test))
  print("Accuracy:",accuracy_score(y_pred, y_test))

In [9]:
rfclf = RandomForestClassifier()
xgb_clf= GradientBoostingClassifier()
lgr = LogisticRegression(max_iter=1000)

In [10]:
target = data['polarity']
lemma_data = data['lemma']
lemma_words_data = data['lemma_words'].apply(lambda x: x[1:-1].split(','))

# Tf-idf

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(lemma_data)
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42)

In [12]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      8014
           1       0.84      0.84      0.84      7986

    accuracy                           0.84     16000
   macro avg       0.84      0.84      0.84     16000
weighted avg       0.84      0.84      0.84     16000

Accuracy: 0.839625


In [13]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.91      0.66      0.76     11080
           1       0.52      0.86      0.65      4920

    accuracy                           0.72     16000
   macro avg       0.72      0.76      0.71     16000
weighted avg       0.79      0.72      0.73     16000

Accuracy: 0.71725


In [14]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      8177
           1       0.88      0.90      0.89      7823

    accuracy                           0.89     16000
   macro avg       0.89      0.89      0.89     16000
weighted avg       0.89      0.89      0.89     16000

Accuracy: 0.8884375


# Doc2vec

In [15]:
vector_size = 128
training_data = [TaggedDocument(lemma_words_data[i], [i]) for i in range(len(lemma_words_data))]
model = Doc2Vec(min_count=1, vector_size=vector_size)
model.build_vocab(training_data)
model.train(training_data, total_examples=model.corpus_count, epochs=model.epochs)

In [16]:
X = np.ones((80000, vector_size))
for i in range(80000):
    X[i] = model[i]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, target, test_size=0.2, random_state=42)

In [17]:
fit_and_benchmark(rfclf, X_train, X_test, Y_train, Y_test)

              precision    recall  f1-score   support

           0       0.53      0.49      0.51      8610
           1       0.46      0.50      0.48      7390

    accuracy                           0.50     16000
   macro avg       0.50      0.50      0.49     16000
weighted avg       0.50      0.50      0.50     16000

Accuracy: 0.495625


In [18]:
fit_and_benchmark(xgb_clf, X_train, X_test, Y_train, Y_test)

              precision    recall  f1-score   support

           0       0.50      0.49      0.50      8012
           1       0.50      0.50      0.50      7988

    accuracy                           0.50     16000
   macro avg       0.50      0.50      0.50     16000
weighted avg       0.50      0.50      0.50     16000

Accuracy: 0.497125


In [19]:
fit_and_benchmark(lgr, X_train, X_test, Y_train, Y_test)

              precision    recall  f1-score   support

           0       0.57      0.50      0.53      9061
           1       0.44      0.50      0.47      6939

    accuracy                           0.50     16000
   macro avg       0.50      0.50      0.50     16000
weighted avg       0.51      0.50      0.50     16000

Accuracy: 0.5016875
