# Import library

In [1]:
!pip install "gensim==3.8.3"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py) ... [?25l[?25hdone
  Created wheel for gensim: filename=gensim-3.8.3-cp39-cp39-linux_x86_64.whl size=26528029 sha256=fe698dc7fcdd79dba83a710368f481782c623da5597c15bd5bd01b5d6d294132
  Stored in directory: /root/.cache/pip/wheels/ca/5d/af/618594ec2f28608c1d6ee7d2b7e95a3e9b06551e3b80a491d6
Successfully built gensim
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.1
    Uninstalling gensim-4.3.1:
      Successfully uninstalled gensim-4.3.1
Successfully installed gensim-3.8.3


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
from gensim.test.utils import common_texts
from gensim.sklearn_api import D2VTransformer

# Read data

In [3]:
data = pd.read_csv('labeled_data.csv', engine='python')
data.head()

Unnamed: 0,review,cleaned_review,pos_tagged,lemma,lemma_words,polarity
0,It is now past 1 PM and I just finished watchi...,It is now past PM and I just finished watching...,"[('past', 'a'), ('PM', 'n'), ('finished', 'v')...",past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,I should probably go to bed.,I should probably go to bed,"[('probably', 'r'), ('go', 'v'), ('bed', 'v')]",probably go bed,"['probably', 'go', 'bed']",neutral
2,It's late and tomorrow I have to wake up a bit...,It s late and tomorrow I have to wake up a bit...,"[('late', 'a'), ('tomorrow', 'n'), ('wake', 'v...",late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,But not early enough to postpone writing these...,But not early enough to postpone writing these...,"[('early', 'r'), ('enough', 'r'), ('postpone',...",early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,"Now that I have seen it three times, the oppor...",Now that I have seen it three times the opport...,"[('seen', 'v'), ('three', None), ('times', 'v'...",see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive


# Extract data

In [4]:
data = data[['lemma','lemma_words','polarity']]
data

Unnamed: 0,lemma,lemma_words,polarity
0,past PM finish watch Francis Ford Coppola Go...,"['past', 'PM', 'finish', 'watch', 'Francis', '...",negative
1,probably go bed,"['probably', 'go', 'bed']",neutral
2,late tomorrow wake bit early,"['late', 'tomorrow', 'wake', 'bit', 'early']",neutral
3,early enough postpone write line,"['early', 'enough', 'postpone', 'write', 'line']",positive
4,see three time opportunity share thought ref...,"['see', 'three', 'time', 'opportunity', 'share...",positive
...,...,...,...
173038,watch film want see bad film expentance Trol...,"['watch', 'film', 'want', 'see', 'bad', 'film'...",negative
173039,everyone want really good film good actor hi...,"['everyone', 'want', 'really', 'good', 'film',...",positive
173040,troll film look really funny scene remember ...,"['troll', 'film', 'look', 'really', 'funny', '...",positive
173041,horror movie never scar film,"['horror', 'movie', 'never', 'scar', 'film']",negative


# Sampling data

In [5]:
data_each_polarity = 40000

In [6]:
pos_data = data.loc[data['polarity'] == 'positive']
pos_data = pos_data.sample(n=data_each_polarity)
pos_data

Unnamed: 0,lemma,lemma_words,polarity
142464,one enjoy two terrorist send board ship seem...,"['one', 'enjoy', 'two', 'terrorist', 'send', '...",positive
55408,scene exist Tarantino make participant speak...,"['scene', 'exist', 'Tarantino', 'make', 'parti...",positive
12015,part comedy superbly marvelous love famous w...,"['part', 'comedy', 'superbly', 'marvelous', 'l...",positive
161520,overall verdict okay film best,"['overall', 'verdict', 'okay', 'film', 'best']",positive
76070,direction stylish time always appropriate,"['direction', 'stylish', 'time', 'always', 'ap...",positive
...,...,...,...
105807,However truly think hold gunpoint star mess,"['However', 'truly', 'think', 'hold', 'gunpoin...",positive
119604,know even dare watch watch rubbish trailer s...,"['know', 'even', 'dare', 'watch', 'watch', 'ru...",positive
134392,Goku love fight,"['Goku', 'love', 'fight']",positive
155676,Hollywood yet,"['Hollywood', 'yet']",positive


In [7]:
neg_data = data.loc[data['polarity'] == 'negative']
neg_data = neg_data.sample(n=data_each_polarity)
neg_data

Unnamed: 0,lemma,lemma_words,polarity
148264,Hell CArmen Electra MOvie,"['Hell', 'CArmen', 'Electra', 'MOvie']",negative
71182,fact everyone put excellent performance alwa...,"['fact', 'everyone', 'put', 'excellent', 'perf...",negative
80717,short Ted Levine rob blind portrayal charact...,"['short', 'Ted', 'Levine', 'rob', 'blind', 'po...",negative
21549,felt cheat bad possible way,"['felt', 'cheat', 'bad', 'possible', 'way']",negative
172961,Rent one day friend start rip away,"['Rent', 'one', 'day', 'friend', 'start', 'rip...",negative
...,...,...,...
15572,know go next crazy also charismatic point co...,"['know', 'go', 'next', 'crazy', 'also', 'chari...",negative
112020,OK see film care Kirk Cameron think anything...,"['OK', 'see', 'film', 'care', 'Kirk', 'Cameron...",negative
158554,give Milkshake montage start film,"['give', 'Milkshake', 'montage', 'start', 'film']",negative
24373,Sorry bother little one refuse validity twis...,"['Sorry', 'bother', 'little', 'one', 'refuse',...",negative


# Label encoding

In [8]:
data = pd.concat([pos_data, neg_data], ignore_index=True)
data = data.replace({'polarity': {'negative': 0, 'positive': 1}})
data

Unnamed: 0,lemma,lemma_words,polarity
0,one enjoy two terrorist send board ship seem...,"['one', 'enjoy', 'two', 'terrorist', 'send', '...",1
1,scene exist Tarantino make participant speak...,"['scene', 'exist', 'Tarantino', 'make', 'parti...",1
2,part comedy superbly marvelous love famous w...,"['part', 'comedy', 'superbly', 'marvelous', 'l...",1
3,overall verdict okay film best,"['overall', 'verdict', 'okay', 'film', 'best']",1
4,direction stylish time always appropriate,"['direction', 'stylish', 'time', 'always', 'ap...",1
...,...,...,...
79995,know go next crazy also charismatic point co...,"['know', 'go', 'next', 'crazy', 'also', 'chari...",0
79996,OK see film care Kirk Cameron think anything...,"['OK', 'see', 'film', 'care', 'Kirk', 'Cameron...",0
79997,give Milkshake montage start film,"['give', 'Milkshake', 'montage', 'start', 'film']",0
79998,Sorry bother little one refuse validity twis...,"['Sorry', 'bother', 'little', 'one', 'refuse',...",0


# Transform data with transformer

In [9]:
def transform_and_split(model, data, target):
  X = model.fit_transform(data)
  X_train, X_test, y_train, y_test = train_test_split(
    X, target, test_size=0.2, random_state=42
  )
  return X_train, X_test, y_train, y_test

# Train data and benchmark

In [10]:
def fit_and_benchmark(model, X_train,X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  print(classification_report(y_pred,y_test))
  print("Accuracy:",accuracy_score(y_pred, y_test))

In [11]:
vectorizer = TfidfVectorizer()
doc2vec = D2VTransformer(min_count=1, size=300)

In [12]:
rfclf = RandomForestClassifier()
xgb_clf= GradientBoostingClassifier()
lgr = LogisticRegression(max_iter=1000)

In [13]:
target = data['polarity']
lemma_data = data['lemma']
lemma_words_data = data['lemma_words'].apply(lambda x: x[1:-1].split(','))

# Tf-idf

In [14]:
X_train, X_test, y_train, y_test = transform_and_split(vectorizer ,lemma_data, target)

In [15]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      8019
           1       0.85      0.85      0.85      7981

    accuracy                           0.85     16000
   macro avg       0.85      0.85      0.85     16000
weighted avg       0.85      0.85      0.85     16000

Accuracy: 0.854375


In [16]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.90      0.68      0.78     10652
           1       0.57      0.85      0.69      5348

    accuracy                           0.74     16000
   macro avg       0.74      0.77      0.73     16000
weighted avg       0.79      0.74      0.75     16000

Accuracy: 0.7380625


In [17]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      8199
           1       0.89      0.91      0.90      7801

    accuracy                           0.90     16000
   macro avg       0.90      0.90      0.90     16000
weighted avg       0.90      0.90      0.90     16000

Accuracy: 0.89875


# Doc to vec

In [18]:
X_train, X_test, y_train, y_test = transform_and_split(doc2vec ,lemma_words_data, target)

In [19]:
fit_and_benchmark(rfclf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.72      0.62      0.66      9408
           1       0.55      0.66      0.60      6592

    accuracy                           0.63     16000
   macro avg       0.63      0.64      0.63     16000
weighted avg       0.65      0.63      0.64     16000

Accuracy: 0.6346875


In [20]:
fit_and_benchmark(xgb_clf, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.75      0.62      0.68      9690
           1       0.54      0.68      0.60      6310

    accuracy                           0.64     16000
   macro avg       0.64      0.65      0.64     16000
weighted avg       0.66      0.64      0.65     16000

Accuracy: 0.6420625


In [21]:
fit_and_benchmark(lgr, X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

           0       0.65      0.63      0.64      8307
           1       0.61      0.64      0.63      7693

    accuracy                           0.63     16000
   macro avg       0.63      0.63      0.63     16000
weighted avg       0.63      0.63      0.63     16000

Accuracy: 0.633
