# Team names: Mieszko Mirgos, Paweł Gelar, Jakub Kasprzak

Task 2: embeddings

In [10]:
import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
from nltk.corpus import stopwords
import matplotlib.pyplot as plt            # library for visualization
import numpy as np                         # linear algebra
import pandas as pd                        # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
from nltk.classify import SklearnClassifier
from wordcloud import WordCloud,STOPWORDS

In [3]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
print('Dataset size:',df.shape)
df.groupby('sentiment').count()

Dataset size: (50000, 2)


Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


In [7]:
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [11]:
train, test = train_test_split(df,test_size = 0.1)

In [12]:
train['review'] = train['review'].str.replace('<br />','')
test['review'] = test['review'].str.replace('<br />','')

In [17]:
train

Unnamed: 0,review,sentiment
30218,"This movie sucked. The acting sucked, the scri...",0
42532,An excellent documentry. I personally remember...,1
24146,One word: suPURRRRb! I don't think I have see ...,1
29599,How many of us have read a book or seen a play...,1
24682,"Because some people, like me, like to know EVE...",0
...,...,...
27464,"Spoiler!! I love Branagh, love Helena Bonham-C...",0
37851,After a long hard week behind the desk making ...,1
21125,I haven't seen this in over 20yrs but I still ...,1
42860,This movie is pretty predictable nuff said.......,0


In [18]:
nltk.download('stopwords')

stop_words = stopwords.words('english')


def remove_stopwords(text):
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text


train['review'] = train['review'].apply(lambda text: remove_stopwords(text))
test['review'] = test['review'].apply(lambda text: remove_stopwords(text))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
import gensim

sentences = [review.split() for review in train['review'].tolist()]

model = gensim.models.Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [29]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_sentence_vector(review, model):
  vector = np.zeros(model.vector_size)
  count = 0
  for word in review.split():
    if word in model.wv:
      vector += model.wv[word]
      count += 1
  if count > 0:
    vector /= count
  return vector

train_vectors = [get_sentence_vector(review, model) for review in train['review'].tolist()]
test_vectors = [get_sentence_vector(review, model) for review in test['review'].tolist()]


clf = ExtraTreesClassifier()
clf.fit(train_vectors, train['sentiment'])

predictions = clf.predict(test_vectors)

accuracy_et = accuracy_score(test['sentiment'], predictions)
precision_et = precision_score(test['sentiment'], predictions)
recall_et = recall_score(test['sentiment'], predictions)
f1_et = f1_score(test['sentiment'], predictions)

print(f"Accuracy: {accuracy_et}")
print(f"Precision: {precision_et}")
print(f"Recall: {recall_et}")
print(f"F1-score: {f1_et}")

Accuracy: 0.8028
Precision: 0.785823754789272
Recall: 0.827683615819209
F1-score: 0.8062106918238994


In [32]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(train_vectors, train['sentiment'])

predictions = clf.predict(test_vectors)

accuracy_rf = accuracy_score(test['sentiment'], predictions)
precision_rf = precision_score(test['sentiment'], predictions)
recall_rf = recall_score(test['sentiment'], predictions)
f1_rf = f1_score(test['sentiment'], predictions)

print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1-score: {f1_rf}")

Accuracy: 0.8046
Precision: 0.7828872973991707
Recall: 0.8381759483454398
F1-score: 0.8095887741181056


In [33]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

clf = xgb.XGBClassifier()
clf.fit(train_vectors, train['sentiment'])

predictions = clf.predict(test_vectors)

accuracy_xgb = accuracy_score(test['sentiment'], predictions)
precision_xgb = precision_score(test['sentiment'], predictions)
recall_xgb = recall_score(test['sentiment'], predictions)
f1_xgb = f1_score(test['sentiment'], predictions)

print(f"Accuracy: {accuracy_xgb}")
print(f"Precision: {precision_xgb}")
print(f"Recall: {recall_xgb}")
print(f"F1-score: {f1_xgb}")

Accuracy: 0.816
Precision: 0.8005401234567902
Recall: 0.8373688458434221
F1-score: 0.8185404339250493


In [34]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(train_vectors, train['sentiment'])

predictions = clf.predict(test_vectors)

accuracy_svm = accuracy_score(test['sentiment'], predictions)
precision_svm = precision_score(test['sentiment'], predictions)
recall_svm = recall_score(test['sentiment'], predictions)
f1_svm = f1_score(test['sentiment'], predictions)

print(f"Accuracy: {accuracy_svm}")
print(f"Precision: {precision_svm}")
print(f"Recall: {recall_svm}")
print(f"F1-score: {f1_svm}")

Accuracy: 0.8362
Precision: 0.8169659915934276
Recall: 0.8627925746569814
F1-score: 0.8392541707556428


In [35]:
from tabulate import tabulate

data = [
    ["Extra Trees Classifier", accuracy_et, precision_et, recall_et, f1_et],
    ["Random Forest Classifier", accuracy_rf, precision_rf, recall_rf, f1_rf],
    ["XGBoost Classifier", accuracy_xgb, precision_xgb, recall_xgb, f1_xgb],
    ["Support Vector Machine", accuracy_svm, precision_svm, recall_svm, f1_svm],
]

headers = ["Model", "Accuracy", "Precision", "Recall", "F1-score"]

print(tabulate(data, headers=headers, tablefmt="grid"))

+--------------------------+------------+-------------+----------+------------+
| Model                    |   Accuracy |   Precision |   Recall |   F1-score |
| Extra Trees Classifier   |     0.8028 |    0.785824 | 0.827684 |   0.806211 |
+--------------------------+------------+-------------+----------+------------+
| Random Forest Classifier |     0.8046 |    0.782887 | 0.838176 |   0.809589 |
+--------------------------+------------+-------------+----------+------------+
| XGBoost Classifier       |     0.816  |    0.80054  | 0.837369 |   0.81854  |
+--------------------------+------------+-------------+----------+------------+
| Support Vector Machine   |     0.8362 |    0.816966 | 0.862793 |   0.839254 |
+--------------------------+------------+-------------+----------+------------+
