Importing Required libaries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import spacy
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

nltk.download('stopwords')
nltk.download('punkt')
from gensim.utils import simple_preprocess

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading the Dataset from Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reviewing the dataset

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Exam/labeledTrainData.tsv', delimiter='\t')

In [4]:
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [6]:
data.dtypes

id           object
sentiment     int64
review       object
dtype: object

In [7]:
data.shape

(25000, 3)

Checking for missing values

In [8]:
data.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [9]:
data['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [10]:
data.columns

Index(['id', 'sentiment', 'review'], dtype='object')

Removing punctuations

In [16]:
PTR = string.punctuation
def r_pun(text):
    return text.translate(str.maketrans('', '', PTR))


data['review'] = data['review'].apply(lambda text: r_pun(text))

Lowercasing

In [12]:
data['review'] = data['review'].apply(lambda x:x.lower())

Stopword removal

In [17]:
from nltk.corpus import stopwords
s_list = stopwords.words('english')
data['review'] = data['review'].apply(lambda x: [item for item in x.split() if item not in s_list]).apply(lambda x:" ".join(x))

Lemmatization

In [18]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemma.lemmatize(word) for word in text.split()])

data["review"] = data["review"].apply(lambda text: lemmatize_words(text))
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,stuff going moment mj ive started listening mu...
1,2381_9,1,classic war world timothy hines entertaining f...
2,7759_3,0,film start manager nicholas bell giving welcom...
3,3630_4,0,must assumed praised film greatest filmed oper...
4,9495_8,1,superbly trashy wondrously unpretentious 80 ex...


Feature Extraction

In [19]:
x = data.iloc[:,2:3]
x.head()

Unnamed: 0,review
0,stuff going moment mj ive started listening mu...
1,classic war world timothy hines entertaining f...
2,film start manager nicholas bell giving welcom...
3,must assumed praised film greatest filmed oper...
4,superbly trashy wondrously unpretentious 80 ex...


In [20]:
y= data['sentiment']
y.head()

0    1
1    1
2    0
3    0
4    1
Name: sentiment, dtype: int64

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=134)

In [22]:
x_train

Unnamed: 0,review
17272,admit weakness alternate history story wonderf...
11941,sur me lèvres read lip fine little thriller al...
2802,sex youve ever farm animal tag line movie prob...
4511,browsing internet previous sale price ran acro...
7929,nope going get refuse go along program dont su...
...,...
1202,kept attention start finish great performance ...
13687,take special kind person make movie wretched b...
11640,found imdb searched film moved almost tear com...
13862,known fall asleep film usually due combination...


Bag of words

In [23]:
cv = CountVectorizer(stop_words = "english", min_df = 10, max_df=200, max_features = 2000)

In [24]:
x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [26]:
print(cv.vocabulary_)

{'weakness': 1955, 'destiny': 485, 'cox': 405, 'scale': 1544, 'afternoon': 56, 'rental': 1457, 'lip': 1042, 'tense': 1794, 'tight': 1825, 'significant': 1613, 'statement': 1694, 'thirty': 1811, 'secretary': 1559, 'bleak': 178, 'enter': 585, 'vincent': 1928, 'ad': 42, 'assistant': 116, 'meeting': 1115, 'hoped': 854, 'sleazy': 1637, 'letter': 1028, 'discovers': 511, 'spy': 1685, 'join': 964, 'gain': 720, 'freedom': 706, 'seat': 1558, 'unlikely': 1903, 'factor': 637, 'sensitive': 1570, 'hearing': 813, 'aid': 59, 'rank': 1409, 'noir': 1198, 'farm': 649, 'lying': 1072, 'burning': 241, 'unable': 1881, 'fictional': 659, 'conspiracy': 371, 'internet': 921, 'price': 1353, 'ran': 1407, 'shirley': 1599, 'refuse': 1434, 'program': 1369, 'overrated': 1256, 'reader': 1416, '710': 24, 'mainstream': 1083, 'bag': 137, 'thrill': 1816, '25': 19, 'hardcore': 801, 'appropriate': 106, 'critical': 420, 'religion': 1446, 'remaining': 1449, 'acceptable': 32, 'revealed': 1484, 'bbc': 149, 'broadcast': 225, 'bib

In [27]:
import gensim

In [28]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [29]:
story = []
for doc in data['review']: #We sentence tokenize our doc then do preprocessing
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))

Building model

In [30]:
model = gensim.models.Word2Vec(
    window=10,
 min_count=2
)

In [31]:
model.build_vocab(story)

Model training

In [32]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(14153019, 15308715)

Finding the number of model in our vocabulary

In [33]:
len(model.wv.index_to_key)

50069

creating vector to represent our reviews

In [34]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0)

In [35]:
document_vector(data['review'].values[0])

array([ 5.5610485e-02,  7.4281566e-02, -2.6672831e-01, -5.9818757e-01,
        6.3464284e-02, -4.8056713e-01,  4.7066014e-02,  9.9900788e-01,
        7.4374832e-02, -4.2752984e-01, -9.6135929e-02, -1.1916489e-01,
       -6.6675074e-02,  5.5745226e-01, -2.3102356e-01,  1.4081617e-01,
        8.8206249e-01, -2.9603210e-01,  2.7928182e-01, -6.9435292e-01,
        3.4643361e-01,  3.7601117e-02, -1.8025046e-02,  2.6813215e-01,
       -9.8951755e-04,  2.1669044e-01,  1.9991040e-01,  2.8327847e-02,
        4.5916848e-02,  1.0852660e-01,  3.9126477e-01,  1.6770330e-01,
        9.5401861e-02, -7.6610558e-02,  1.2004996e-02, -5.3336840e-02,
        3.5709652e-01, -5.2778780e-01, -2.0269939e-01, -2.8432897e-01,
        1.4868779e-01, -4.6063143e-01, -1.8244542e-01,  3.0377275e-01,
        9.6451439e-02, -4.1390178e-01, -4.2556924e-01, -2.6425353e-01,
       -6.4413689e-02, -3.0852968e-01,  9.9530399e-02, -2.4654415e-01,
       -2.4136969e-01,  2.8875479e-01,  1.1873989e-01, -1.8967451e-01,
      

Model Building and Evaluation

Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
lr= LogisticRegression()

In [38]:
lr.fit(x_train_bow,y_train)
y_pred = lr.predict(x_test_bow)

In [39]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report
accuracy_score(y_test,y_pred)

0.7466

RandomForestClassifier

In [40]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(x_train_bow,y_train)
y_pred = rf.predict(x_test_bow)
accuracy_score(y_test,y_pred)

0.7372

In [41]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix, classification_report

In [42]:
confusion_matrix(y_test,y_pred)

array([[1859,  610],
       [ 704, 1827]])

In [43]:
print('The accuracy score is: ', accuracy_score(y_test,y_pred))

The accuracy score is:  0.7372
