In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
train = data.iloc[:25000]
test = data.iloc[25000:]

In [6]:
train.shape, test.shape

((25000, 2), (25000, 2))

In [7]:
train.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
# Converting the text to lowercase

train['review'] = train['review'].apply(lambda x: str(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Contractions Expansion

In [11]:
#!pip install contractions

In [12]:
import contractions

In [13]:
contractions_dict = contractions.contractions_dict
contractions_dict

{"ain't": 'are not',
 "aren't": 'are not',
 "can't": 'can not',
 "can't've": 'can not have',
 "'cause": 'because',
 "could've": 'could have',
 "couldn't": 'could not',
 "couldn't've": 'could not have',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "hadn't": 'had not',
 "hadn't've": 'had not have',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he would',
 "he'd've": 'he would have',
 "he'll": 'he will',
 "he'll've": 'he will have',
 "he's": 'he is',
 "how'd": 'how did',
 "how're": 'how are',
 "how'd'y": 'how do you',
 "how'll": 'how will',
 "how's": 'how is',
 "I'd": 'I would',
 "I'd've": 'I would have',
 "I'll": 'I will',
 "I'll've": 'I will have',
 "I'm": 'I am',
 "I've": 'I have',
 "isn't": 'is not',
 "it'd": 'it would',
 "it'd've": 'it would have',
 "it'll": 'it will',
 "it'll've": 'it will have',
 "it's": 'it is',
 "let's": 'let us',
 "ma'am": 'madam',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "mightn't've": 'might no

In [14]:
def contraction_expansion(x):
    
    if type(x) is str:
        
        for key in contractions_dict:
            
            value = contractions_dict[key]
            
            x = x.replace(key, value)
            
        return x
    
    else:
        
        return x

In [15]:
train['review'] = train['review'].apply(lambda x: contraction_expansion(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
train.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


### Removing Emails

In [17]:
def remove_emails(x):
    
    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
    
    return re.sub(email_pattern, '', x)

In [18]:
train['review'] = train['review'].apply(lambda x:remove_emails(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
train.sample(5)

Unnamed: 0,review,sentiment
18706,it is a sad state in corporate hollywood when ...,positive
2163,i'm from belgium and therefore my english writ...,positive
548,if you like to see animals being skinned alive...,negative
2418,the turgid pace of this movie numbs us to any ...,negative
13035,"hynkel, dictator of tomania, is a spoiled chil...",positive


### Removing HTML Tags

In [20]:
train['review'] = train['review'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text().strip())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
train.iloc[6005][0]

'pretty.pretty actresses and actors. pretty bad script. pretty frequent "let us strip to our undies" scenes. pretty fair f/x. pretty jarring location decisions (the college dorm room looks like a high-end hotel room - probably because it was shot at a hotel). pretty bland storyline. pretty awful dialog. pretty locations. pretty annoying editing, unless you like the music video flash-cut style.this one is not a guilty pleasure - this is more an embarrassing one. if you must watch this, pick a good dance/techno album and turn the sound off on the movie - you will see the pretty people in their pretty black undies, and probably follow the story just fine.the cast may be able to act - i doubt that anyone could look skilled given the lines/plot that they had to deal with.'

In [22]:
train.sample(5)

Unnamed: 0,review,sentiment
8140,perhaps once in a generation a film comes alon...,positive
23511,"a long overdue concert release, rush-in-rio dv...",positive
16111,it is really a shame to see so many talented p...,negative
16531,"first of all, i know almost nothing about rugb...",positive
138,i just watched this movie on it is premier nig...,positive


### Removing Special Characters

In [23]:
def RemoveSpecialChars(x):
    
    x = re.sub(r'[^\w ]+', "", x)
    x = ' '.join(x.split())
    return x

In [24]:
train['review'] = train['review'].apply(lambda x: RemoveSpecialChars(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [25]:
train.sample(5)

Unnamed: 0,review,sentiment
8625,the real shame of the gathering is not in the ...,negative
24675,before the internet this movie could never hav...,negative
18145,the show is great no words to describe it wond...,positive
1167,it would be a shame if tommy lee jones and rob...,negative
9861,i love eddie izzard i think this is awesome an...,positive


In [26]:
train.iloc[6005][0]

'prettypretty actresses and actors pretty bad script pretty frequent let us strip to our undies scenes pretty fair fx pretty jarring location decisions the college dorm room looks like a highend hotel room probably because it was shot at a hotel pretty bland storyline pretty awful dialog pretty locations pretty annoying editing unless you like the music video flashcut stylethis one is not a guilty pleasure this is more an embarrassing one if you must watch this pick a good dancetechno album and turn the sound off on the movie you will see the pretty people in their pretty black undies and probably follow the story just finethe cast may be able to act i doubt that anyone could look skilled given the linesplot that they had to deal with'

### Lemmetization

In [27]:
def lemme(x):
    
    x = str(x)
    x_list = []
    doc = nlp(x)
    
    for token in doc:
        lemma = token.lemma_
        
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text
            
        x_list.append(lemma)
        
    return ' '.join(x_list)

In [28]:
%%time
train['review'] = train['review'].apply(lambda x: lemme(x))

Wall time: 14min 8s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
train.sample(5)

Unnamed: 0,review,sentiment
3362,bill and ted are back only this time an evil d...,positive
7208,i use to love this movie as a kid but see it a...,negative
8381,this move is about as bad as they come i was h...,negative
4342,argh this film hurt my head and not in a good ...,negative
8943,un gatto nel cervellocat in the brain is one o...,positive


### Tokenization using Text Blob

### Removing Stop Words

In [30]:
stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [31]:
len(stopwords)

326

In [32]:
def RemoveStopWords(x):
    
    return ' '.join([word for word in x.split() if word not in stopwords])

In [33]:
x = train.iloc[6005][0]

In [34]:
# EXAMPLE CODE

print(x)
print()
print("length of x: ",len(x))

prettypretty actress and actor pretty bad script pretty frequent let us strip to our undie scene pretty fair fx pretty jarring location decision the college dorm room look like a highend hotel room probably because it was shoot at a hotel pretty bland storyline pretty awful dialog pretty location pretty annoying editing unless you like the music video flashcut stylethis one is not a guilty pleasure this is more an embarrassing one if you must watch this pick a good dancetechno album and turn the sound off on the movie you will see the pretty people in their pretty black undie and probably follow the story just finethe cast may be able to act i doubt that anyone could look skilled give the linesplot that they have to deal with

length of x:  735


In [35]:
x1 = RemoveStopWords(x)
x1

'prettypretty actress actor pretty bad script pretty frequent let strip undie scene pretty fair fx pretty jarring location decision college dorm room look like highend hotel room probably shoot hotel pretty bland storyline pretty awful dialog pretty location pretty annoying editing like music video flashcut stylethis guilty pleasure embarrassing watch pick good dancetechno album turn sound movie pretty people pretty black undie probably follow story finethe cast able act doubt look skilled linesplot deal'

In [36]:
len(x1)

508

In [37]:
%%time

train['review'] = train['review'].apply(lambda x: RemoveStopWords(x))

Wall time: 845 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
train.sample(5)

Unnamed: 0,review,sentiment
24126,deborah mess cast grace tolerable film simply ...,negative
18322,spy short dvd good new zealand short great fre...,positive
12797,12 year old arnald hillerman accidentally kill...,positive
7795,spoiler warning contain contain spoiler reader...,negative
14807,surprise enthusiastic response film dull enjoy...,negative


### Removing Rare Words

In [39]:
text = ' '.join(train['review'])

In [40]:
#text

In [41]:
len(text)

17340803

In [42]:
# Creating Frequency

text_series = pd.Series(text.split())

In [43]:
freq_comm = text_series.value_counts()

In [44]:
freq_comm

movie                  49560
film                   45828
like                   21491
good                   20110
time                   14627
character              13753
watch                  13412
bad                    12602
story                  12071
think                  11356
scene                  10437
great                   9684
look                    9490
know                    9231
people                  9051
play                    8350
way                     8291
come                    8215
love                    8201
thing                   7981
find                    7767
end                     7266
man                     6914
life                    6643
plot                    6528
work                    6517
actor                   6489
want                    6410
little                  6282
try                     6135
                       ...  
coffeestaine               1
coopercrossfire            1
singalongsthe              1
checkthank    

In [45]:
rare_words = freq_comm[-82000:-1]
'rockumentarie' in rare_words

True

In [46]:
rare_words

overwho                1
victoriawhat           1
painlessly             1
annoyingwellexcept     1
cigaret                1
grants                 1
humanely               1
citysurprise           1
idiotsthe              1
otheras                1
eversobrilliant        1
brooma                 1
mahatmas               1
seriousross            1
riviere                1
oppositesure           1
mäger                  1
crazyass               1
mordant                1
adolescentsthis        1
reportinstead          1
51ch                   1
glockenspur            1
adaptationall          1
choicebut              1
lineread               1
banzie                 1
revised                1
peakmy                 1
longedward             1
                      ..
bestwhat               1
coffeestaine           1
coopercrossfire        1
singalongsthe          1
checkthank             1
stopping               1
experiencesit          1
intoand                1
ibbs                   1


In [47]:
# Removing 82000 rare occuring words 

train['review'] = train['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in rare_words]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [48]:
train['review'].sample(5)

22396    surprise maker hopeless movie find uk distribu...
16369    oh glorious musical bit miscast frank sinatra ...
18148    opie tom movie look year sorry bad m tell writ...
8705     movie prove judge movie awesome artwork dvd co...
4638     pretty know deep basic story teen find slimy a...
Name: review, dtype: object

### Converting the Data into Vector

In [49]:
train['sentiment'].value_counts()

negative    12526
positive    12474
Name: sentiment, dtype: int64

In [50]:
X = train['review']
y = train['sentiment']

In [51]:
tfidf = TfidfVectorizer()

In [52]:
X = tfidf.fit_transform(X)

In [53]:
X.shape

(25000, 48508)

In [54]:
X

<25000x48508 sparse matrix of type '<class 'numpy.float64'>'
	with 1932173 stored elements in Compressed Sparse Row format>

### Splitting Data into Training and Testing sets

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4, stratify = y)

In [56]:
X_train.shape, X_test.shape

((20000, 48508), (5000, 48508))

### Dimensionality reduction using Truncated Singular Value Decomposition

In [57]:
%%time

#tsvd = TSVD(n_components=10000, random_state=4)
#X_train_tsvd = tsvd.fit_transform(X_train)

Wall time: 0 ns


In [58]:
#sum(tsvd.explained_variance_)

### Using SVC for Classification

In [59]:
#clf_svc = SVC()

In [60]:
%%time

#scores = cross_val_score(clf_svc, X_train, y_train, cv=6, n_jobs=-1)

Wall time: 0 ns


In [61]:
#scores

### Using Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression

In [63]:
clf_lr = LogisticRegression()

In [64]:
X_train

<20000x48508 sparse matrix of type '<class 'numpy.float64'>'
	with 1546890 stored elements in Compressed Sparse Row format>

In [65]:
%%time

scores = cross_val_score(clf_lr, X_train, y_train, cv=10, n_jobs=4)

Wall time: 3.83 s


In [66]:
scores

array([0.87406297, 0.8815    , 0.8805    , 0.8745    , 0.891     ,
       0.8925    , 0.877     , 0.8765    , 0.871     , 0.87243622])

In [67]:
scores.mean()

0.8790999186624797

In [68]:
clf_lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [69]:
y_test_pred = clf_lr.predict(X_test)

In [70]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

    negative       0.89      0.85      0.87      2505
    positive       0.86      0.90      0.88      2495

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [71]:
confusion_matrix(y_test, y_test_pred)

array([[2134,  371],
       [ 261, 2234]], dtype=int64)

In [72]:
clf_lr.predict(tfidf.transform(['American Psycho deserved an Oscar, they were robbed']))

array(['positive'], dtype=object)

In [73]:
y_real_pred = clf_lr.predict(tfidf.transform(test['review']))

In [74]:
print(classification_report(test['sentiment'], y_real_pred))

              precision    recall  f1-score   support

    negative       0.89      0.84      0.86     12474
    positive       0.85      0.90      0.87     12526

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [75]:
clf_lr.predict(tfidf.transform(["What hell was that, it's a masterpiece"]))

array(['positive'], dtype=object)