In [1]:
import numpy as np
import pandas as pd

In [20]:
data = pd.read_csv('E:\Datasets/IMDB-Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [6]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [8]:
data.shape

(50000, 2)

In [9]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [10]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [11]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [12]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Text Normalization

**Tokenization**

In [8]:
import seaborn as sns
import matplotlib as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import os
import warnings
warnings.filterwarnings('ignore')

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Farnoush\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [11]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [12]:
def noiseremoval(text):
    soup = BeautifulSoup(text, "html.parser") # removing the html strips
    text = soup.get_text()
    text = re.sub('\[[^]]*\]', '', text) # removing the square brackets

In [13]:
data['review'] = data['review'].apply(noiseremoval)

**Stemming**

In [21]:
def stemmer (text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

In [22]:
data['review'] = data['review'].apply(stemmer)

In [24]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. <br /><br />the fil...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


**Removing stopwords**

In [25]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#Apply function on review column
data['review']=data['review'].apply(remove_stopwords)

{"she's", 'more', 'him', 'll', "hasn't", 'doesn', 'y', 'weren', 'for', 'does', 'why', 'an', 'she', "don't", "mightn't", 'during', 'yourself', 'here', 'it', 'how', 'when', 'own', 'you', 'to', 'where', 'itself', 'or', "you'll", 'hasn', 'on', "should've", 'until', "didn't", 'will', 'out', 'am', 'my', 'wasn', 'than', 'such', 'o', 'ours', 'if', 'ma', 'were', 'further', 'between', 'into', 'once', 'having', 'm', 'didn', "that'll", 'about', 'only', 'no', 'these', 'all', 'should', 'a', 'same', 'd', 'we', 'haven', 'is', 'which', 'wouldn', "couldn't", 'with', 'down', 'above', 'the', 'other', 'as', 'not', 'whom', "you've", 'very', 'mightn', "you'd", "won't", 'they', 'below', 'most', 'over', 'in', 'was', 're', 'mustn', 'at', 'from', 'his', 'theirs', 'me', 'yourselves', 'our', 'had', 'yours', 'those', 'doing', 'against', 'them', 'because', 'few', "shouldn't", 'what', 'there', "wasn't", "aren't", 'are', 'just', 'myself', 'ourselves', 'up', 'too', 't', 'did', 's', 'by', "needn't", 'couldn', 'don', 'ag

**Train/Test split**

In [27]:
#train dataset
train_reviews = data.review[:30000]
#train_sentiments = imdb_data.sentiment[:30000]

#test dataset
test_reviews = data.review[30000:]
#test_sentiments = imdb_data.sentiment[30000:]
print(train_reviews.shape)
print(test_reviews.shape)

(30000,)
(20000,)


**Bag of words**

In [29]:
#Count vectorizer for bag of words
cv = CountVectorizer(min_df = 0,max_df = 1,binary = False,ngram_range = (1,3))
#transformed train reviews
cv_train_reviews = cv.fit_transform(train_reviews)
#transformed test reviews
cv_test_reviews = cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (30000, 4910561)
BOW_cv_test: (20000, 4910561)


**TF-IDF**

In [30]:
#Tfidf vectorizer
tv = TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews = tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews = tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (30000, 4910561)
Tfidf_test: (20000, 4910561)


**Label encoding**

In [32]:
#labeling the sentiment data
lb = LabelBinarizer()
#transformed sentiment data
sentiment_data = lb.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


**Training the model**

In [35]:
#Spliting the sentiment data
train_d = sentiment_data[:30000]
test_d = sentiment_data[30000:]

In [36]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_d)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_d)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [37]:
#Predicting the model for bag of words
lr_bow_predict = lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict = lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]


In [38]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_d,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_d,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.7438
lr_tfidf_score : 0.74455


In [42]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_d,lr_bow_predict,target_names=['Positive','Negative'])
print("Bag of words:\n",lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_d,lr_tfidf_predict,target_names=['Positive','Negative'])
print("TF-IDF:\n",lr_tfidf_report)

Bag of words:
               precision    recall  f1-score   support

    Positive       0.75      0.73      0.74     10015
    Negative       0.74      0.75      0.75      9985

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000

TF-IDF:
               precision    recall  f1-score   support

    Positive       0.75      0.74      0.74     10015
    Negative       0.74      0.75      0.75      9985

    accuracy                           0.74     20000
   macro avg       0.74      0.74      0.74     20000
weighted avg       0.74      0.74      0.74     20000



In [40]:
#confusion matrix for bag of words
cm_bow = confusion_matrix(test_sentiments,lr_bow_predict,labels = [1,0])
print(cm_bow)
#confusion matrix for tfidf features
cm_tfidf = confusion_matrix(test_sentiments,lr_tfidf_predict,labels = [1,0])
print(cm_tfidf)

[[7519 2466]
 [2658 7357]]
[[7497 2488]
 [2621 7394]]
