## Dataset description: <br>
IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.

In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [5]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [6]:
data.shape

(50000, 2)

In [8]:
data.isnull().any()

review       False
sentiment    False
dtype: bool

In [9]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [10]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Text normalization
## tokenization

In [12]:
!pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable
Collecting wordcloud
  Downloading wordcloud-1.9.2-cp311-cp311-win_amd64.whl (151 kB)
                                              0.0/151.4 kB ? eta -:--:--
     ----------------------------           112.6/151.4 kB 2.2 MB/s eta 0:00:01
     -----------------------------------    143.4/151.4 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 151.4/151.4 kB 1.0 MB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2




In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [15]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.6.0-cp311-cp311-win_amd64.whl (12.3 MB)
                                              0.0/12.3 MB ? eta -:--:--
                                              0.1/12.3 MB 2.2 MB/s eta 0:00:06
     -                                        0.3/12.3 MB 2.7 MB/s eta 0:00:05
     -                                        0.6/12.3 MB 3.3 MB/s eta 0:00:04
     --                                       0.6/12.3 MB 3.0 MB/s eta 0:00:04
     --                                       0.6/12.3 MB 3.0 MB/s eta 0:00:04
     ---                                      1.0/12.3 MB 2.8 MB/s eta 0:00:05
     -----                                    1.6/12.3 MB 3.7 MB/s eta 0:00:03
     ------                                   1.9/12.3 MB 3.9 MB/s eta 0:00:03
     -------                                  2.2/12.3 MB 3.9 MB/s eta 0:00:03
     --------                                 2.5/12.3 MB 4.



In [17]:
!pip install textblob

Defaulting to user installation because normal site-packages is not writeable
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
                                              0.0/636.8 kB ? eta -:--:--
     --                                      41.0/636.8 kB 1.9 MB/s eta 0:00:01
     --                                      41.0/636.8 kB 1.9 MB/s eta 0:00:01
     --                                      41.0/636.8 kB 1.9 MB/s eta 0:00:01
     --                                      41.0/636.8 kB 1.9 MB/s eta 0:00:01
     -----                                 92.2/636.8 kB 290.5 kB/s eta 0:00:02
     ------                               112.6/636.8 kB 311.2 kB/s eta 0:00:02
     ------                               122.9/636.8 kB 300.4 kB/s eta 0:00:02
     ---------                            174.1/636.8 kB 337.8 kB/s eta 0:00:02
     ------------                         225.3/636.8 kB 404.2 kB/s eta 0:00:02
     ----------------                     286.7/6

In [18]:
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from bs4 import BeautifulSoup


In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [20]:
#Tokenization of text
tokenizers=ToktokTokenizer()
#Setting English stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [21]:
#Removing the noisy text
def noiseremoval_text(text):
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()
  text = re.sub('\[[^]]*\]', '', text)
  return text


In [22]:
#Apply function on review column
data['review']=data['review'].apply(noiseremoval_text)

  soup = BeautifulSoup(text, "html.parser")


In [23]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Stemming

In [24]:
#Stemming the text
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


In [28]:
#Apply function on review column
data['review']=data['review'].apply(stemmer)

In [29]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


## Removing stop words

In [30]:
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize  


In [31]:
#set stopwords to english

stop_wr=set(stopwords.words('english'))
print(stop_wr)

{"don't", 're', 'couldn', 'y', 'haven', 'any', 'in', 'now', 'yours', 'or', 'most', 'doesn', 'himself', 'this', 'than', 'and', 'its', 'while', 'being', 'how', 'out', 'can', 'each', "couldn't", 'mightn', "haven't", 'has', 'that', 'into', 'because', 'an', 'during', 'me', 'have', 'no', 'should', 'why', 'when', "should've", "doesn't", 'them', 'through', 'over', 'both', 'about', 'were', "didn't", 'under', 'more', 'very', "you're", "that'll", 'do', "shan't", 've', 'hasn', 'is', 'did', 'shan', 'yourselves', 'such', 'theirs', 'on', 'does', 'will', 'didn', "wouldn't", 'up', "mustn't", 'my', 'ma', 'myself', 'won', 'hers', 'his', 'm', 'their', 'again', 'for', 'themselves', 'to', 'where', 'doing', "you've", "she's", 'weren', 'few', "wasn't", 'yourself', 'nor', 'be', 'had', 'herself', 'a', 'ain', 'what', 'those', "aren't", 'aren', 'been', "weren't", 'ourselves', 'once', 'from', 'your', "won't", "isn't", "you'll", 'having', 'll', 'am', 'only', 'these', "it's", 'i', 'before', 'needn', 'o', 'wasn', 'do

In [32]:
#removing the stopwords
def removing_stopwords(text, is_lower_case=False):
    #Tokenization of text
    tokenizers=ToktokTokenizer()
    #Setting English stopwords
    tokens = tokenizers.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filter_tokens = [token for token in tokens if token not in stop_wr]
    else:
        filter_tokens = [token for token in tokens if token.lower() not in stop_wr]
    filtered_text = ' '.join(filter_tokens)    
    return filtered_text


In [33]:
#Apply function on review column
data['review']=data['review'].apply(removing_stopwords)

In [34]:
data.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 oz episod ' hook...,positive
1,wonder littl production. film techniqu veri un...,positive
2,thought thi wa wonder way spend time hot summe...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive


## Train test split

In [35]:
#split the dataset  
#train dataset
train_reviews_data=data.review[:30000]


In [36]:
#test dataset

test_reviews_data=data.review[30000:]


## Bag of words

In [37]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train=cv.fit_transform(train_reviews_data)
#transformed test reviews
cv_test=cv.transform(test_reviews_data)

print('BOW_cv_train:',cv_train.shape)
print('BOW_cv_test:',cv_test.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (30000, 4948415)
BOW_cv_test: (20000, 4948415)


## TF_IDF

In [38]:
#Tfidf vectorizer
tf=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tf_train=tf.fit_transform(train_reviews_data)
#transformed test reviews
tf_test=tf.transform(test_reviews_data)
print('Tfidf_train:',tf_train.shape)
print('Tfidf_test:',tf_test.shape)

Tfidf_train: (30000, 4948415)
Tfidf_test: (20000, 4948415)


## Lable encoding

In [39]:
#labeling the sentient data
label=LabelBinarizer()
#transformed sentiment data
sentiment_data=label.fit_transform(data['sentiment'])
print(sentiment_data.shape)

(50000, 1)


In [40]:
train_data=data.sentiment[:30000]


In [41]:
test_data=data.sentiment[30000:]


In [42]:
#training the model
logistic=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=logistic.fit(cv_train,train_data)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=logistic.fit(tf_train,train_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [43]:
#Predicting the model for bag of words
lr_bow_predict=logistic.predict(cv_test)
print(lr_bow_predict)


['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [44]:
##Predicting the model for tfidf features
lr_tfidf_predict=logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'negative' ... 'negative' 'positive' 'positive']


In [45]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_data,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)


lr_bow_score : 0.74415


In [46]:
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_data,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7442
