In [372]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re,string,unicodedata
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


In [373]:
#Load the Dataset to the DataFrame
df = pd.read_csv('C:/Users/User5/Desktop/NLP/Project/IMDB Dataset2.csv')

In [374]:
#Display the head of the DataFrame
df.head()

Unnamed: 0,id,Review
0,1,One of the other reviewers has mentioned that ...
1,2,A wonderful little production. <br /><br />The...
2,3,I thought this was a wonderful way to spend ti...
3,4,Basically there's a family where a little boy ...
4,5,"Petter Mattei's ""Love in the Time of Money"" is..."


In [375]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#clean Text
def cleanText(text):
    text = text.lower()
    text = text.translate(string.punctuation)
    text = text.replace('+', ' ')
    text = text.replace(',', '')
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    text = re.sub(r'https?:\/\/\S+','',text) #Remove the hyperlink
    text = emoji_pattern.sub(r'',text) #Remove emoji
    text = strip_html(text) #Removing html strips
    text = re.sub(r'[^a-zA-z0-9\s]','',text) #remove special character
    
    
    #tokens = nltk.word_tokenize(text)
    
    return text
    

In [376]:
'''
#Tokenization of text
tokenizer=ToktokTokenizer()

#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')


'''

'''#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)
'''
'''#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
'''

'#Removing the noisy text\ndef denoise_text(text):\n    text = strip_html(text)\n    text = remove_between_square_brackets(text)\n    return text\n'

In [377]:
'''
#Apply function on review column
df_reviews['review']=df_reviews['review'].apply(denoise_text)
'''

"\n#Apply function on review column\ndf_reviews['review']=df_reviews['review'].apply(denoise_text)\n"

In [378]:
'''
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
'''

"\n#Define function for removing special characters\ndef remove_special_characters(text, remove_digits=True):\n    pattern=r'[^a-zA-z0-9\\s]'\n    text=re.sub(pattern,'',text)\n    return text\n"

In [379]:
'''
#Apply function on review column
df_reviews['review']=df_reviews['review'].apply(remove_special_characters)
'''

"\n#Apply function on review column\ndf_reviews['review']=df_reviews['review'].apply(remove_special_characters)\n"

In [380]:
'''
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

'''
'''
#Apply function on review column
df_reviews['review']=df_reviews['review'].apply(simple_stemmer)

#set stopwords to english
stop=set(stopwords.words('english'))

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

#Apply function on review column
df_reviews['review']=df_reviews['review'].apply(remove_stopwords)
'''

"\n#Apply function on review column\ndf_reviews['review']=df_reviews['review'].apply(simple_stemmer)\n\n#set stopwords to english\nstop=set(stopwords.words('english'))\n\n#removing the stopwords\ndef remove_stopwords(text, is_lower_case=False):\n    tokens = tokenizer.tokenize(text)\n    tokens = [token.strip() for token in tokens]\n    if is_lower_case:\n        filtered_tokens = [token for token in tokens if token not in stopword_list]\n    else:\n        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]\n    filtered_text = ' '.join(filtered_tokens)    \n    return filtered_text\n\n#Apply function on review column\ndf_reviews['review']=df_reviews['review'].apply(remove_stopwords)\n"

In [381]:
df['Review']=df['Review'].apply(cleanText)

In [382]:
df.head()

Unnamed: 0,id,Review
0,1,one of the other reviewers has mentioned that ...
1,2,a wonderful little production the filming tech...
2,3,i thought this was a wonderful way to spend ti...
3,4,basically theres a family where a little boy j...
4,5,petter matteis love in the time of money is a ...


In [383]:
#Subjectivity range [0,1]
#The higher subjectivity means that the text contains personal 
#opinion rather than factual information
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

#Polarity range [-1,1]
#-1 = negative sentiment
#1 = positive sentiment
def getPolarity(text):
    
    return TextBlob(text).sentiment.polarity   

def getSentiment(score):
    
    if score == 0:
        return 'Neutral'
    elif score < 0:
        return 'Negative'
    else:
        return 'Positive'


In [384]:
df['Subjectivity Score'] = df['Review'].apply(getSubjectivity)
df['Polarity Score'] = df['Review'].apply(getPolarity)
df['Sentiment'] = df['Polarity Score'].apply(getSentiment)

In [385]:
df.head()

Unnamed: 0,id,Review,Subjectivity Score,Polarity Score,Sentiment
0,1,one of the other reviewers has mentioned that ...,0.490369,0.023433,Positive
1,2,a wonderful little production the filming tech...,0.559343,0.11149,Positive
2,3,i thought this was a wonderful way to spend ti...,0.640769,0.346324,Positive
3,4,basically theres a family where a little boy j...,0.454167,-0.060937,Negative
4,5,petter matteis love in the time of money is a ...,0.452916,0.217952,Positive


In [386]:
# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoder.
encoder = LabelEncoder()

# Encode the categories
df['Sentiment_enc'] = encoder.fit_transform(df['Sentiment'])

df.head()

Unnamed: 0,id,Review,Subjectivity Score,Polarity Score,Sentiment,Sentiment_enc
0,1,one of the other reviewers has mentioned that ...,0.490369,0.023433,Positive,2
1,2,a wonderful little production the filming tech...,0.559343,0.11149,Positive,2
2,3,i thought this was a wonderful way to spend ti...,0.640769,0.346324,Positive,2
3,4,basically theres a family where a little boy j...,0.454167,-0.060937,Negative,0
4,5,petter matteis love in the time of money is a ...,0.452916,0.217952,Positive,2


In [387]:
#split dataset in features and target variable
X = df['Review']
y = df['Sentiment_enc']

# split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 34)

In [388]:
# check the shape of X_train and X_test
X_train.shape, X_test.shape

((35000,), (15000,))

In [389]:
X_train.head()

33832    when i first saw before night falls javier bar...
29125    please why on earth did bava had to add insult...
3038     i really love this movie  i saw it for the fir...
16014    predictable unmotivated pointless caricatures ...
11262    not confusing in the sense that gee this movie...
Name: Review, dtype: object

In [395]:
# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')

# First fit the vectorizer with our training set
tfidf_train = vectorizer.fit_transform(X_train)

# Now we can fit our test data with the same vectorizer
tfidf_test = vectorizer.transform(X_test)

# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model
nb.fit(tfidf_train, y_train)

# Print the accuracy score
print("Accuracy:", nb.score(tfidf_test, y_test))
print("Accuracy percentage:", round((nb.score(tfidf_test, y_test)*100),2), "%")


Accuracy: 0.7561333333333333
Accuracy percentage: 75.61 %
