In [23]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [24]:
df = pd.read_csv('IMDB Dataset.csv')

In [25]:
df.shape

(50000, 2)

In [26]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
#Check sentiments
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [28]:
train_reviews,train_sentiment = df.review[:40000],df.sentiment[:40000]

In [29]:
test_review,test_sentiment = df.review[40000:],df.sentiment[40000:]

In [30]:
tokenizer = ToktokTokenizer()
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [31]:
def striphtml(text):
    soup = BeautifulSoup(text,"html.parser")
    return soup.get_text()
def remove_sq_brackets(text):
    return re.sub('\[[^]]*\]','',text)

def remove_noise(text):
    text = striphtml(text)
    text = remove_sq_brackets(text)
    return text

#Apply function on review column

df['review'] = df['review'].apply(remove_noise)

In [32]:
df['review'].head(100)

0     One of the other reviewers has mentioned that ...
1     A wonderful little production. The filming tec...
2     I thought this was a wonderful way to spend ti...
3     Basically there's a family where a little boy ...
4     Petter Mattei's "Love in the Time of Money" is...
                            ...                        
95    Daniel Day-Lewis is the most versatile actor a...
96    My guess would be this was originally going to...
97    Well, I like to watch bad horror B-Movies, cau...
98    This IS the worst movie I have ever seen, as w...
99    I have been a Mario fan for as long as I can r...
Name: review, Length: 100, dtype: object

In [33]:
def remove_special_chars(text,remove_digits = True):
    pattern = r'[^a-zA-Z0-9\s]' 
    text = re.sub(pattern,'',text)
    return text
df['review'] = df['review'].apply(remove_special_chars)

In [34]:
df['review'].head(100)

0     One of the other reviewers has mentioned that ...
1     A wonderful little production The filming tech...
2     I thought this was a wonderful way to spend ti...
3     Basically theres a family where a little boy J...
4     Petter Matteis Love in the Time of Money is a ...
                            ...                        
95    Daniel DayLewis is the most versatile actor al...
96    My guess would be this was originally going to...
97    Well I like to watch bad horror BMovies cause ...
98    This IS the worst movie I have ever seen as we...
99    I have been a Mario fan for as long as I can r...
Name: review, Length: 100, dtype: object

In [35]:
#Stemming the text
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
df['review'] = df['review'].apply(simple_stemmer)

In [36]:
df['review'].head(100)

0     one of the other review ha mention that after ...
1     A wonder littl product the film techniqu is ve...
2     I thought thi wa a wonder way to spend time on...
3     basic there a famili where a littl boy jake th...
4     petter mattei love in the time of money is a v...
                            ...                        
95    daniel daylewi is the most versatil actor aliv...
96    My guess would be thi wa origin go to be at le...
97    well I like to watch bad horror bmovi caus I t...
98    thi IS the worst movi I have ever seen as well...
99    I have been a mario fan for as long as I can r...
Name: review, Length: 100, dtype: object

In [37]:
stop = set(nltk.corpus.stopwords.words('english'))

def remove_stopwords(text, is_lowercase=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lowercase :
        filter_tokens = [token for token in tokens if token not in stopwords]
    else :
        filter_tokens = [token for token in tokens if token.lower() not in stopwords]
    return filter_tokens
df['review'] = df['review'].apply(remove_stopwords)

In [38]:
df['review'].head()

0    [one, review, ha, mention, watch, 1, Oz, episo...
1    [wonder, littl, product, film, techniqu, veri,...
2    [thought, thi, wa, wonder, way, spend, time, h...
3    [basic, famili, littl, boy, jake, think, zombi...
4    [petter, mattei, love, time, money, visual, st...
Name: review, dtype: object

In [44]:
norm_train_reviews = df.review[:40000]


In [43]:
norm_test_reviews = df.review[40000:]


In [46]:
cv = CountVectorizer(min_df = 0, max_df = 1, binary = False, ngram_range = (1,3))
cv_train_reviews = cv.fit_transform(norm_train_reviews)
cv_test_reviews = cv.transform(norm_test_reviews)

print('BOW_cv_train : ',cv_train_reviews.shape)
print('BOW_cv_test : ',cv_test_reviews.shape)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
tv = TfidVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
tv_train_reviews = tv.fit_transform(norm_test_reviews)
tv_test_reviews = tv.transform(norm_test_reviews)