In [1]:
#Load and View Dataset
import pandas as pd
dataset = pd.read_csv('movie_reviews.csv.bz2', compression='bz2')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [2]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
#Build Train and Test Datasets
# build train and test datasets
reviews = dataset['review'].values
sentiments = dataset['sentiment'].values

train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]

test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]


In [4]:
#Text Wrangling & Normalization
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re
import nltk
import numpy as np

ps = nltk.porter.PorterStemmer()
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')


def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


def expand_contractions(text):
    return contractions.fix(text)


def simple_stemming(text, stemmer=ps):
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text


def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text


def pre_process_document(document):
    
    # strip HTML
    document = strip_html_tags(document)
    
    # lower case
    document = document.lower()
    
    # remove extra newlines (often might be present in really noisy text)
    document = document.translate(document.maketrans("\n\t\r", "   "))
    
    # remove accented characters
    document = remove_accented_chars(document)
    
    # expand contractions    
    document = expand_contractions(document)
               
    # remove special characters and\or digits    
    # insert spaces between special characters to isolate them    
    special_char_pattern = re.compile(r'([{.(-)!}])')
    document = special_char_pattern.sub(" \\1 ", document)
    document = remove_special_characters(document, remove_digits=True)  

    # stemming text
    document = simple_stemming(document)      
    
    # remove stopwords
    document = remove_stopwords(document, is_lower_case=True, stopwords=stop_words)
        
    # remove extra whitespace
    document = re.sub(' +', ' ', document)
    document = document.strip()
    
    return document


pre_process_corpus = np.vectorize(pre_process_document)


In [5]:
%%time

norm_train_reviews = pre_process_corpus(train_reviews)
norm_test_reviews = pre_process_corpus(test_reviews)

Wall time: 7min 5s


In [6]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

Wall time: 55.3 s


In [7]:
%%time

# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

Wall time: 10.5 s


In [8]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (35000, 138268)  Test features shape: (15000, 138268)
TFIDF model:> Train features shape: (35000, 138268)  Test features shape: (15000, 138268)


In [9]:
# Logistic regrssion on BOW data

# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(  )

# train model
lr.fit(cv_train_features, train_sentiments)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

In [10]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(lr_bow_predictions,test_sentiments))
print(classification_report(lr_bow_predictions,test_sentiments))

[[6762  723]
 [ 728 6787]]
             precision    recall  f1-score   support

   negative       0.90      0.90      0.90      7485
   positive       0.90      0.90      0.90      7515

avg / total       0.90      0.90      0.90     15000



In [11]:
# Logistic regression with TF_IDF
# Logistic Regression model on TF-IDF features

# train model
lr.fit(tv_train_features, train_sentiments)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

In [12]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(lr_tfidf_predictions ,test_sentiments))
print(classification_report(lr_tfidf_predictions ,test_sentiments))

[[6716  673]
 [ 774 6837]]
             precision    recall  f1-score   support

   negative       0.90      0.91      0.90      7389
   positive       0.91      0.90      0.90      7611

avg / total       0.90      0.90      0.90     15000



In [13]:
from sklearn.ensemble import RandomForestClassifier

# train model
rf=RandomForestClassifier()
rf.fit(tv_train_features, train_sentiments)

# predict on test data
rf_rf_predictions = rf.predict(tv_test_features)

  from numpy.core.umath_tests import inner1d


In [14]:
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(rf_rf_predictions ,test_sentiments))
print(classification_report(rf_rf_predictions ,test_sentiments))

[[6294 2260]
 [1196 5250]]
             precision    recall  f1-score   support

   negative       0.84      0.74      0.78      8554
   positive       0.70      0.81      0.75      6446

avg / total       0.78      0.77      0.77     15000



In [30]:

new1= 'Be careful with this one. Once you get yer mitts on it, it ll change the way you look at kung-fu flicks. You will be yearning a plot from all of the kung-fu films now, you will be wanting character depth and development '
norm_new_reviews = pre_process_corpus(new1)
norm_new_reviews







array('care thi one onc get yer mitt chang way look kungfu flick yearn plot kungfu film want charact depth develop',
      dtype='<U107')