In [1]:
import numpy as np
import pandas as pd

# Load the movie-review corpus and preview its first rows.
# `read_table` splits on tabs by default, so no explicit separator is needed.
df = pd.read_table('TextFiles/moviereviews.tsv')
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [2]:
# Number of rows in the freshly loaded frame.
df.shape[0]

2000

In [3]:
from IPython.display import Markdown, display

# Render the review stored under index label 0 as a Markdown blockquote.
display(
    Markdown("> " + df.loc[0, "review"])
)

> how do films like mouse hunt get into theatres ? 
isn't there a law or something ? 
this diabolical load of claptrap from steven speilberg's dreamworks studio is hollywood family fare at its deadly worst . 
mouse hunt takes the bare threads of a plot and tries to prop it up with overacting and flat-out stupid slapstick that makes comedies like jingle all the way look decent by comparison . 
writer adam rifkin and director gore verbinski are the names chiefly responsible for this swill . 
the plot , for what its worth , concerns two brothers ( nathan lane and an appalling lee evens ) who inherit a poorly run string factory and a seemingly worthless house from their eccentric father . 
deciding to check out the long-abandoned house , they soon learn that it's worth a fortune and set about selling it in auction to the highest bidder . 
but battling them at every turn is a very smart mouse , happy with his run-down little abode and wanting it to stay that way . 
the story alternates between unfunny scenes of the brothers bickering over what to do with their inheritance and endless action sequences as the two take on their increasingly determined furry foe . 
whatever promise the film starts with soon deteriorates into boring dialogue , terrible overacting , and increasingly uninspired slapstick that becomes all sound and fury , signifying nothing . 
the script becomes so unspeakably bad that the best line poor lee evens can utter after another run in with the rodent is : " i hate that mouse " . 
oh cringe ! 
this is home alone all over again , and ten times worse . 
one touching scene early on is worth mentioning . 
we follow the mouse through a maze of walls and pipes until he arrives at his makeshift abode somewhere in a wall . 
he jumps into a tiny bed , pulls up a makeshift sheet and snuggles up to sleep , seemingly happy and just wanting to be left alone . 
it's a magical little moment in an otherwise soulless film . 
a message to speilberg : if you want dreamworks to be associated with some kind of artistic credibility , then either give all concerned in mouse hunt a swift kick up the arse or hire yourself some decent writers and directors . 
this kind of rubbish will just not do at all . 


In [4]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
# `df` is rebound to the filtered frame rather than mutated in place.
df = df.dropna()

# Row count after the rows with missing values have been removed.
len(df)

1965

In [6]:
# Collect the index labels of reviews that consist solely of whitespace.
#
# The 'review' column is iterated by name instead of unpacking whole-row
# tuples, so the scan keeps working no matter how many columns `df` has.
# The isinstance() guard skips non-string entries such as NaN.
blanks = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and review.isspace()
]

print(len(blanks), 'blanks: ', blanks)

27 blanks:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [7]:
# Remove the rows whose index labels are listed in `blanks`.
# errors='ignore' makes this cell safe to re-run: labels that have already
# been dropped are skipped instead of raising KeyError.
df = df.drop(index=blanks, errors='ignore')

In [8]:
# Number of rows that remain in `df` at this point.
df.shape[0]

1938

In [9]:
# Number of reviews per label.
df.loc[:, 'label'].value_counts()

label
neg    969
pos    969
Name: count, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the labels.
X, y = df['review'], df['label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Two pipelines with the same front end (TF-IDF features built from the raw
# text) that differ only in the final classifier.

# Multinomial Naive Bayes variant.
text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Linear support-vector classifier variant.
text_clf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [12]:
# Train the Naive Bayes pipeline on the training split.
text_clf_nb.fit(X=X_train, y=y_train)

In [13]:
# Label the held-out reviews with the trained Naive Bayes pipeline.
predictions = text_clf_nb.predict(X=X_test)

In [15]:
from sklearn import metrics

# Confusion matrix for the Naive Bayes predictions on the test split.
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
)

[[288  19]
 [138 195]]


In [16]:
# Per-label precision / recall / F1 for the Naive Bayes predictions.
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions)
)

              precision    recall  f1-score   support

         neg       0.68      0.94      0.79       307
         pos       0.91      0.59      0.71       333

    accuracy                           0.75       640
   macro avg       0.79      0.76      0.75       640
weighted avg       0.80      0.75      0.75       640



In [17]:
# Fraction of test reviews the Naive Bayes pipeline labelled correctly.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=predictions)
)

0.7546875


# Second pipeline: LinearSVC

In [18]:
# Train the LinearSVC pipeline on the same training split.
text_clf_lsvc.fit(X=X_train, y=y_train)

In [19]:
# Label the held-out reviews with the trained LinearSVC pipeline.
predictions2 = text_clf_lsvc.predict(X=X_test)

In [20]:
from sklearn import metrics

# Confusion matrix for the LinearSVC predictions on the test split.
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions2)
)

[[238  69]
 [ 44 289]]


In [22]:
# Per-label precision / recall / F1 for the LinearSVC predictions.
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions2)
)

              precision    recall  f1-score   support

         neg       0.84      0.78      0.81       307
         pos       0.81      0.87      0.84       333

    accuracy                           0.82       640
   macro avg       0.83      0.82      0.82       640
weighted avg       0.82      0.82      0.82       640



In [23]:
# Fraction of test reviews the LinearSVC pipeline labelled correctly.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=predictions2)
)

0.8234375


# Stop words

In [24]:
# Show the English stop-word collection that ships with scikit-learn's
# feature_extraction.text module (printed as a frozenset).
from sklearn.feature_extraction import text
print(text.ENGLISH_STOP_WORDS)

frozenset({'thin', 'mostly', 'very', 'below', 'who', 'besides', 'that', 'nowhere', 'cannot', 'anyone', 'system', 'when', 'into', 'somehow', 'becoming', 'beside', 'first', 'anywhere', 'elsewhere', 'you', 'of', 'her', 'has', 'however', 'though', 'herein', 'is', 'also', 'hundred', 'meanwhile', 'move', 'not', 'thus', 'all', 'moreover', 'since', 'top', 'many', 'thereby', 'nor', 'without', 'indeed', 'eg', 'them', 'became', 'your', 'over', 'get', 'whereafter', 'whose', 'less', 'well', 'are', 'eleven', 'whence', 'detail', 'ie', 'sincere', 'an', 'becomes', 'therein', 'or', 'latterly', 'least', 'afterwards', 'about', 'had', 'they', 'give', 'either', 'the', 'to', 'own', 'have', 'we', 'us', 'every', 'thence', 'there', 'hers', 'couldnt', 'whole', 'third', 'yours', 'whereas', 'thru', 'beyond', 'go', 'seems', 'along', 'across', 'he', 'thereafter', 'onto', 'while', 'etc', 'former', 're', 'may', 'toward', 'towards', 'seem', 'found', 'beforehand', 'behind', 'last', 'mill', 'else', 'further', 'un', 'befo

In [25]:
# Number of entries in scikit-learn's built-in English stop-word collection.
len(text.ENGLISH_STOP_WORDS)

318

In [26]:
# Custom, much shorter stop-word list (alphabetical), built by splitting a
# whitespace-separated string into a plain Python list.
stopwords = (
    "a about an and are as at be been but by can "
    "even ever for from get had has have he her hers his "
    "how i if in into is it its just me my of on or "
    "see seen she so than that the their there they this "
    "to was we were what when which who will with you"
).split()

In [27]:
# LinearSVC pipeline whose TF-IDF step filters out the custom `stopwords`
# list; it is trained on the same training split as the other pipelines.
text_clf_lsvc2 = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=stopwords)),
    ('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X=X_train, y=y_train)

In [28]:
# Label the held-out reviews with the stop-word-filtered LinearSVC pipeline
# and print the resulting confusion matrix.
# NOTE: this rebinds `predictions`, the name that earlier held the Naive
# Bayes pipeline's output; re-running the Naive Bayes metric cells after this
# point would report this pipeline's results instead.
predictions = text_clf_lsvc2.predict(X=X_test)
print(
    metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
)

[[251  56]
 [ 44 289]]


In [29]:
# Per-label precision / recall / F1 for whatever `predictions` currently holds.
print(
    metrics.classification_report(y_true=y_test, y_pred=predictions)
)

              precision    recall  f1-score   support

         neg       0.85      0.82      0.83       307
         pos       0.84      0.87      0.85       333

    accuracy                           0.84       640
   macro avg       0.84      0.84      0.84       640
weighted avg       0.84      0.84      0.84       640



In [30]:
# Overall accuracy for whatever `predictions` currently holds.
print(
    metrics.accuracy_score(y_true=y_test, y_pred=predictions)
)

0.84375


## Next, feed new data to each model's `predict()` method

In [31]:
# A hand-written review used to try the trained pipelines on unseen text.
# Adjacent string literals are concatenated into one single-line string.
myreview = (
    "A movie I really wanted to love was terrible. "
    "I'm sure the producers had the best intentions, but the execution was lacking."
)

In [32]:
# Classify the hand-written review with the Naive Bayes pipeline.
# The review is wrapped in a list because predict() takes a collection of documents.
print(text_clf_nb.predict(X=[myreview]))

['neg']


In [33]:
# Classify the hand-written review with the LinearSVC pipeline.
# The review is wrapped in a list because predict() takes a collection of documents.
print(text_clf_lsvc.predict(X=[myreview]))

['neg']


In [34]:
# Classify the hand-written review with the stop-word-filtered LinearSVC pipeline.
# The review is wrapped in a list because predict() takes a collection of documents.
print(text_clf_lsvc2.predict(X=[myreview]))

['neg']
