In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import sklearn
from sklearn import metrics

In [2]:
## Import the data
df = pd.read_csv( '/Volumes/LACIE_SHARE/python_scripts/Data/ratings.tsv', sep='\t', header = 0)

df.head()




Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [3]:
## Check for and remove missing values and blank strings
print(df.info())

## a few null reviews.

print('{} blank labels'.format(np.sum(df['label']=="")))
print('{} blank reviews'.format(np.sum(df['review'] == "")))



df = df.dropna(axis=0)
print(df.info())




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   8000 non-null   object
 1   review  7945 non-null   object
dtypes: object(2)
memory usage: 125.1+ KB
None
0 blank labels
0 blank reviews
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7945 entries, 0 to 7999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7945 non-null   object
 1   review  7945 non-null   object
dtypes: object(2)
memory usage: 186.2+ KB
None


In [4]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


sw = set(stopwords.words('english'))
wn = WordNetLemmatizer()


def tokenize(x):
    x = x.lower()
    x = x.replace('\r', '') ## REmove \r \n
    x = x.replace('\n', '') #remove \n
    x = x.replace("\'", "'") #replace \' with '
    tokens = wordpunct_tokenize(x)
    tokens = [tok for tok in tokens if tok.isalnum()]
    
    tokens = [tok for tok in tokens if tok not in sw]
    
    tokens = [wn.lemmatize(tok) for tok in tokens]
    return(tokens)



df['tokens'] = df['review'].apply(tokenize)
df['clean'] = df['tokens'].apply(lambda x: " ".join(x))

In [5]:
## Split the data into a training set and a test set. 
##Use test size=0.33, stratify=y, and random state=801 (where y is the label positive or negative)
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=.33, stratify=df.label, random_state=801)

y_train = (train['label']=='pos').astype(int)
y_test = (test['label']=='pos').astype(int)







In [6]:
## Vectorize the data using TD-IDF. Be sure that all model development is with the training data 
## (fit the TD-IDF transformer on the training data, then transform to both training and test data).
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()
tf_idf.fit(train)

tf_idf_train = tf_idf.transform(train['clean'])
tf_idf_test = tf_idf.transform(test['clean'])



In [31]:
## Build a machine learning classifier. Try out various models including:
## Support Vector Classifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

linear_svc = SVC(kernel='linear')  
linear_svc.fit(tf_idf_train, y_train) 

linear_predict = linear_svc.predict(tf_idf_test)
linear_predict_train = linear_svc.predict(tf_idf_train)

sigmoid_svc = SVC(kernel='sigmoid')  
sigmoid_svc.fit(tf_idf_train, y_train) 

sigmoid_predict_train = sigmoid_svc.predict(tf_idf_train)
sigmoid_predict = sigmoid_svc.predict(tf_idf_test)



## Multilayer Perceptron
mlp = MLPClassifier()

mlp.fit(tf_idf_train, y_train)

mlp_predict = mlp.predict(tf_idf_test)
mlp_predict_train = mlp.predict(tf_idf_train)



## Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(tf_idf_train, y_train)

nb_predict = nb.predict(tf_idf_test)
nb_predict_train = nb.predict(tf_idf_train)

## voting classifier
vc = VotingClassifier(estimators=[
        ('svc', sigmoid_svc), ('mlp', mlp), ('gnb', nb)],
        voting='hard', 
        weights=[1,1,2])
vc.fit(tf_idf_train, y_train)
vc_predict = vc.predict(tf_idf_test)
vc_predict_train = vc.predict(tf_idf_train)


In [23]:
def print_metrics(actual, predicted, model_name):
    print("metrics for {}:".format(model_name))
    print(metrics.confusion_matrix(predicted, actual))
    print('we had an accuracy of {}, precision of {}, and recall of {}'.format(metrics.accuracy_score(predicted, actual),
                                                                           metrics.precision_score(predicted, actual),
                                                                         metrics.recall_score(predicted, actual)))
    print("")


In [32]:
print_metrics(y_train, linear_predict_train, "linear support vector classifier") 
print_metrics(y_train, sigmoid_predict_train, "sigmoid support vector classifier") 
print_metrics(y_train, mlp_predict_train, "multilayer perceptron") 
print_metrics(y_train, nb_predict_train, "naive beyes") 
print_metrics(y_train, vc_predict_train, 'voting classifier')

metrics for linear support vector classifier:
[[ 235  159]
 [2427 2502]]
we had an accuracy of 0.5141837309787713, precision of 0.9402480270574972, and recall of 0.5076080340839927

metrics for sigmoid support vector classifier:
[[ 259  183]
 [2403 2478]]
we had an accuracy of 0.5141837309787713, precision of 0.9312288613303269, and recall of 0.5076828518746158

metrics for multilayer perceptron:
[[ 259  183]
 [2403 2478]]
we had an accuracy of 0.5141837309787713, precision of 0.9312288613303269, and recall of 0.5076828518746158

metrics for naive beyes:
[[2622 2623]
 [  40   38]]
we had an accuracy of 0.4997182040202893, precision of 0.014280345734686208, and recall of 0.48717948717948717

metrics for voting classifier:
[[2652 2650]
 [  10   11]]
we had an accuracy of 0.5002817959797107, precision of 0.004133784291619692, and recall of 0.5238095238095238



# Our classifiers did not do very well on our data for raw accuracy. Even on our training data, they only did a little bit better than a naive guess. Some of the precision rates were fairly high, though.

In [24]:
print_metrics(y_test, linear_predict, "linear support vector classifier") 
print_metrics(y_test, sigmoid_predict, "sigmoid support vector classifier") 
print_metrics(y_test, mlp_predict, "multilayer perceptron") 
print_metrics(y_test, nb_predict, "naive beyes") 
print_metrics(y_test, vc_predict, 'voting classifier')



metrics for linear support vector classifier:
[[ 100   82]
 [1211 1229]]
we had an accuracy of 0.5068649885583524, precision of 0.9374523264683448, and recall of 0.5036885245901639

metrics for sigmoid support vector classifier:
[[ 114  100]
 [1197 1211]]
we had an accuracy of 0.5053394355453852, precision of 0.92372234935164, and recall of 0.502906976744186

metrics for multilayer perceptron:
[[ 114  100]
 [1197 1211]]
we had an accuracy of 0.5053394355453852, precision of 0.92372234935164, and recall of 0.502906976744186

metrics for naive beyes:
[[1291 1287]
 [  20   24]]
we had an accuracy of 0.5015255530129672, precision of 0.018306636155606407, and recall of 0.5454545454545454

metrics for voting classifier:
[[1307 1308]
 [   4    3]]
we had an accuracy of 0.4996186117467582, precision of 0.002288329519450801, and recall of 0.42857142857142855



In [None]:
## What model performs the best? 
## Experiment with changing the hyper-parameters, changing the vectorizer, 
## adding bi-grams and/or using a voting classifier to increase model accuracy.

In [25]:
## Using Vader sentiment analysis, predict whether or not the movie review is positive or negative. 
# (Use a positive compound score for “positive” and a negative compound score for “negative”).

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

train['scores'] = train['review'].apply(lambda x: sid.polarity_scores(x))
train['compound'] = train['scores'].apply(lambda x: x['compound'])

test['scores'] = test['review'].apply(lambda x: sid.polarity_scores(x))
test['compound'] = test['scores'].apply(lambda x: x['compound'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats i

In [26]:
## How does the accuracy of the sentiment analysis compare with that of the predictive model?
vader_test_predict = (test['compound'] > 0).astype(int)

print_metrics(y_test, vader_test_predict, 'vader sentiment analysis')

metrics for vader sentiment analysis:
[[ 722  177]
 [ 589 1134]]
we had an accuracy of 0.7078565980167811, precision of 0.8649885583524027, and recall of 0.6581543818920488



# The Vader predictions were much better than any of our predictive models for raw accuracy, but they had lower precision rates. 

In [29]:
## Try doing sentiment analysis with the TextBlob library. 
## How does the accuracy of TextBlob sentiments compare with Vader and the predictive model?
from textblob import TextBlob

textblob_test = test['review'].apply(lambda x: TextBlob(x).sentiment[0])

textblob_test_predict = (textblob_test > 0).astype(int)

print_metrics(y_test, textblob_test_predict, 'textblob sentiment analysis')



metrics for textblob sentiment analysis:
[[ 599   59]
 [ 712 1252]]
we had an accuracy of 0.7059496567505721, precision of 0.9549961861174676, and recall of 0.6374745417515275



# The textblob model worked pretty similar to Vader, but with a higher precision rate and lower recall rate. The textblob model had a precision rate and accuracy rate higher than our predictive models. 

In [39]:
## Run LDA topic modeling using gensim on the movie reviews. 

## How many topics are there? What are the most common words in each topic?

import gensim.corpora as corpora

ntopics = 3

# Create Dictionary
id2word = corpora.Dictionary(df['tokens'])

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in df['tokens']]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=ntopics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)
## Does this create bigram topics?

ldatopics = lda_model.show_topics(formatted=False)
print(lda_model.print_topics())


[(0, '0.039*"br" + 0.030*"movie" + 0.015*"film" + 0.012*"one" + 0.008*"like" + 0.007*"time" + 0.006*"good" + 0.005*"would" + 0.005*"get" + 0.005*"really"'), (1, '0.012*"killer" + 0.010*"cop" + 0.009*"police" + 0.006*"monster" + 0.006*"killing" + 0.006*"chase" + 0.005*"blood" + 0.005*"detective" + 0.005*"island" + 0.004*"rex"'), (2, '0.016*"film" + 0.008*"life" + 0.007*"performance" + 0.006*"story" + 0.005*"best" + 0.005*"role" + 0.004*"love" + 0.004*"wonderful" + 0.004*"great" + 0.004*"world"')]


In [40]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

# I first tried using 8 topics with my LDA visualizatoin, but that seemed like too many because there were overlapping ones. I decided that there were three topics.
# It looked like there was one for action films, one for drama/romance films, and one that talked about audience reactions to the films. 