Author: Dominique Grimes

Date: April 2, 2023

In [114]:
# Load libraries
import pandas as pd
from textblob import TextBlob
import re
import string
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Part 1: Using the TextBlob Sentiment Analyzer

## Import the movie review data as a data frame and ensure that the data is loaded properly.

In [80]:
# Load the data into a df. View the data to verify it loaded properly.
df = pd.read_csv('labeledTrainData.tsv',sep="\t")
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


## How many of each positive and negative reviews are there?

In [69]:
# Assuming positive is sentiment==1 and negative is sentiment == 0, there are 
# 12,500 each of positive and negative reviews.
df['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

## Use TextBlob to classify each movie review as positive or negative. Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

In [70]:
# Apply the TextBlob sentiment calculation to each review row in the df.
# Add a column that shows the TextBlob sentiment polarity and subjectivity.

def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

df['polarity'] = df['review'].apply(getPolarity)
df['subjectivity'] = df['review'].apply(getSubjectivity)

# Determine the count of polarity rows with positive and negative values.
# Assign new predict columns with values of 1 for positive and 0 for negative.
df['predict'] = df['polarity'].apply(lambda x:1 if x >= 0 else 0)
df.head()

Unnamed: 0,id,sentiment,review,polarity,subjectivity,predict
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,0.606746,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,0.531111,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0.562933,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,0.492901,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0.459818,0


In [71]:
# There are 19,017 positive reviews based on TextBlob.
# There are 5,983 negative reviews based on TextBlob.
df['predict'].value_counts()

1    19017
0     5983
Name: predict, dtype: int64

## Check the accuracy of this model. Is this model better than random guessing?

In [72]:
# Use accuracy score from sklearn to calculate accuracy of TextBlob model against given sentiment.
accuracy_score(df.sentiment,df.predict)

# Is this model better than random guessing?
# A random guess for binary is 50%. 
# Based on the accuracy score calculation, the Text Blob model is about 18% more accurate. 

0.68524

# Part 2: Prepping Text for a Custom Model

## Convert all text to lowercase letters.
## Remove punctuation and special characters from the text.
## Remove stop words.

In [81]:
# Use translate to remove punctuation and special characters.
df['review'] = df['review'].str.translate(str.maketrans('','', string.punctuation))

# Removing stop words with nltk and making lowercase.
stop = stopwords.words('english')
df['review'] = df['review'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [82]:
df.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,with stuff going moment mj ive started listeni...
1,2381_9,1,the classic war worlds timothy hines entertain...
2,7759_3,0,the film starts manager nicholas bell giving w...
3,3630_4,0,it must assumed praised film greatest filmed o...
4,9495_8,1,superbly trashy wondrously unpretentious 80s e...
5,8196_8,1,i dont know people think bad movie its got pre...
6,7166_2,0,this movie could good comes way short cheesy s...
7,10633_1,0,i watched video friends house im glad i waste ...
8,319_1,0,a friend mine bought film £1 even grossly over...
9,8713_10,1,br br this movie full references like mad max ...


## Apply NLTK’s PorterStemmer.

In [83]:
# Stem words in review column using NLTK's PorterStemmer

porter=PorterStemmer()

def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['review'] = df['review'].apply(stem_sentences)

In [84]:
# View df after stemming applied
df.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,with stuff go moment mj ive start listen music...
1,2381_9,1,the classic war world timothi hine entertain f...
2,7759_3,0,the film start manag nichola bell give welcom ...
3,3630_4,0,it must assum prais film greatest film opera e...
4,9495_8,1,superbl trashi wondrous unpretenti 80 exploit ...
5,8196_8,1,i dont know peopl think bad movi it got pretti...
6,7166_2,0,thi movi could good come way short cheesi spec...
7,10633_1,0,i watch video friend hous im glad i wast money...
8,319_1,0,a friend mine bought film £1 even grossli over...
9,8713_10,1,br br thi movi full refer like mad max ii the ...


## Create a bag-of-words matrix from your stemmed text, where each row is a word-count vector for a single movie review. Display the dimensions of your bag-of-words matrix.

In [107]:
# Create a Numpy array from the stemmed review column 
# to create text varible for bag-of-words matrix & tf-idf matrix
text_data=np.array(df['review'])

# Create the bag-of-words feature matrix.
count=CountVectorizer()
bag_of_words=count.fit_transform(text_data)

# View bag-of-words matrix dimensions
bag_of_words

<25000x92407 sparse matrix of type '<class 'numpy.int64'>'
	with 2569281 stored elements in Compressed Sparse Row format>

## Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews. Display the dimensions of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.

In [116]:
# Creat the tf-idf matrix
tfidf=TfidfVectorizer()
feature_matrix=tfidf.fit_transform(text_data)

# View tf-idf matrix dimensions
feature_matrix

<25000x92407 sparse matrix of type '<class 'numpy.float64'>'
	with 2569281 stored elements in Compressed Sparse Row format>