Dan Clayton<br>
DSC-550 Exercise 3.2<BR>


In [1]:
#Import the needed libraries
import pandas as pd
import numpy as np
from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [2]:
#Part 1: Using the TextBlob Sentiment Analyzer
#Load the tab-separated movie review data into a data frame and preview the first rows to confirm it loaded properly.
df = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
df.head(n=10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [3]:
#How many of each positive and negative reviews are there?
sentiment_counts = df[['sentiment', 'review']].groupby('sentiment').count()
print(sentiment_counts)

#Conclusion--there are 12,500 positive and 12,500 negative reviews

           review
sentiment        
0           12500
1           12500


In [4]:
#Install the textblob and nltk packages if they are missing (uncomment the pip lines below)
import sys
#!{sys.executable} -m pip install -U textblob
#!{sys.executable} -m pip install nltk

In [5]:
#Use TextBlob to classify each movie review as positive or negative. 
#Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

def get_sentiment(movie_review):
    """Classify a movie review with TextBlob.

    Parameters:
        movie_review: the review text handed to TextBlob.

    Returns:
        1 (positive) when the TextBlob polarity score is >= 0,
        0 (negative) when it is below 0.
    """
    #Build a text blob of the movie review and read its polarity once
    polarity = TextBlob(movie_review).sentiment.polarity

    #A polarity of exactly zero counts as positive, per the assignment
    return 1 if polarity >= 0 else 0

#Label every review with the TextBlob-based classifier defined above
df['blob_sentiment'] = df['review'].map(get_sentiment)
    

In [6]:
#Check the accuracy of this model. Is this model better than random guessing?
blob_hits = df[df['sentiment'] == df['blob_sentiment']]
print(len(blob_hits) / len(df))

#Yes--blob sentiment was correct 68.5% of the time--better than 50/50 random guessing

0.68524


In [7]:
#For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).
import nltk
nltk.downloader.download('vader_lexicon')
import nltk.corpus
from nltk.tokenize import word_tokenize # Passing the string text into word tokenize for bre



#Shared analyzer, created on first use so the VADER lexicon is loaded only once
#instead of once per review
_vader_analyzer = None

def get_vader_sentiment(review):
    """Classify a review with NLTK's VADER sentiment analyzer.

    Parameters:
        review: the review text to score.

    Returns:
        1 (positive) when the VADER compound score is >= 0,
        0 (negative) when it is below 0.
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    #Calculate the sentiment for this review
    sentiment_dict = _vader_analyzer.polarity_scores(review)

    #A compound score of exactly zero counts as positive, matching the TextBlob rule
    return 1 if sentiment_dict['compound'] >= 0 else 0

#print(get_vader_sentiment(df['review'].iloc[0]))

#Label every review with the VADER-based classifier defined above
df['vader_sentiment'] = df['review'].map(get_vader_sentiment)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dancl\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
#Check to see how accurate the vader sentiment was:
vader_hits = df[df['sentiment'] == df['vader_sentiment']]
print(len(vader_hits) / len(df))

#At 69.2% the Vader sentiment was slightly more accurate than the blob sentiment!

0.69216


In [9]:
#Part 2: Prepping Text for a Custom Model

In [10]:
#Convert all text to lowercase letters.
lowercase_reviews = df['review'].str.lower()
df['review'] = lowercase_reviews

In [12]:
#Remove punctuation and special characters from the text.
#regex=True is required: pandas 2.0+ treats the pattern as a literal string by default,
#which would silently leave every review unchanged. The raw string keeps \w and \s from
#being read as (invalid) Python string escapes.
df['review'] = df['review'].str.replace(r'[^\w\s]', '', regex=True)

In [15]:
#Remove stop words.
from nltk.corpus import stopwords
nltk.download('punkt')
#Newer NLTK releases (3.9+) load the tokenizer models from 'punkt_tab' rather than 'punkt',
#so fetch both to keep word_tokenize working across versions
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
#A set gives constant-time membership checks while filtering tokens
stop_words = set(stopwords.words('english'))


#Function to remove stop words from the string given
def rem_stops(txt):
    """Tokenize txt and return the list of tokens that are not in stop_words."""
    kept = []
    for token in word_tokenize(txt):
        #Matching ignores case; the token itself is kept unchanged
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

#Replace each review with its list of non-stop-word tokens
df['review'] = df['review'].map(rem_stops)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dancl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dancl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
#Apply NLTK’s PorterStemmer.
#The wildcard imports below are what bring PorterStemmer into scope.
#NOTE(review): an explicit `from nltk.stem import PorterStemmer` would be clearer -- confirm
#nothing else relies on the other wildcard names before changing it
from nltk.stem import *
from nltk.stem.porter import *
#One stemmer instance, created once at module level
stemmer = PorterStemmer()

def get_singles(txt):
    """Return a list holding the stem of every token in txt, in the same order."""
    return list(map(stemmer.stem, txt))

df['review'] = df['review'].map(get_singles)


In [29]:
#Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector 
#for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). 
#Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as 
#the number of rows in your original data frame.

#df['review'].head(10)

#Import the necessary library
from sklearn.feature_extraction.text import CountVectorizer

#Instantiate the vectorizer object
count = CountVectorizer()

#Each review is a list of stemmed tokens at this point, but the vectorizer expects one
#string per document, so join the tokens back together with spaces
bow_docs = df['review'].apply(lambda doc: doc if isinstance(doc, str) else ' '.join(doc))

#Fit ONE vocabulary on the whole corpus so every review is counted against the same columns.
#Fitting inside the per-review function would treat each token as its own document and give
#every review a different, incomparable vocabulary.
bow_matrix = count.fit_transform(bow_docs)

def get_word_vector(review):
    """Return the word-count row for a single review.

    Parameters:
        review: a list of tokens or an already-joined string.

    Returns:
        A 1 x vocabulary-size sparse matrix built with the corpus-level vocabulary
        fitted above. It is kept sparse because a dense row per review over the
        whole vocabulary would not fit in memory.
    """
    text = review if isinstance(review, str) else ' '.join(review)
    return count.transform([text])

#Keep a per-review word-count row in the dataframe
df['bow'] = df['review'].apply(get_word_vector)

#Display the dimensions of the bag-of-words matrix (rows = reviews, columns = vocabulary terms)
print(bow_matrix.shape)

#The number of rows should match the number of rows in the original data frame
print(len(df['bow']))


25000


In [36]:
#Pull the 'bow' column out as a series
s = pd.Series(df['bow'])

#Create a NumPy array (a copy) from the series
df_array = s.to_numpy(copy=True)

#Show the array stats
print('Array length: ', df_array.shape[0])
print(type(df_array))


Array length:  25000
<class 'numpy.ndarray'>


In [43]:
#Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie 
#reviews (see section 6.9 in the Machine Learning with Python Cookbook). 
#Display the dimensions of your tf-idf matrix. These dimensions should be the same as your bag-of-words matrix.

#Import the needed library
from sklearn.feature_extraction.text import TfidfVectorizer

#Create the vectorizer
tfidf = TfidfVectorizer()

#Each review is a list of stemmed tokens at this point, but the vectorizer expects one
#string per document, so join the tokens back together with spaces
tfidf_docs = df['review'].apply(lambda doc: doc if isinstance(doc, str) else ' '.join(doc))

#Fit once on the whole corpus: inverse document frequency only means something when it is
#computed across all reviews. Fitting per review would treat each token as a document.
tfidf_matrix = tfidf.fit_transform(tfidf_docs)

def get_fmatrix(txt):
    """Return the tf-idf row for a single review.

    Parameters:
        txt: a list of tokens or an already-joined string.

    Returns:
        A 1 x vocabulary-size sparse matrix weighted with the corpus-level
        vocabulary and IDF values fitted above.
    """
    text = txt if isinstance(txt, str) else ' '.join(txt)
    return tfidf.transform([text])

#Keep a per-review tf-idf row in the dataframe
df['f_matrix'] = df['review'].apply(get_fmatrix)

#Display the dimensions of the tf-idf matrix (rows = reviews, columns = vocabulary terms)
print(tfidf_matrix.shape)


In [45]:
#Converting the tf-idf column to an array, per the instructions
#Pull the 'f_matrix' column out as a series
s = pd.Series(df['f_matrix'])

#Create a NumPy array (a copy) from the series
fm_array = s.to_numpy(copy=True)

#Show the array stats
print('Array length: ', fm_array.shape[0])
print(type(fm_array))


Array length:  25000
<class 'numpy.ndarray'>
