# Part 1: Existing Machine Learning Services
### Use Google's Cloud Natural Language API to analyze the sentiment of movie reviews

In [1]:
# download labelled movie reviews
# `download` is called once for the 'movie_reviews' package; its return value
# is the cell's last expression, which is why `True` is echoed below the
# download log.
from nltk import download
download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [2]:
# load the data
from nltk.corpus import movie_reviews

# collect the raw text of every review, one list per sentiment category
# ('pos' and 'neg' are the category names passed to the corpus reader)
reviews_pos = [movie_reviews.raw(fid) for fid in movie_reviews.fileids('pos')]

reviews_neg = [movie_reviews.raw(fid) for fid in movie_reviews.fileids('neg')]

In [3]:
# report how many reviews were loaded for each category
for caption, corpus in (
    ('# of positive reviews:', reviews_pos),
    ('# of negative reviews:', reviews_neg),
):
    print(caption, len(corpus))

# of positive reviews: 1000
# of negative reviews: 1000


#### Run all the reviews through Google's NLP API once, and save the results so that later runs can reuse them

In [57]:
# use Google NLP sentiment analysis
# NOTE(review): the `enums` and `types` submodules are presumably only
# available in google-cloud-language releases before 2.0 — confirm the
# installed version before re-running this cell.
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# instantiates a client
# `client` is a notebook-level name; get_score() reads it when it calls
# analyze_sentiment, so this cell has to run before any scoring cell.
# NOTE(review): presumably picks up Google Cloud credentials from the
# environment — confirm how authentication is configured.
client = language.LanguageServiceClient()

In [12]:
# get numerical sentiment results
def get_score(review):
    """Return the (score, magnitude) pair the sentiment API reports for `review`.

    `review` is sent as the content of a PLAIN_TEXT document through the
    notebook-level `client`; the two values are read from the response's
    `document_sentiment` field.
    """
    doc = types.Document(
        content=review,
        type=enums.Document.Type.PLAIN_TEXT,
    )
    response = client.analyze_sentiment(document=doc)
    overall = response.document_sentiment
    return overall.score, overall.magnitude

In [15]:
# one get_score() call per review; each entry is a (score, magnitude) pair
scores_pos = [get_score(text) for text in reviews_pos]

scores_neg = [get_score(text) for text in reviews_neg]

In [47]:
# save scores in csv format
import pandas as pd

# both frames share the same two columns, in the order get_score() returns them
score_columns = ['score', 'magnitude']
df_pos = pd.DataFrame(scores_pos, columns=score_columns)
df_neg = pd.DataFrame(scores_neg, columns=score_columns)

# the row index is written as the first CSV column (to_csv default)
df_pos.to_csv('pos.csv')
df_neg.to_csv('neg.csv')

Unnamed: 0.1,Unnamed: 0,score,magnitude
0,0,-0.1,10.4
1,1,0.0,18.200001
2,2,0.0,10.1


#### From now on, we can simply load the saved results

In [5]:
# import saved scores
import pandas as pd

# Pick the two score columns by name instead of relying on column position.
# Later cells read the score as element 0 of each row, so a stray index column
# in the file (e.g. 'Unnamed: 0', left behind when a CSV is re-saved together
# with its index) would otherwise silently end up in that position.
score_columns = ['score', 'magnitude']
df_pos = pd.read_csv('pos.csv', usecols=score_columns)[score_columns]
df_neg = pd.read_csv('neg.csv', usecols=score_columns)[score_columns]

# each row becomes a [score, magnitude] list
scores_pos = df_pos.values.tolist()
scores_neg = df_neg.values.tolist()

In [6]:
# label the sentiment of each review
def score_review(score, threshold=0):
    """Map a numeric sentiment score to a 'pos' / 'neg' label.

    Parameters
    ----------
    score : number
        Sentiment score of one review.
    threshold : number, optional
        Smallest score that is still labelled 'pos'. Defaults to 0, so a
        score of exactly 0 counts as positive.

    Returns
    -------
    str or None
        'pos' when ``score >= threshold``, 'neg' when ``score < threshold``,
        and None when neither comparison holds (e.g. ``score`` is NaN).
    """
    if score >= threshold:
        return 'pos'
    if score < threshold:
        return 'neg'
    # Neither comparison is true (NaN compares false both ways): no label.
    return None

In [7]:
# label every review from the first element of its score row
results_pos = [score_review(row[0]) for row in scores_pos]

results_neg = [score_review(row[0]) for row in scores_neg]

In [8]:
# accuracy of the classification: a label is correct when it matches the
# category the review was loaded from
n_pos, n_neg = len(results_pos), len(results_neg)

correct_pos = results_pos.count('pos')
correct_neg = results_neg.count('neg')
correct_all = correct_pos + correct_neg

accuracy_pos = correct_pos / float(n_pos)
accuracy_neg = correct_neg / float(n_neg)
accuracy_all = correct_all / float(n_pos + n_neg)

for template, accuracy in (
    ('Positive reviews: {}% correct', accuracy_pos),
    ('Negative reviews: {}% correct', accuracy_neg),
    ('Overall accuracy: {}% correct', accuracy_all),
):
    print(template.format(accuracy*100))

Positive reviews: 83.1% correct
Negative reviews: 87.8% correct
Overall accuracy: 85.45% correct
