In [None]:
import numpy as np
import pandas as pd

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user, then build a PyDrive client (`drive`)
# from the application-default credentials. `drive` is used further down to
# fetch the review data file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
'''
downloaded = drive.CreateFile({'id':'1q9Yh9GorYkl_xf3O_P4zBbPYBXtTcuWx'}) 
downloaded.GetContentFile('moviereviews.tsv') 

df= pd.read_csv("moviereviews.tsv", sep='\t')

df.head()

'''

'\ndownloaded = drive.CreateFile({\'id\':\'1q9Yh9GorYkl_xf3O_P4zBbPYBXtTcuWx\'}) \ndownloaded.GetContentFile(\'moviereviews.tsv\') \n\ndf= pd.read_csv("moviereviews.tsv", sep=\'\t\')\n\ndf.head()\n\n'

# Sentiment Analysis
Now that we've seen word vectors we can start to investigate sentiment analysis. The goal is to find commonalities between documents, with the understanding that similarly *combined* vectors should correspond to similar sentiments.

While the scope of sentiment analysis is very broad, we will focus our work in three ways.

### 1. Polarity classification
We won't try to determine if a sentence is objective or subjective, fact or opinion. Rather, we care only if the text expresses a *positive*, *negative* or *neutral* opinion.
### 2. Document level scope
We'll also try to aggregate all of the sentences in a document or paragraph, to arrive at an overall opinion.
### 3. Coarse analysis
We won't try to perform a fine-grained analysis that would determine the degree of positivity/negativity. That is, we're not trying to guess how many stars a reviewer awarded, just whether the review was positive or negative.

## Broad Steps:
* First, consider the text being analyzed. A model trained on paragraph-long movie reviews might not be effective on tweets. Make sure to use an appropriate model for the task at hand.
* Next, decide the type of analysis to perform. In the previous section on text classification we used a bag-of-words technique that considered only single tokens, or *unigrams*. Some rudimentary sentiment analysis models go one step further, and consider two-word combinations, or *bigrams*. In this section, we'd like to work with complete sentences, and for this we're going to import a trained NLTK lexicon called *VADER*.

## NLTK's VADER module | Valence Aware Dictionary and sEntiment Reasoner
VADER is an NLTK module that provides sentiment scores based on words used ("completely" boosts a score, while "slightly" reduces it), on capitalization & punctuation ("GREAT!!!" is stronger than "great."), and negations (words like "isn't" and "doesn't" affect the outcome).
<br>To view the source code visit https://www.nltk.org/_modules/nltk/sentiment/vader.html

**Download the VADER lexicon.** You only need to do this once.

In [None]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

<div class="alert alert-danger">NOTE: At the time of this writing there's a <a href='https://github.com/nltk/nltk/issues/2053'>known issue</a> with SentimentIntensityAnalyzer that raises a harmless warning on loading<br>
<tt><font color=black>&emsp;UserWarning: The twython library has not been installed.<br>&emsp;Some functionality from the twitter package will not be available.</font></tt>

This is due to be fixed in an upcoming NLTK release. For now, if you want to avoid it you can (optionally) install the NLTK twitter library with<br>
<tt><font color=black>&emsp;conda install nltk[twitter]</font></tt><br>or<br>
<tt><font color=black>&emsp;pip3 install -U nltk[twitter]</font></tt></div>

In [None]:
# !pip3 install -U nltk[twitter]

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [None]:
!pip3 install -U nltk[twitter]

Requirement already up-to-date: nltk[twitter] in /usr/local/lib/python3.6/dist-packages (3.5)


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

VADER's `SentimentIntensityAnalyzer()` takes in a string and returns a dictionary of scores in each of four categories:
* negative [0,1]
* neutral  [0,1]
* positive [0,1]
* compound *(computed by summing the valence scores of the words in the text, adjusted by VADER's rules, and normalizing the result)* [-1,1]

In [None]:
a = 'This was a good movie.'
sid.polarity_scores(a)

{'compound': 0.4404, 'neg': 0.0, 'neu': 0.508, 'pos': 0.492}

In [None]:
a = 'This was the best, most awesome movie EVER MADE!!!'
sid.polarity_scores(a)

{'compound': 0.8877, 'neg': 0.0, 'neu': 0.425, 'pos': 0.575}

In [None]:
a = 'This was the worst film to ever disgrace the screen.'
sid.polarity_scores(a)

{'compound': -0.8074, 'neg': 0.477, 'neu': 0.523, 'pos': 0.0}

## Use VADER to analyze Amazon Reviews
For this exercise we're going to apply `SentimentIntensityAnalyzer` to a dataset of 10,000 Amazon reviews. Like our movie reviews datasets, these are labeled as either "pos" or "neg". At the end we'll determine the accuracy of our sentiment analysis with VADER.

In [None]:
import numpy as np
import pandas as pd

# Fetch the Amazon reviews file from Google Drive by its file id and save it
# locally as 'amazonreviews.tsv'
downloaded = drive.CreateFile({'id':'1kb-mL5Dl-5VoV-ZREdKqwG_FCWCXO1uj'}) 
downloaded.GetContentFile('amazonreviews.tsv') 

# The file is tab-separated, hence sep='\t'
df= pd.read_csv("amazonreviews.tsv", sep='\t')

df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [None]:
df.shape

(10000, 2)

In [None]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

### Clean the data:
Recall that our moviereviews.tsv file contained empty records. Let's check to see if any exist in amazonreviews.tsv.

In [None]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
df.dropna(inplace=True)

# Collect the index labels of reviews that consist only of whitespace.
# Only the 'review' column is iterated (instead of unpacking whole rows), so
# this keeps working no matter how many columns the DataFrame has; the
# isinstance check skips any value that is not a string.
blanks = [
    index
    for index, review in zip(df.index, df['review'])
    if isinstance(review, str) and review.isspace()
]

# Dropping an empty list is a no-op, so this is safe when nothing was found.
df.drop(blanks, inplace=True)

In [None]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [None]:
blanks # empty

# if blanks[] was not empty --> df.drop(blanks, inplace= True)

[]

In this case there were no empty records. Good!

## Let's run the first review through VADER

In [None]:
# Raw string repr of the first review (select the column, then the first position)
df['review'].iloc[0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [None]:
# Render the same review as a Markdown block quote: the text wraps and is
# easier to read than the raw string output above
from IPython.display import Markdown, display
display(Markdown('> ' + df.loc[0, 'review']))

> Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^

In [None]:
# Score the first review; a single .loc[row, column] lookup avoids chained indexing
sid.polarity_scores(df.loc[0, 'review'])

{'compound': 0.9454, 'neg': 0.088, 'neu': 0.669, 'pos': 0.243}

In [None]:
# Ground-truth label of the same review, for comparison with the score above
df.loc[0, 'label']

'pos'

Great! Our first review was labeled "positive", and earned a positive compound score.

## Adding Scores and Labels to the DataFrame
In this next section we'll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new "pos/neg" labels derived from the compound score. We'll use this last column to perform an accuracy test.

In [None]:

# Run every review through VADER; each cell of the new 'scores' column holds
# the full polarity-score dictionary for that review
df['scores'] = df['review'].apply(sid.polarity_scores)

df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [None]:
# The compound value is the one used for the pos/neg decision below, so pull
# it out of each score dictionary into its own column
df['compound'] = [score_dict['compound'] for score_dict in df['scores']]

df.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [None]:

# Translate the compound scores into labels stored in a new column:
# compound >= 0 -> 'pos', otherwise 'neg'.
# Note that a score of exactly 0 is counted as 'pos', because the dataset
# only has the two labels 'pos' and 'neg' to compare against.
df['comp_score'] = np.where(df['compound'] >= 0, 'pos', 'neg')

df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


## Report on Accuracy
Finally, we'll use scikit-learn to determine how close VADER came to our original 10,000 labels.

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Compare the ground-truth 'label' column with 'comp_score', the label we
# derived from VADER's compound score
accuracy_score(df['label'],df['comp_score'])

0.7091

In [None]:

print(classification_report(df['label'],df['comp_score']))

# VADER is not good at detecting sarcasm

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [None]:
print(confusion_matrix(df['label'],df['comp_score']))

# Rows are the true labels and columns are VADER's predictions, both in the order neg, pos:
# 2622 negative reviews correctly classified as negative
# 2475 negative reviews incorrectly classified as positive

# 434 positive reviews incorrectly classified as negative
# 4469 positive reviews correctly classified as positive

[[2622 2475]
 [ 434 4469]]


In [None]:
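# You can make the confusion matrix less confusing by adding labels:
#from sklearn import metrics
#cm_df = pd.DataFrame(metrics.confusion_matrix(df['label'],df['comp_score']), index=['negative','positive'], columns=['negative','positive'])
#cm_df

# Note: we never split the data here, so there is no y_test/predictions pair;
# the true labels are compared with VADER's labels instead. A separate name
# (cm_df) keeps the reviews DataFrame df from being overwritten.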

This tells us that VADER correctly identified an Amazon review as "positive" or "negative" roughly 71% of the time.
# Sentiment Analysis Project

## Task #1: Perform vector arithmetic on your own words
Write code that evaluates vector arithmetic on your own set of related words. The goal is to come as close to an expected word as possible.

In [None]:
!python -m spacy download en_core_web_lg
# !python -m spacy download en_vectors_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# Import spaCy and load the language library. Remember to use a larger model!
import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
# Choose the words you wish to compare, and obtain their vectors
word1 = nlp.vocab['wolf'].vector
word2 = nlp.vocab['dog'].vector
word3 = nlp.vocab['cat'].vector

In [None]:
# Import spatial and define a cosine_similarity function
from scipy import spatial

def cosine_similarity(x, y):
    """Return the cosine similarity of the vectors ``x`` and ``y``.

    ``spatial.distance.cosine`` returns the cosine *distance*, so it is
    subtracted from 1 to turn it back into a similarity.
    """
    return 1 - spatial.distance.cosine(x, y)

In [None]:
# Write an expression for vector arithmetic
# For example: new_vector = word1 - word2 + word3
new_vector = word1 - word2 + word3

In [None]:
# List the top ten closest vectors in the vocabulary to the result of the expression above.
# Only lowercase, alphabetic entries are compared, and entries without a
# vector are skipped (not every word in the spaCy vocabulary has one).
computed_similarities = [
    (lexeme, cosine_similarity(new_vector, lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_lower and lexeme.is_alpha
]

# Sorting on the negated similarity puts the most similar words first
computed_similarities.sort(key=lambda pair: -pair[1])

print([pair[0].text for pair in computed_similarities[:10]])

['wolf', 'wolves', 'panther', 'lynx', 'owl', 'tiger', 'lion', 'fox', 'cat', 'otter']


### CHALLENGE: Write a function that takes in 3 strings, performs a-b+c arithmetic, and returns a top-ten result

In [None]:
def vector_math(a, b, c, topn=10):
    """Return the vocabulary words closest to ``vector(a) - vector(b) + vector(c)``.

    Args:
        a, b, c: Words whose vectors are looked up in ``nlp.vocab``.
        topn: How many of the closest words to return. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A list of at most ``topn`` word strings, most similar first. Only
        lowercase, alphabetic vocabulary entries that have a vector are
        considered. The input words themselves are not excluded, so they may
        appear in the result.
    """
    new_vector = nlp.vocab[a].vector - nlp.vocab[b].vector + nlp.vocab[c].vector

    computed_similarities = [
        (word, cosine_similarity(new_vector, word.vector))
        for word in nlp.vocab
        if word.has_vector and word.is_lower and word.is_alpha
    ]

    # Sorting on the negated similarity puts the most similar words first.
    computed_similarities.sort(key=lambda item: -item[1])

    return [w[0].text for w in computed_similarities[:topn]]

In [None]:
# Test the function on known words:
vector_math('king','man','woman')

['king',
 'queen',
 'prince',
 'kings',
 'princess',
 'royal',
 'throne',
 'queens',
 'monarch',
 'kingdom']

## Task #2: Perform VADER Sentiment Analysis on your own review
Write code that returns a set of SentimentIntensityAnalyzer polarity scores based on your own written review.

In [None]:
# Import SentimentIntensityAnalyzer and create an sid object
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [None]:
# Write a review as one continuous string (multiple sentences are ok)
my_review = 'This movie portrayed real people, and was based on actual events.'

In [None]:
# Obtain the sid scores for your review
sid.polarity_scores(my_review)

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

### CHALLENGE: Write a function that takes in a review and returns a score of "Positive", "Negative" or "Neutral"

In [None]:
def review_rating(string):
    """Classify a review as 'Positive', 'Negative' or 'Neutral'.

    The decision is based solely on VADER's compound score for ``string``:
    exactly zero is 'Neutral', above zero is 'Positive', anything else is
    'Negative'.
    """
    compound = sid.polarity_scores(string)['compound']
    if compound == 0:
        return 'Neutral'
    return 'Positive' if compound > 0 else 'Negative'

In [None]:
# Test the function on your review above:
review_rating(my_review)

'Neutral'