In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_md')

In [7]:
# nlp(u'lion').vector

In [6]:
nlp('the quick brown fox jumped').vector.shape

(300,)

In [23]:
tokens = nlp(u"birthday birth born")

In [24]:
for token1 in tokens:
    for token2 in tokens:
        print(f'{token1.text:{10}} {token2.text:{10}}  {token1.similarity(token2):{10}}')

birthday   birthday           1.0
birthday   birth       0.45223161578178406
birthday   born        0.32044854760169983
birth      birthday    0.45223161578178406
birth      birth              1.0
birth      born        0.6610350608825684
born       birthday    0.32044854760169983
born       birth       0.6610350608825684
born       born               1.0


In [25]:
# Since these are all represented as vectors, we can perform arithmetic and calculate cosine similarity

from scipy import spatial

cosine_similarity = lambda vec1,vec2: 1 - spatial.distance.cosine(vec1,vec2)

In [27]:
king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

In [28]:
vec1 = king - man + woman

In [36]:
vec2 = nlp(u'queen').vector

In [37]:
cosine_similarity(vec1, vec2)

0.7880843877792358

In [44]:
# Checking all the similar words related to vec1

similar_words = []

for word in nlp.vocab:
    similarity = cosine_similarity(vec1, word.vector)
    if similarity > 0.5:
        similar_words.append((word, similarity))

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [45]:
print([t[0].text for t in similar_words])

['MAHARAJAS', 'WOMAN', 'Kingdom', 'King', 'Enthroned', 'Empresses', 'kingdome', 'maharajas', 'Prince', 'Kings', 'woma', 'royal', 'KING', 'Coronation', 'KINGLY', 'KINGDOM', 'KINDOM', 'Princesses', 'Queen', 'REGENCY', 'commoner', 'THRONE', 'queens', 'highness', 'PRINCELY', 'Businesswoman', 'SCEPTER', 'Kingdoms', 'QUEEN', 'princes', 'KINGS', 'QUEENS', 'Sultans', 'sceptre', 'ROYAL', 'LADY', 'KINGDOME', 'king', 'BUSINESSWOMAN', 'Regency', 'consort', 'HIGHNESS', 'PRETENDER', 'royals', 'kindom', 'PRINCE', 'womAn', 'CONSORT', 'Protista', 'plantae', 'protista', 'Plantae', 'SCEPTRE', 'WOman', 'Kindom', 'CORONATION', 'kumbia', 'pricess', 'Scepter', 'empresses', 'lady', 'PRINCES', 'scepter', 'Princes', 'woman', 'princess', 'Kingdome', 'PRINCESS', 'duchesses', 'princesses', 'coronations', 'queen', 'DUCAL', 'Princess', 'kings', 'princely', 'kingly', 'ducal', 'Consort', 'pretender', 'SULTANS', 'Kingly', 'Royals', 'regency', 'ROYALS', 'prince', 'KINGDOMS', 'coronation', 'PRINCESSES', 'kingdoms', 'Lady

# Sentiment analysis using VADER

In [46]:
import nltk

In [47]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to C:\Users\Chayan
[nltk_data]     Shrangraj\AppData\Roaming\nltk_data...


True

In [48]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [49]:
sid = SentimentIntensityAnalyzer()

In [50]:
a = "This is a good movie"

In [52]:
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [53]:
b = "This is the best movie I EVER WATCHED!!!"

In [55]:
sid.polarity_scores(b)

{'neg': 0.0, 'neu': 0.542, 'pos': 0.458, 'compound': 0.7249}

In [56]:
c = "The movie was the worst ever movie I watched"

In [57]:
sid.polarity_scores(c)

{'neg': 0.369, 'neu': 0.631, 'pos': 0.0, 'compound': -0.6249}

In [58]:
import pandas as pd

In [59]:
df = pd.read_csv('../UPDATED_NLP_COURSE/TextFiles/amazonreviews.tsv', delimiter='\t')

In [69]:
df.head(10)

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."
5,pos,an absolute masterpiece: I am quite sure any o...
6,neg,"Buyer beware: This is a self-published book, a..."
7,pos,Glorious story: I loved Whisper of the wicked ...
8,pos,A FIVE STAR BOOK: I just finished reading Whis...
9,pos,Whispers of the Wicked Saints: This was a easy...


In [75]:
len(df)

10000

In [66]:
# Positive review
df.review[1]

"The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."

In [68]:
# Negativ review
df.review[6]

'Buyer beware: This is a self-published book, and if you want to know why--read a few paragraphs! Those 5 star reviews must have been written by Ms. Haddon\'s family and friends--or perhaps, by herself! I can\'t imagine anyone reading the whole thing--I spent an evening with the book and a friend and we were in hysterics reading bits and pieces of it to one another. It is most definitely bad enough to be entered into some kind of a "worst book" contest. I can\'t believe Amazon even sells this kind of thing. Maybe I can offer them my 8th grade term paper on "To Kill a Mockingbird"--a book I am quite sure Ms. Haddon never heard of. Anyway, unless you are in a mood to send a book to someone as a joke---stay far, far away from this one!'

In [72]:
# Checking the null values
df.isnull().sum()

label     0
review    0
dtype: int64

In [74]:
df.label.value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [80]:

no_review = []
for i, lb, rv in df.itertuples():
    if rv.isspace():
        no_review.append(i)
        
no_review  # No review is empty

[]

In [85]:
sid.polarity_scores(df.review[6])

{'neg': 0.124, 'neu': 0.806, 'pos': 0.069, 'compound': -0.8744}

In [84]:
df['score'] = df.review.apply(lambda review: sid.polarity_scores(review)['compound'])

In [104]:
df.head(10)

Unnamed: 0,label,review,score,predicted labels
0,pos,Stuning even for the non-gamer: This sound tra...,0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",0.9781,pos
5,pos,an absolute masterpiece: I am quite sure any o...,0.99,pos
6,neg,"Buyer beware: This is a self-published book, a...",-0.8744,neg
7,pos,Glorious story: I loved Whisper of the wicked ...,0.9908,pos
8,pos,A FIVE STAR BOOK: I just finished reading Whis...,0.8353,pos
9,pos,Whispers of the Wicked Saints: This was a easy...,0.8196,pos


In [88]:
df['predicted labels'] = df.score.apply(lambda score: 'pos' if score>0 else 'neg')

In [93]:
len(df[df['label'] != df['predicted labels']])

2878

In [102]:
accuracy = 100 - (len(df[df['label'] != df['predicted labels']])/len(df)) * 100

In [110]:
accuracy

71.22

In [111]:
from sklearn.metrics import classification_report

In [112]:
print(classification_report(df.label, df['predicted labels']))

              precision    recall  f1-score   support

         neg       0.85      0.53      0.65      5097
         pos       0.65      0.90      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.72      0.70     10000
weighted avg       0.75      0.71      0.70     10000

