# NLP Sentiment Analysis

In [3]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [12]:
#os.listdir('E:/analytics/projects/dsprojects/data/txt/')
os.listdir('../data/txt/')

['HP1.txt']

In [13]:
# data as text file
#Initiate an empty list
data = []
#Open the text file and add each of the line as components in the list. Also empty lines are not appended.
with open('../data/txt/HP1.txt',"r") as myfile:
    for line in myfile:
        line = line.replace('\n', '')
        if line == '':
            continue
        else:
            data.append(str(line))   
myfile.close()

In [19]:
data

["Harry Potter and the Sorcerer's Stone ",
 'CHAPTER ONE ',
 'THE BOY WHO LIVED ',
 "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. ",
 'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. ',
 "The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out 

In [14]:
# additional libraries
#Import the libraries for preprocessing, stemming, lemmatization, removing english stopwords & punctuations
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter=PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))

import string
exclude = set(string.punctuation)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\du\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\du\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
#Create a single function to preprocess
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(wordnet_lemmatizer.lemmatize(word) for word in punc_free.split())
    return normalized

In [21]:
doc_clean = [clean(line).split() for line in data] 

In [22]:
doc_clean

[['harry', 'potter', 'sorcerer', 'stone'],
 ['chapter', 'one'],
 ['boy', 'lived'],
 ['mr',
  'mr',
  'dursley',
  'number',
  'four',
  'privet',
  'drive',
  'proud',
  'say',
  'perfectly',
  'normal',
  'thank',
  'much',
  'last',
  'people',
  'expect',
  'involved',
  'anything',
  'strange',
  'mysterious',
  'hold',
  'nonsense'],
 ['mr',
  'dursley',
  'director',
  'firm',
  'called',
  'grunnings',
  'made',
  'drill',
  'big',
  'beefy',
  'man',
  'hardly',
  'neck',
  'although',
  'large',
  'mustache',
  'mr',
  'dursley',
  'thin',
  'blonde',
  'nearly',
  'twice',
  'usual',
  'amount',
  'neck',
  'came',
  'useful',
  'spent',
  'much',
  'time',
  'craning',
  'garden',
  'fence',
  'spying',
  'neighbor',
  'dursleys',
  'small',
  'son',
  'called',
  'dudley',
  'opinion',
  'finer',
  'boy',
  'anywhere'],
 ['dursleys',
  'everything',
  'wanted',
  'also',
  'secret',
  'greatest',
  'fear',
  'somebody',
  'would',
  'discover',
  'it',
  'think',
  'could',

### Topic Modeling
- LDA - In natural language processing, latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar.
- LDA is a three-level hierarchical Bayesian model, in which each item of a collection is modeled as a finite mixture over an underlying set of topics. Each topic is, in turn, modeled as an infinite mixture over an underlying set of topic probabilities. In the context of text modeling, the topic probabilities provide an explicit representation of a document.
- The LDA model is highly modular and can extended easily. It can be used effectively to quantify the relationship between topics in a document.

In [24]:
#Import the Gensim libraries
import gensim
from gensim import corpora

In [25]:
dict2 = corpora.Dictionary(doc_clean)

In [26]:
#Create the corpora matrix
doc_term_matrix = [dict2.doc2bow(doc) for doc in doc_clean]

In [27]:
#Initiate the model
Lda = gensim.models.ldamodel.LdaModel

In [29]:
#Fit the model with corpora matrix with dictionary for 20 topics
ldamodel = Lda(doc_term_matrix, num_topics=20, id2word = dict2, passes=50)

In [30]:
#Check the results for 20 topics
for idx, topic in ldamodel.print_topics(-1):    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.019*"harry" + 0.011*"scar" + 0.011*"went" + 0.011*"pointing" + 0.010*"figure" + 0.009*"forehead" + 0.008*"like" + 0.008*"foot" + 0.008*"past" + 0.008*"jumped"
Topic: 1 Word: 0.014*"granger" + 0.013*"shaking" + 0.013*"died" + 0.013*"mean" + 0.012*"fast" + 0.010*"handle" + 0.009*"murmured" + 0.009*"knew" + 0.009*"attention" + 0.009*"parent"
Topic: 2 Word: 0.020*"harry" + 0.014*"back" + 0.014*"room" + 0.013*"hermione" + 0.012*"ron" + 0.009*"them" + 0.008*"went" + 0.008*"harrys" + 0.007*"even" + 0.007*"filch"
Topic: 3 Word: 0.043*"harry" + 0.022*"him" + 0.014*"get" + 0.012*"ron" + 0.012*"back" + 0.010*"them" + 0.010*"hermione" + 0.010*"see" + 0.009*"head" + 0.009*"could"
Topic: 4 Word: 0.034*"house" + 0.024*"whats" + 0.016*"point" + 0.015*"out" + 0.012*"long" + 0.011*"fifty" + 0.010*"end" + 0.009*"gryffindor" + 0.009*"quickly" + 0.009*"peeve"
Topic: 5 Word: 0.027*"harry" + 0.015*"would" + 0.015*"quidditch" + 0.015*"hed" + 0.014*"never" + 0.014*"point" + 0.013*"gryffindor" 

In [31]:
#Check the score for a random sentence (100th) in the book
example = doc_clean[100]
print(example,'\n')

['hagrid', 'said', 'dumbledore', 'sounding', 'relieved', 'at', 'last', 'get', 'motorcycle'] 



In [33]:
example_lda = dict2.doc2bow(example)

In [34]:
for index, score in sorted(ldamodel[example_lda], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, ldamodel.print_topic(index, 5)))

Score: 0.5066142678260803	 Topic: 0.028*"harry" + 0.026*"looked" + 0.019*"face" + 0.014*"eye" + 0.013*"like"
Score: 0.40336909890174866	 Topic: 0.057*"said" + 0.037*"you" + 0.025*"it" + 0.024*"know" + 0.020*"got"


In [38]:
#!pip install pyLDAvis

In [39]:
#Visualize the lda model with interactive plots
import pyLDAvis
import pyLDAvis.gensim 

In [44]:
#lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dict2, sort_topics=False)
#pyLDAvis.display(lda_display)

In [45]:
doc_term_matrix

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1)],
 [(6, 1), (7, 1)],
 [(8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 2),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1)],
 [(6, 1),
  (10, 2),
  (16, 2),
  (17, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1)],
 [(1, 5),
  (6, 1),
  (10, 1),
  (16, 3),
  (26, 1),
  (40, 1),
  (41, 4),
  (54, 1),
  (56, 1),
  (57, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78,

In [46]:
for idx, topic in ldamodel.print_topics(-1):
    print('Topic : {} Word : {}'.format(idx, topic))

Topic : 0 Word : 0.019*"harry" + 0.011*"scar" + 0.011*"went" + 0.011*"pointing" + 0.010*"figure" + 0.009*"forehead" + 0.008*"like" + 0.008*"foot" + 0.008*"past" + 0.008*"jumped"
Topic : 1 Word : 0.014*"granger" + 0.013*"shaking" + 0.013*"died" + 0.013*"mean" + 0.012*"fast" + 0.010*"handle" + 0.009*"murmured" + 0.009*"knew" + 0.009*"attention" + 0.009*"parent"
Topic : 2 Word : 0.020*"harry" + 0.014*"back" + 0.014*"room" + 0.013*"hermione" + 0.012*"ron" + 0.009*"them" + 0.008*"went" + 0.008*"harrys" + 0.007*"even" + 0.007*"filch"
Topic : 3 Word : 0.043*"harry" + 0.022*"him" + 0.014*"get" + 0.012*"ron" + 0.012*"back" + 0.010*"them" + 0.010*"hermione" + 0.010*"see" + 0.009*"head" + 0.009*"could"
Topic : 4 Word : 0.034*"house" + 0.024*"whats" + 0.016*"point" + 0.015*"out" + 0.012*"long" + 0.011*"fifty" + 0.010*"end" + 0.009*"gryffindor" + 0.009*"quickly" + 0.009*"peeve"
Topic : 5 Word : 0.027*"harry" + 0.015*"would" + 0.015*"quidditch" + 0.015*"hed" + 0.014*"never" + 0.014*"point" + 0.013*"

In [51]:
example = doc_clean[100]
print(example, '\n')
example_lda = dict2.doc2bow(example)
for index, score in sorted(ldamodel[example_lda], key=lambda tup: -1*tup[1]):
    print("Score  {} \t Topic : {}".format(score, ldamodel.print_topic(index, 5)))

['hagrid', 'said', 'dumbledore', 'sounding', 'relieved', 'at', 'last', 'get', 'motorcycle'] 

Score  0.5068292021751404 	 Topic : 0.028*"harry" + 0.026*"looked" + 0.019*"face" + 0.014*"eye" + 0.013*"like"
Score  0.4031541049480438 	 Topic : 0.057*"said" + 0.037*"you" + 0.025*"it" + 0.024*"know" + 0.020*"got"


## spacy Topic Modeling
- Topic model is a type of statistical model for discovering the abstract “topics” that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body.
- use spaCy for tagging, parsing and entity recognition

In [53]:
#!pip install spacy

In [54]:
import spacy
from spacy import displacy
from spacy.gold import GoldParse, Doc
from spacy.vocab import Vocab
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [60]:
data[1], data[2], data[3]

('CHAPTER ONE ',
 'THE BOY WHO LIVED ',
 "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. ")

In [67]:
#Create a longer sentence. Since we need a paragraph!
sentences = data[4]+data[5]+data[6]+data[7]+data[8]+data[9]
sentences

'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn\'t think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley\'s sister, but they hadn\'t met for several years; in fact, Mrs. Dursley pretended she didn\'t have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what the neighbors would say if the Potters arrived in the street. 

In [62]:
#Only looking at tagging nouns
vocab = Vocab(tag_map={'DL': {'pos': 'NOUN'}})
doc = Doc(Vocab(), words=['Harry', 'Dursley', 'Dudley','Petunia' ])
gold = GoldParse(doc, entities=['DL', 'DL', 'DL', 'DL', 'DL'])

In [69]:
article = nlp(sentences)
labels = [x.label_ for x in article.ents]
items = [x.text for x in article.ents]
Counter(items).most_common(5)

[('Dursley', 14),
 ('Dudley', 5),
 ('Dursleys', 4),
 ('Privet Drive', 2),
 ('Grunnings', 1)]

In [68]:
[(x.orth_,x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences)) if not y.is_stop and y.pos_ != 'PUNCT']]

[('Mr.', 'PROPN', 'Mr.'),
 ('Dursley', 'PROPN', 'Dursley'),
 ('director', 'NOUN', 'director'),
 ('firm', 'NOUN', 'firm'),
 ('called', 'VERB', 'call'),
 ('Grunnings', 'PROPN', 'Grunnings'),
 ('drills', 'NOUN', 'drill'),
 ('big', 'ADJ', 'big'),
 ('beefy', 'ADJ', 'beefy'),
 ('man', 'NOUN', 'man'),
 ('hardly', 'ADV', 'hardly'),
 ('neck', 'NOUN', 'neck'),
 ('large', 'ADJ', 'large'),
 ('mustache', 'NOUN', 'mustache'),
 ('Mrs.', 'PROPN', 'Mrs.'),
 ('Dursley', 'PROPN', 'Dursley'),
 ('thin', 'ADJ', 'thin'),
 ('blonde', 'ADJ', 'blonde'),
 ('nearly', 'ADV', 'nearly'),
 ('twice', 'DET', 'twice'),
 ('usual', 'ADJ', 'usual'),
 ('neck', 'NOUN', 'neck'),
 ('came', 'VERB', 'come'),
 ('useful', 'ADJ', 'useful'),
 ('spent', 'VERB', 'spend'),
 ('time', 'NOUN', 'time'),
 ('craning', 'VERB', 'crane'),
 ('garden', 'NOUN', 'garden'),
 ('fences', 'NOUN', 'fence'),
 ('spying', 'VERB', 'spy'),
 ('neighbors', 'NOUN', 'neighbor'),
 ('Dursleys', 'PROPN', 'Dursleys'),
 ('small', 'ADJ', 'small'),
 ('son', 'NOUN', 'so

In [70]:
dict([(str(x), x.label_) for x in nlp(str(sentences[:])).ents])

{'Dursley': 'PERSON',
 'Grunnings': 'ORG',
 'Dursleys': 'PERSON',
 'Dudley': 'PERSON',
 'Potter': 'PERSON',
 'several years': 'DATE',
 'Potters': 'PERSON',
 'Tuesday': 'DATE',
 'half past eight': 'CARDINAL',
 'four': 'CARDINAL',
 'first': 'ORDINAL',
 'second': 'ORDINAL',
 'Privet Drive': 'LOC',
 'that day': 'DATE'}

In [71]:
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')

  from IPython.core.display import display, HTML


In [84]:
from IPython.display import display, HTML

In [85]:
displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 100})
#not working

  from IPython.core.display import display, HTML


-  Other than LDA and spaCy, one can also use nltk library to tokenize and tag for topic modelling
-  https://www.nltk.org/book/ch05.html

# Text Blob
-  TextBlob is versatile and can be used for speech tagging, tokenization, lemmatization, translation, spelling correction, word dictionary and corpora creation, parsing and n-grams splitting for machine learning uses

In [87]:
#!pip install textblob

In [88]:
from textblob import TextBlob

In [89]:
for sentence in data[4:8]:
    print(sentence, '\n')
    senti = TextBlob(sentence)
    print(senti.sentiment, '\n')

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.  

Sentiment(polarity=-0.02485347985347985, subjectivity=0.36144688644688644) 

The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered t

-  Polarity is a value between range [-1.0, 1.0] and indicates whether the sentence is negative or positive. Nearer to zero means it is neutral. Subjectivity ranges from [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

## Vader
-  Vader comes from nltk and is another good tool for sentiment analysis

In [91]:
#!pip install vaderSentiment

In [92]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [94]:
analyzer = SentimentIntensityAnalyzer()

In [95]:
for sentence in data[4:8]:
    print(sentence , '\n')
    senti = analyzer.polarity_scores(sentence)
    print(senti, '\n')

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere.  

{'neg': 0.025, 'neu': 0.939, 'pos': 0.036, 'compound': 0.2484} 

The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be. The Dursleys shuddered to think what

-  Vader takes capital and exclamation marks into account which really adds value in sentiment analysis of online feedback, twitter comments etc. The scores can be used to create features for machine learning prediction models.

## nltk

In [96]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [97]:
sentences = ['MODI is smart, handsome, and funny.',
              'AMIT is very smart, handsome, and funny',
              'GADGARI is VERY SMART, handsome, and FUNNY',
              'SMRITI is very SMART, really handsome, and INCREDIBILY FUNNY !!!',
              'RAVAN is SHIT, readly ugly, and repugnant !!!']
sentences

['MODI is smart, handsome, and funny.',
 'AMIT is very smart, handsome, and funny',
 'GADGARI is VERY SMART, handsome, and FUNNY',
 'SMRITI is very SMART, really handsome, and INCREDIBILY FUNNY !!!',
 'RAVAN is SHIT, readly ugly, and repugnant !!!']

In [98]:
analyzer = SentimentIntensityAnalyzer()

In [99]:
for sentence in sentences:
    print(sentence)
    senti = analyzer.polarity_scores(sentence)
    print(senti, '\n')

MODI is smart, handsome, and funny.
{'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316} 

AMIT is very smart, handsome, and funny
{'neg': 0.0, 'neu': 0.299, 'pos': 0.701, 'compound': 0.8545} 

GADGARI is VERY SMART, handsome, and FUNNY
{'neg': 0.0, 'neu': 0.246, 'pos': 0.754, 'compound': 0.9227} 

SMRITI is very SMART, really handsome, and INCREDIBILY FUNNY !!!
{'neg': 0.0, 'neu': 0.369, 'pos': 0.631, 'compound': 0.9184} 

RAVAN is SHIT, readly ugly, and repugnant !!!
{'neg': 0.586, 'neu': 0.414, 'pos': 0.0, 'compound': -0.8594} 

