# Sentiment Analysis POC

* This notebook is a scratch pad, for sentiment analysis POC. 
* The idea is to try out few pre-trained sentiment analysis models and see which one works for our use case. 

## Installations

In [65]:
# # ## installing required libraries
# ! pip install beautifulsoup4
# ! pip install pandas
# ! pip install numpy
# ! pip install plotly
# ! pip install nbformat
# ! pip install ipykernel
# ! pip install matplotlip
# ! pip install wordcloud
# ! pip install gensim
# ! pip install pyLDAvis
# ! pip install nltk
# ! pip install -U pip setuptools wheel
# ! pip install -U spacy
# ! python -m spacy download en_core_web_trf 
# ! python -m spacy download en_core_web_md
# ! pip install joblib
# ! pip install tqdm
# ! pip install transformers




[notice] A new release of pip available: 22.2.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Importing Data

In [2]:
## lets load 
import pandas as pd
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup

import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as io

# loading library
import pickle

from joblib import dump, load

from tqdm.auto import tqdm

import os
import sys
sys.path.insert(0, os.path.abspath('../utils'))

## importing custom modules
import common_utils
import gensim_utils
import sklearn_utils

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
  from imp import reload
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [3]:
## reading manaully scrapped data
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


In [4]:
def preprocess_text(text):
     text = common_utils.clean_html(text)
     text = common_utils.lower_case(text)
     text = common_utils.remove_line_breaks(text)
     text = common_utils.remove_punctuation(text)
     text = common_utils.remove_numbers(text)
     text = common_utils.remove_extra_spaces(text)
     # text = remove_stopwords(text)
     return text

data["clean_text"] = data["text"].progress_apply(preprocess_text)

processing: 100%|██████████| 3972/3972 [00:01<00:00, 2542.73it/s]


In [5]:
data.head()["clean_text"]

0    former governor and first term democratic sen ...
1    president biden urged democrats on wednesday t...
2    the famous naked cowboy in new york citys time...
3    liberal groups in wisconsin seeking to change ...
4    texas gubernatorial nominee beto o’rourke is a...
Name: clean_text, dtype: object

## Sentiment Analysis Using VADER
(Valence Aware Dictionary and sEntiment Reasoner)

##### Notes
* Uses Bag of words approach
* Gives +tive, -tive or neutral values to each of the words in the sentence and then gives combined value of that to tell us whether the sentence is positive, negative or neutral
* Does not account for relationship between words :(

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer


In [7]:
example = data["text"][0]
example

'Former governor and first term Democratic Sen. Maggie Hassan of New Hampshire and Republican challenger Don Bolduc took aim at each other over inflation, abortion, national security, the border crisis, election denialism, and many more issues in their third and final debate in their crucial battleground state race that’s among a handful across the country that will likely determine if the GOP wins back the Senate majority. But ahead of the verbal crossfire on the debate stage, Bolduc – a former Army general who served ten tours of duty in the war in Afghanistan – was allegedly assaulted as he arrived at the debate site at Saint Anselm College’s New Hampshire Institute of Politics on Wednesday evening. According to the Bolduc campaign, a bystander standing in the crowd outside the debate site took a swing at the former general as he arrived. The campaign says Bolduc was slightly grazed but not injured.&nbsp; Rick Wiley of the Bolduc campaign tells Fox News the person who threw the punc

In [8]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [9]:
sia = SentimentIntensityAnalyzer()

In [16]:
sia.polarity_scores(example)

{'neg': 0.105, 'neu': 0.812, 'pos': 0.083, 'compound': -0.9869}

In [21]:
## running the sentiment analysis on entire dataset
print(data["title"][0])
sia.polarity_scores(data["title"][0])

Hassan and Bolduc trade fire in final showdown after GOP nominee comes under attack arriving at debate


{'neg': 0.268, 'neu': 0.732, 'pos': 0.0, 'compound': -0.6705}

In [27]:
## lets rename index to id
data.rename(columns={"index":"id"}, inplace=True)

In [57]:
results = {}
for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
    text = row["title"]
    id = idx
    results[id] = sia.polarity_scores(text)    

100%|██████████| 3972/3972 [00:00<00:00, 5999.72it/s]


In [58]:
results_df = pd.DataFrame.from_dict(results, orient='index')

In [59]:
combined_df = pd.concat([data, results_df], axis=1)
combined_df.head()

Unnamed: 0,title,description,url,last_published_date,authors,text,published_day,published_month,num_authors,author,word_count,line_count,clean_text,neg,neu,pos,compound
0,Hassan and Bolduc trade fire in final showdown...,A bystander took a swing at Republican Senate ...,https://www.foxnews.com/politics/hassan-bolduc...,2022-11-02 22:47:00-04:00,[{'name': 'Paul Steinhauser'}],Former governor and first term Democratic Sen....,2,11,1,Paul_Steinhauser,1271,62,former governor and first term democratic sen ...,0.268,0.732,0.0,-0.6705
1,Biden suggests voting for Republicans is a thr...,President Biden said the only way to repudiate...,https://www.foxnews.com/politics/biden-speech,2022-11-02 22:15:46-04:00,[{'name': 'Haris Alic'}],President Biden urged Democrats on Wednesday t...,2,11,1,Haris_Alic,478,22,president biden urged democrats on wednesday t...,0.298,0.702,0.0,-0.5267
2,NYC's Naked Cowboy makes endorsement for gov w...,New York City's Naked Cowboy endorsed Lee Zeld...,https://www.foxnews.com/politics/nyc-naked-cow...,2022-11-02 21:58:25-04:00,[{'name': 'Adam Sabes'}],The famous Naked Cowboy in New York City's Tim...,2,11,1,Adam_Sabes,205,18,the famous naked cowboy in new york citys time...,0.0,0.757,0.243,0.5423
3,Wisconsin courts shoot down liberal groups' at...,A Wisconsin appeals court and a circuit judge ...,https://www.foxnews.com/politics/wisconsin-cou...,2022-11-02 21:44:40-04:00,[{'name': 'Bradford Betz'}],Liberal groups in Wisconsin seeking to change ...,2,11,1,Bradford_Betz,381,20,liberal groups in wisconsin seeking to change ...,0.29,0.71,0.0,-0.5423
4,Texas gubernatorial candidate Beto O'Rourke jo...,Texas gubernatorial nominee Beto O’Rourke is t...,https://www.foxnews.com/politics/texas-guberna...,2022-11-02 20:38:30-04:00,[{'name': 'Bradford Betz'}],Texas gubernatorial nominee Beto O’Rourke is a...,2,11,1,Bradford_Betz,267,15,texas gubernatorial nominee beto o’rourke is a...,0.0,1.0,0.0,0.0


In [63]:
combined_df.loc[combined_df["pos"].idxmax(), "title"]

"Former President Trump celebrates 'ALL' endorsement wins in primary: 'Great candidates!'"

In [64]:
combined_df.describe()

Unnamed: 0,published_day,published_month,num_authors,word_count,line_count,neg,neu,pos,compound
count,3972.0,3972.0,3972.0,3972.0,3972.0,3972.0,3972.0,3972.0,3972.0
mean,16.787513,8.453424,1.119084,610.804884,31.652064,0.109221,0.812165,0.078617,-0.065043
std,8.934083,1.335676,0.447909,340.7851,19.232276,0.136235,0.158318,0.110806,0.394567
min,1.0,6.0,0.0,31.0,3.0,0.0,0.163,0.0,-0.9661
25%,9.0,7.0,1.0,399.0,21.0,0.0,0.701,0.0,-0.3612
50%,18.0,8.0,1.0,534.0,28.0,0.0,0.823,0.0,0.0
75%,25.0,10.0,1.0,734.0,37.0,0.194,1.0,0.155,0.1779
max,31.0,11.0,5.0,9672.0,647.0,0.837,1.0,0.668,0.936


## Roberta Model

In [90]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from scipy.special import softmax

In [91]:
## pulling a specific model pretrained on sentiment analysis
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [92]:
print(data["title"][1])
sia.polarity_scores(data["title"][1])

Biden suggests voting for Republicans is a threat to democracy


{'neg': 0.298, 'neu': 0.702, 'pos': 0.0, 'compound': -0.5267}

In [93]:
## running on roberta model
def get_roberta_sentiment(text):
    encoded_text = tokenizer(text, return_tensors="pt")
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ## scores are in order of negative, neutral and positive
    scores_dict = {"roberta_neg":scores[0], "roberta_neu":scores[1], "roberta_pos":scores[2]}
    return scores_dict


scores = get_roberta_sentiment(data["title"][1])
scores

{'roberta_neg': 0.66241485,
 'roberta_neu': 0.3247515,
 'roberta_pos': 0.012833751}

In [109]:
results = {}
for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
    try:
        text = row["text"]
        id = idx
        vader_results = sia.polarity_scores(text)
        vader_results_rename = {}
        for k,v in vader_results.items():
            vader_results_rename[f"vader_{k}"] = v
        roberta_results = get_roberta_sentiment(text)
        both = {**vader_results_rename, **roberta_results}
        results[idx] = both
    except RuntimeError:
        print(f"Broke for id {idx}")


  0%|          | 0/3972 [00:00<?, ?it/s]

Broke for id 0
Broke for id 1


  0%|          | 6/3972 [00:02<37:46,  1.75it/s]

Broke for id 6


  0%|          | 15/3972 [00:03<09:30,  6.93it/s]

Broke for id 8
Broke for id 9
Broke for id 10
Broke for id 11
Broke for id 12
Broke for id 13
Broke for id 14
Broke for id 15
Broke for id 16
Broke for id 17
Broke for id 18
Broke for id 19


  1%|          | 21/3972 [00:04<08:37,  7.63it/s]

Broke for id 21
Broke for id 22
Broke for id 23
Broke for id 24
Broke for id 25


  1%|          | 27/3972 [00:04<08:45,  7.51it/s]

Broke for id 27


  1%|          | 29/3972 [00:05<11:24,  5.76it/s]

Broke for id 29
Broke for id 30
Broke for id 31
Broke for id 32


  1%|          | 34/3972 [00:06<11:18,  5.81it/s]

Broke for id 34
Broke for id 35
Broke for id 36


  1%|          | 40/3972 [00:08<19:23,  3.38it/s]

Broke for id 40


  1%|          | 42/3972 [00:09<19:28,  3.36it/s]

Broke for id 42


  1%|          | 44/3972 [00:10<19:26,  3.37it/s]

Broke for id 44
Broke for id 45


  1%|          | 48/3972 [00:11<22:24,  2.92it/s]

Broke for id 48
Broke for id 49
Broke for id 50
Broke for id 51
Broke for id 52
Broke for id 53
Broke for id 54
Broke for id 55


  1%|▏         | 57/3972 [00:12<12:24,  5.26it/s]

Broke for id 57
Broke for id 58


  2%|▏         | 60/3972 [00:13<14:18,  4.56it/s]

Broke for id 60
Broke for id 61


  2%|▏         | 63/3972 [00:14<15:22,  4.24it/s]

Broke for id 63


  2%|▏         | 65/3972 [00:14<15:05,  4.32it/s]

Broke for id 65


  2%|▏         | 69/3972 [00:16<20:55,  3.11it/s]

Broke for id 69
Broke for id 70


  2%|▏         | 72/3972 [00:17<20:31,  3.17it/s]

Broke for id 72
Broke for id 73
Broke for id 74
Broke for id 75
Broke for id 76
Broke for id 77


  2%|▏         | 80/3972 [00:19<18:06,  3.58it/s]

Broke for id 80


  2%|▏         | 82/3972 [00:20<19:58,  3.24it/s]

Broke for id 82
Broke for id 83
Broke for id 84


  2%|▏         | 86/3972 [00:20<15:43,  4.12it/s]

Broke for id 86
Broke for id 87
Broke for id 88
Broke for id 89


  2%|▏         | 91/3972 [00:21<12:18,  5.26it/s]

Broke for id 91


  2%|▏         | 93/3972 [00:21<14:57,  4.32it/s]

Broke for id 93


  2%|▏         | 95/3972 [00:22<15:39,  4.13it/s]

Broke for id 95
Broke for id 96


  2%|▏         | 98/3972 [00:23<16:53,  3.82it/s]

Broke for id 98


  2%|▏         | 99/3972 [00:24<15:39,  4.12it/s]


KeyboardInterrupt: 

In [95]:
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos
0,0.268,0.732,0.000,-0.6705,0.214919,0.765167,0.019915
1,0.298,0.702,0.000,-0.5267,0.662415,0.324751,0.012834
2,0.000,0.757,0.243,0.5423,0.005948,0.538470,0.455582
3,0.290,0.710,0.000,-0.5423,0.346270,0.637059,0.016671
4,0.000,1.000,0.000,0.0000,0.056946,0.885563,0.057491
...,...,...,...,...,...,...,...
3967,0.152,0.691,0.157,0.0258,0.224884,0.703206,0.071910
3968,0.000,1.000,0.000,0.0000,0.570033,0.403358,0.026609
3969,0.080,0.747,0.172,0.3818,0.502582,0.481971,0.015446
3970,0.173,0.827,0.000,-0.3182,0.048049,0.920026,0.031925


In [102]:
final_df = pd.concat([data, results_df], axis=1)


In [105]:
final_df.loc[final_df["roberta_pos"].idxmax(), "title"]

"Crist praises Biden, says president is 'phenomenal' and he 'can't wait' to have his support in Florida"

In [107]:
final_df.loc[final_df["vader_pos"].idxmax(), "title"]

"Former President Trump celebrates 'ALL' endorsement wins in primary: 'Great candidates!'"