URL https://towardsdatascience.com/social-media-sentiment-analysis-in-python-with-vader-no-training-required-4bc6a21e87b8

Credit and thanks to:
Zoumana Keita

In [1]:
#VADER - Valence Aware Dictionary sEntiment Reasoner
#a lexicon and simple rule-based model for sentiment analysis
#assess the sentiment without any pre-training
#result pos, neg, neu, compound
#!pip install nltk
import nltk

In [2]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
sent_analyzer = SentimentIntensityAnalyzer()
sent_analyzer

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x7f72c04180d0>

In [5]:
#polarity_score() 
sentence = "VADER is pretty good at identifying the underlying sentiment of a text"
sent_analyzer.polarity_scores(sentence)


{'compound': 0.7269, 'neg': 0.0, 'neu': 0.596, 'pos': 0.404}

In [6]:
sentence1='I do HATE those fake news on internet!!😡'
sent_analyzer.polarity_scores(sentence1)

{'compound': -0.8449, 'neg': 0.619, 'neu': 0.381, 'pos': 0.0}

In [7]:
sentence2='I do HATE those fake news on internet😡'
sent_analyzer.polarity_scores(sentence2)

{'compound': -0.8192, 'neg': 0.601, 'neu': 0.399, 'pos': 0.0}

In [8]:
#Dataset - license-free tweets dataset from Sentiment140 website
import pandas as pd

In [9]:
data = 'https://raw.githubusercontent.com/keitazoumana/VADER_sentiment-Analysis/main/data/testdata.manual.2009.06.14.csv'
senti_data = pd.read_csv(data)
senti_data.head()

Unnamed: 0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,4,8,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...


In [10]:
def format_data(ip_data):
  last_col = str(ip_data.columns[-1])
  first_col = str(ip_data.columns[0])

  ip_data.rename(columns ={last_col:'tweet_text',
                           first_col: 'polarity'}, inplace = True)
  labels = {0: 'negative',
            2: 'neutral',
            4: 'positive'}
  ip_data['polarity'] = ip_data['polarity'].map(labels)
  return ip_data[['tweet_text', 'polarity']]

In [11]:
tweet_data = format_data(senti_data)
tweet_data.head()

Unnamed: 0,tweet_text,polarity
0,Reading my kindle2... Love it... Lee childs i...,positive
1,"Ok, first assesment of the #kindle2 ...it fuck...",positive
2,@kenburbary You'll love your Kindle2. I've had...,positive
3,@mikefish Fair enough. But i have the Kindle2...,positive
4,@richardebaker no. it is too big. I'm quite ha...,positive


In [12]:
def format_output(output_dict):
  polarity = 'neutral'
  if(output_dict['compound'] >=0.05):
    polarity='positive'
  elif(output_dict['compound'] <= -0.05):
    polarity='negative'
  return polarity

In [13]:
def predict_sentiment(text):
  output_dict = sent_analyzer.polarity_scores(text)
  return format_output(output_dict)

In [14]:
tweet_data['VADERprediction'] = tweet_data['tweet_text'].apply(predict_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
tweet_data.sample(5)

Unnamed: 0,tweet_text,polarity,VADERprediction
492,Ask Programming: LaTeX or InDesign?: submitted...,neutral,neutral
377,Having the old Coca-Cola guy on the GM board i...,negative,negative
295,i hate comcast right now. everything is down c...,negative,negative
481,"7 hours. 7 hours of inkscape crashing, normall...",negative,negative
152,Time Warner Cable Pulls the Plug on 'The Girlf...,neutral,neutral


In [16]:
from sklearn.metrics import accuracy_score, classification_report

In [18]:
accuracy = accuracy_score(tweet_data['polarity'], tweet_data['VADERprediction'])
print(accuracy)

0.716297786720322


In [19]:
print(classification_report(tweet_data['polarity'], tweet_data['VADERprediction']))

              precision    recall  f1-score   support

    negative       0.84      0.64      0.72       177
     neutral       0.67      0.70      0.68       139
    positive       0.67      0.81      0.73       181

    accuracy                           0.72       497
   macro avg       0.73      0.71      0.71       497
weighted avg       0.73      0.72      0.72       497

