In [9]:
from sklearn import datasets
from sklearn.metrics import confusion_matrix, classification_report
from pattern.en import sentiment
from nltk.sentiment import vader
from tqdm import tqdm
import numpy as np
import nltk
import csv

*You might need to install some dependencies first, e.g.:*

```bash
sudo apt install libmysqlclient-dev
sudo pip3 install pattern
```

## Let's get some data

In [None]:
# Download the 'vader_lexicon' resource through NLTK's downloader
# (presumably required by the VADER analyzer used further down - confirm)
nltk.download('vader_lexicon')

In [2]:
# Load the training split of a single newsgroup; headers, footers and quoted
# replies are stripped so that only the body of each post is kept.
data = datasets.fetch_20newsgroups(
    subset='train',
    categories=['soc.religion.christian'],
    remove=('headers', 'footers', 'quotes'),
)
# The returned Bunch exposes its entries as attributes as well as keys
texts = data.data

In [3]:
# Number of documents loaded into `texts`
len(texts)

599

In [4]:
# Show the first document in full to get a feel for the raw text
print(texts[0])

I wrote in response to dlecoint@garnet.acns.fsu.edu (Darius_Lecointe):


Was Paul a God too? Is an interpretation of the words of Paul of higher
priority than the direct word of Jesus in Matt5:14-19? Paul begins
Romans 14 with "If someone is weak in the faith ..." Do you count
yourself as one who is weak in the faith?


Yes, but what does the Bible have to say? What did Jesus say? Paul
closes Romans 14 with, "On the other hand, the person with doubts about
something who eats it anyway is guilty, because he isn't acting on his
faith, and any failure to act on faith is a sin." Gaus, ISBN:0-933999-99-2
Have you read the Ten Commandments which are a portion of the Law? Have
you read Jesus' word in Matt5:14-19? Is there any doubt in your mind
about what is right and what is sin (Greek hamartia = missing the mark)?


Whereas, the Ten Commandments and Jesus' words in Matt5:14-19 are fairly
clear, are they not?


Matt5:14-19 doesn't answer your question?


Breaking bread - roughly synonymous w

In [6]:
# Score the first document with pattern; the result displays as a pair
# (polarity, subjectivity)
sentiment(texts[0])

(0.03465061874152785, 0.34897186147186154)

In [7]:
# Keep the score pair in `s` and report both components in one sentence
s = sentiment(texts[0])
summary = f"The polarity is {s[0]} and the subjectivity is {s[1]}"
print(summary)

The polarity is 0.03465061874152785 and the subjectivity is 0.34897186147186154


## Let's try out pattern

In [10]:
# Score every text once with pattern and collect the two components
# in parallel lists (same order as `texts`).
polarity, subjectivity = [], []
for text in tqdm(texts):
    pol, subj = sentiment(text)  # pattern yields a (polarity, subjectivity) pair
    polarity.append(pol)
    subjectivity.append(subj)

100%|██████████| 599/599 [00:18<00:00, 31.83it/s]


In [11]:
# Compact variant: score each text exactly ONCE, then split the pairs.
# (Calling sentiment(t) separately for the polarity and for the subjectivity
# would run the expensive scoring twice per text.)
scores = [sentiment(t) for t in tqdm(texts)]
polarity = [score[0] for score in scores]
subjectivity = [score[1] for score in scores]

100%|██████████| 599/599 [00:21<00:00, 27.25it/s]


In [None]:
# DON'T DO THIS!! (anti-pattern, shown for illustration only)
# - don't loop twice over the same list
# - don't calculate the sentiment of each text twice
polarity = []
for t in texts:
    polarity.append(sentiment(t)[0])
    
subjectivity = []
for t in texts:
    subjectivity.append(sentiment(t)[1])    

In [12]:
# Polarity score of the first text
polarity[0]

0.03465061874152785

In [17]:
# One row per text: the text (line breaks flattened to spaces so that each
# record stays on a single line) plus its pattern polarity and subjectivity.
output = zip([t.replace('\n', ' ') for t in texts], polarity, subjectivity)
# newline='' lets the csv module control line endings itself (avoids blank
# rows on Windows); an explicit encoding keeps non-ASCII characters writable
# regardless of the platform default.
with open('test.csv', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['newsgrouptext', 'polarity', 'subjectivity'])
    writer.writerows(output)

In [15]:
# Look up the documentation of csv.writer (plain-Python equivalent of the
# IPython-only `csv.writer?` shortcut)
help(csv.writer)

## Do pattern and vader disagree?

In [18]:
# Build a VADER analyzer and score one sentence; the result is a dict with
# 'neg', 'neu', 'pos' and 'compound' entries
senti=vader.SentimentIntensityAnalyzer()
senti.polarity_scores('This is a great day!')

{'neg': 0.0, 'neu': 0.406, 'pos': 0.594, 'compound': 0.6588}

In [19]:
senti = vader.SentimentIntensityAnalyzer()

# Keep only VADER's 'compound' score for each text, in the same order as `texts`.
# Of course you could build more lists to store the other values as well.
vadercomp = [senti.polarity_scores(doc)['compound'] for doc in texts]
    # of course you could make more lists to store the other values as well

In [20]:
# One row per text with the pattern and VADER scores side by side.
# Line breaks are replaced by a SPACE (not removed): deleting them would glue
# together words that were only separated by a line break.
output = zip([t.replace('\n', ' ') for t in texts], polarity, subjectivity, vadercomp)
# newline='' lets the csv module control line endings itself (avoids blank
# rows on Windows); an explicit encoding keeps non-ASCII characters writable
# regardless of the platform default.
with open('test.csv', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Text', 'pattern_polarity', 'pattern_subjectivity', 'vader_composite'])
    writer.writerows(output)

In [23]:
# Correlation matrix of the two score series; the off-diagonal entry is the
# correlation between pattern's polarity and VADER's compound score
np.corrcoef(polarity, vadercomp)

array([[1.       , 0.2352023],
       [0.2352023, 1.       ]])

**It is REALLY important to validate - NEVER just blindly trust a sentiment score**

# Self-test exercise to do at home

**ON A DATASET OF YOUR OWN THAT YOU ARE INTERESTED IN**, how do the following approaches compare:

- pattern
- vader
- your own or an existing word list
- hand-coding (i.e., manually coding some articles as positive or negative)

In [24]:
# assuming you made a csv file with two columns (the text and your coding ("annotation"))
mytexts = []
y_manual = []
# newline='' lets the csv module handle line endings (and line breaks inside
# quoted fields) itself
with open('test_annotated.csv', mode='r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; the default avoids StopIteration on an empty file
    for row in reader:
        if len(row) < 2:
            # blank or incomplete line: there is no (text, annotation) pair to read
            continue
        mytexts.append(row[0])
        y_manual.append(row[1])

In [25]:
analyzer = vader.SentimentIntensityAnalyzer()
y_vader = []

# Turn VADER's compound score into a label: above zero -> 'pos',
# below zero -> 'neg', anything else -> 'dont know'.
for text in mytexts:
    compound = analyzer.polarity_scores(text)['compound']
    label = 'pos' if compound > 0 else 'neg' if compound < 0 else 'dont know'
    y_vader.append(label)

# Compare the automatic labels against the manual annotation
print(confusion_matrix(y_manual, y_vader))
print(classification_report(y_manual, y_vader))

[[1 1]
 [1 3]]
              precision    recall  f1-score   support

         neg       0.50      0.50      0.50         2
         pos       0.75      0.75      0.75         4

    accuracy                           0.67         6
   macro avg       0.62      0.62      0.62         6
weighted avg       0.67      0.67      0.67         6



### Interpretation

(example of the 'neg' row)

- precision: what percentage of the texts that vader coded as negative were really negative (according to the manual coding)?
- recall: what percentage of the really negative texts (according to the manual coding) did vader find?