## Vader vs Naïve Bayes Classifier 

Reference: https://opensourceforu.com/2016/12/analysing-sentiments-nltk/

## Testing Vader on the Inferno

In [1]:
import json, nltk, os
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

lang = "en"
cantica = "Inferno"
pathToFile = "texts/" + lang + "/" + cantica + "/"

negatives = []
neutrals = []
positives = []
compounds = []
total_lines = 0
lines = []

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                line_number = 0
                sid = SentimentIntensityAnalyzer()
                for line in canto:
                    if line.strip():
                        line_number += 1
                        total_lines += 1
                        ss = sid.polarity_scores(line)
                        for k in ss:
                            if (k == "neg"):
                                negatives.append(ss[k])
                            elif (k == "neu"):
                                neutrals.append(ss[k])
                            elif (k == "pos"):
                                positives.append(ss[k])
                            elif (k == "compound"):
                                compounds.append(ss[k])
                                compound = ss[k]
                                if (compound < 0):
                                    color = "red"
                                elif (compound > 0):
                                    color = "#2d8bcF"
                                else:
                                    color = "#ffbb00"

                        # Prepare the JSON file for viz
                        json_line_obj = {
                            "line_number": line_number,
                            "text": line,
                            "compound": compound,
                            "color": color
                        }

                    if json_line_obj not in lines:
                        lines.append(json_line_obj)

# Build the full thing
json_obj = {
    "cantica": cantica,
    "lang": lang,
    "lines": lines
}

print(total_lines)
mean_neg = sum(negatives) / total_lines
mean_neu = sum(neutrals) / total_lines
mean_pos = sum(positives) / total_lines
mean_comp = sum(compounds) / total_lines
print(cantica)
print("Compound:", mean_comp)
print("Mean negative:", mean_neg, "\nMean neutral:", 
      mean_neu, "\nMean positive:", mean_pos)

4720
Inferno
Compound: -0.02548993644067798
Mean negative: 0.07977690677966097 
Mean neutral: 0.8590080508474581 
Mean positive: 0.06121419491525423


In [2]:
import nltk, os
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

lang = "en"
cantica = "Purgatorio"
pathToFile = "texts/" + lang + "/" + cantica + "/"

negatives = []
neutrals = []
positives = []
compounds = []
total_lines = 0

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                sid = SentimentIntensityAnalyzer()
                for line in canto:
                    if line.strip():
                        total_lines += 1
                        ss = sid.polarity_scores(line)
                        for k in ss:
                            if (k == "neg"):
                                negatives.append(ss[k])
                            elif (k == "neu"):
                                neutrals.append(ss[k])
                            elif (k == "pos"):
                                positives.append(ss[k])
                            elif (k == "compound"):
                                compounds.append(ss[k])
                                compound = ss[k]
                                if (compound < 0):
                                    color = "#d65108"
                                elif (compound > 0):
                                    color = "#2d8bcF"
                                else:
                                    color = "#ffbb00"

print(total_lines)
mean_neg = sum(negatives) / total_lines
mean_neu = sum(neutrals) / total_lines
mean_pos = sum(positives) / total_lines
mean_comp = sum(compounds) / total_lines
print(cantica)
print("Compound:", mean_comp)
print("Mean negative:", mean_neg, "\nMean neutral:", 
      mean_neu, "\nMean positive:", mean_pos)


4755
Purgatorio
Compound: 0.04260921135646684
Mean negative: 0.06073522607781285 
Mean neutral: 0.8496187171398526 
Mean positive: 0.08964521556256577


In [3]:
import nltk, os
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

lang = "en"
cantica = "Paradiso"
pathToFile = "texts/" + lang + "/" + cantica + "/"

negatives = []
neutrals = []
positives = []
compounds = []
total_lines = 0

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                sid = SentimentIntensityAnalyzer()
                for line in canto:
                    if line.strip():
                        total_lines += 1
                        ss = sid.polarity_scores(line)
                        for k in ss:
                            if (k == "neg"):
                                negatives.append(ss[k])
                            elif (k == "neu"):
                                neutrals.append(ss[k])
                            elif (k == "pos"):
                                positives.append(ss[k])
                            elif (k == "compound"):
                                compounds.append(ss[k])
                                compound = ss[k]
                                if (compound < 0):
                                    color = "#d65108"
                                elif (compound > 0):
                                    color = "#2d8bcF"
                                else:
                                    color = "#ffbb00"

print(total_lines)
mean_neg = sum(negatives) / total_lines
mean_neu = sum(neutrals) / total_lines
mean_pos = sum(positives) / total_lines
mean_comp = sum(compounds) / total_lines
print(cantica)
print("Compound:", mean_comp)
print("Mean negative:", mean_neg, "\nMean neutral:", 
      mean_neu, "\nMean positive:", mean_pos)

4758
Paradiso
Compound: 0.1118575241698195
Mean negative: 0.0434201345102985 
Mean neutral: 0.8333379571248382 
Mean positive: 0.12324253888188315


## Testing Naïve Bayes Classifier

Training the data using the manually collated file training_dataset_en.csv

In [4]:
import csv, nltk, os
import pandas as pd
from nltk.tokenize import word_tokenize

# Step 1 – Training the data
training_data = []

with open('training_dataset_en.csv') as file:
    for row in csv.reader(file):
        # Ignore empty text cells
        if (row[0] != ""):
            training_data.append((row[0], row[1]))

In [5]:
# Step 2 
dictionary = set(word.lower() for passage in training_data for word in word_tokenize(passage[0]))

In [6]:
# Step 3
training = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in training_data]

In [None]:
# Step 4 – the classifier is trained with sample data
classifier = nltk.NaiveBayesClassifier.train(training)

Testing the text against the trained data

In [None]:
lang = "en"
cantica = "Inferno"
pathToFile = "texts/" + lang + "/" + cantica + "/"
total_lines = 0
positive = 0
negative = 0
neutral = 0

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                for line in canto:
                    if line.strip():
                        total_lines += 1
                        test_data_features = { word.lower(): (word in word_tokenize(line.lower())) for word in dictionary }
                        if (classifier.classify(test_data_features) == "negative"):
                            negative += 1
                            print("neg:", line)
                        elif (classifier.classify(test_data_features) == "positive"):
                            positive += 1
                            print("pos:", line)
                        else:
                            neutral += 1

print(cantica)
print("Negative:", negative)
print("Positive:", positive)
print("Neutral:", neutral)

print("Negative pol:", negative/total_lines)
print("Positive pol:", positive/total_lines)
print("Neutral pol:", neutral/total_lines)

pos: when Divine Love first moved those things of beauty;

pos: that she can never sate her greedy will;

pos: O Muses, o high genius, help me now;

neg: see if the force in me is strong enough

neg: For, if the Enemy of every evil

neg: When she had finished with her words to me,

neg: “Here one must leave behind all hesitation;

neg: here every cowardice must meet its death.

pos: “Master, what is it that I hear? Who are

neg: which waits for all who have no fear of God.

pos: similarly, the evil seed of Adam

neg: “My son,” the gracious master said to me,

neg: and like a man whom sleep has seized, I fell.
neg: And he to me: “The anguish of the people

pos: of that belief which vanquishes all errors,

neg: the crown he wore, a sign of victory.

neg: He carried off the shade of our first father,

neg: of his son Abel, and the shade of Noah,

pos: “O you who honor art and science both,

pos: who are these souls whose dignity has kept

neg: Soon after they had talked a while together,


In [None]:
lang = "en"
cantica = "Paradiso"
pathToFile = "texts/" + lang + "/" + cantica + "/"
total_lines = 0
positive = 0
negative = 0
neutral = 0

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                for line in canto:
                    if line.strip():
                        total_lines += 1
                        test_data_features = { word.lower(): (word in word_tokenize(line.lower())) for word in dictionary }
                        if (classifier.classify(test_data_features) == "negative"):
                            negative += 1
                        elif (classifier.classify(test_data_features) == "positive"):
                            positive += 1
                        else:
                            neutral += 1
print(cantica)
print("Negative:", negative)
print("Positive:", positive)
print("Neutral:", neutral)

print("Negative pol:", negative/total_lines)
print("Positive pol:", positive/total_lines)
print("Neutral pol:", neutral/total_lines)

In [None]:
lang = "en"
cantica = "Purgatorio"
pathToFile = "texts/" + lang + "/" + cantica + "/"
total_lines = 0
positive = 0
negative = 0
neutral = 0

for root, dirs, files in os.walk(pathToFile):
    for file in sorted(files):
        if file.endswith(".txt"):
            with open(pathToFile + file) as canto:
                for line in canto:
                    if line.strip():
                        total_lines += 1
                        test_data_features = { word.lower(): (word in word_tokenize(line.lower())) for word in dictionary }
                        if (classifier.classify(test_data_features) == "negative"):
                            negative += 1
                        elif (classifier.classify(test_data_features) == "positive"):
                            positive += 1
                        else:
                            neutral += 1
print(cantica)
print("Negative:", negative)
print("Positive:", positive)
print("Neutral:", neutral)

print("Negative pol:", negative/total_lines)
print("Positive pol:", positive/total_lines)
print("Neutral pol:", neutral/total_lines)