In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from stop_words import get_stop_words
from tqdm import tqdm_notebook as tqdm

In [None]:
# Hook tqdm into pandas so that `progress_apply` (used by the scoring cells) is available.
tqdm().pandas()

### Config

In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer ship the old names, so pick whichever one exists.
_WHITEGRID_STYLES = ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
plt.style.use(
    next((name for name in _WHITEGRID_STYLES if name in plt.style.available), "default")
)

# fonttype 42 embeds TrueType fonts in PS/PDF output.
plt.rc('ps', fonttype=42)
plt.rc('pdf', fonttype=42)
plt.rcParams.update({
    'font.size': 14,
    'ps.useafm': True,
    'pdf.use14corefonts': True,
    'axes.unicode_minus': False,
})

### Load data

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
texts_df = pd.read_pickle("data/processed/texts.p")

### Prepare data

#### Fix authors

In [None]:
# Whole-value replacements applied to the "author" column (old spelling -> new spelling).
author_fix = dict([
    ("Bachiller D. P. Gatell", "Bachiller D. P. Gatell."),
    ("Eliza Haywood", "Eliza Fowler Haywood"),
])
texts_df["author"] = texts_df["author"].replace(to_replace=author_fix)

#### Fix language

In [None]:
# Collapse the compound label "Spanish; Castilian" into plain "Spanish".
texts_df["language"] = texts_df["language"].replace({"Spanish; Castilian": "Spanish"})

#### Load dictionaries

From: Chen, Y., & Skiena, S. (2014). Building Sentiment Lexicons for All Major Languages. In ACL (2) (pp. 383-389)

In [None]:
# Languages that are scored; a positive and a negative word list is loaded for each of them.
languages = ["German", "French", "Italian", "Spanish"]

In [None]:
# Build {language: {"neg": [...], "pos": [...]}} from the per-language word lists.
sentiment_lexica = {}
for lang in languages:
    sentiment_lexica[lang] = {}
    for key, polarity in (("neg", "negative"), ("pos", "positive")):
        lexicon_path = "data/sentiment/{}_words_{}.txt".format(polarity, lang.lower())
        # Read as UTF-8 explicitly: the word lists contain non-ASCII characters
        # and the platform default encoding is not reliable across machines.
        with open(lexicon_path, "r", encoding="utf-8") as fr:
            sentiment_lexica[lang][key] = fr.read().splitlines()

In [None]:
# Sanity check: number of negative and positive entries per language.
for lang_name in sentiment_lexica:
    lexicon = sentiment_lexica[lang_name]
    neg_size, pos_size = len(lexicon["neg"]), len(lexicon["pos"])
    print(lang_name, neg_size, pos_size)

#### Analyze sentiment

In [None]:
def analyze_sentiment(text, nl, pl):
    """Compute a lexicon-based polarity score for ``text``.

    The text is tokenized with ``nltk.word_tokenize``; every token is compared
    case-insensitively against the negative and positive word lists.

    Parameters
    ----------
    text : str
        The text to score.
    nl : iterable of str
        Negative lexicon entries.
    pl : iterable of str
        Positive lexicon entries.

    Returns
    -------
    float or int
        ``(positive - negative) / (positive + negative)``, i.e. a value in
        ``[-1, 1]``; ``0`` when no lexicon word occurs in the text.
    """
    # Lower-case the tokens as well: lexicon entries are lower-cased before the
    # lookup, so a token that keeps its capital letters could never match.
    # Counting once up front also avoids rescanning the token list for every
    # single lexicon entry.
    token_counts = Counter(token.lower() for token in nltk.word_tokenize(text))

    # Counter returns 0 for unseen words, so absent entries contribute nothing.
    num_negative = sum(token_counts[nw.lower()] for nw in nl)
    num_positive = sum(token_counts[pw.lower()] for pw in pl)

    total = num_positive + num_negative
    if total == 0:
        return 0
    return (num_positive - num_negative) / total

#### Per xml file

In [None]:
# One row per (filename, author, language): all text fragments joined with single spaces.
text_by_file_df = (
    texts_df
    .groupby(["filename", "author", "language"])["text"]
    .apply(" ".join)
    .to_frame()
)

In [None]:
# Score every file with the lexicon of its own language. Files in a language
# without a lexicon keep the default score of 0.
# The column starts out as float so the scores are stored without a dtype change.
text_by_file_df["sentiment"] = 0.0
file_languages = text_by_file_df.index.get_level_values("language")
for lang in languages:
    lang_mask = file_languages == lang
    neg_lexicon = sentiment_lexica[lang]["neg"]
    pos_lexicon = sentiment_lexica[lang]["pos"]
    scores = text_by_file_df.loc[lang_mask, "text"].progress_apply(
        analyze_sentiment, args=(neg_lexicon, pos_lexicon)
    )

    # Write through .loc on the frame itself: `df["sentiment"].update(...)`
    # updates a temporary Series and is a no-op under pandas Copy-on-Write.
    # The mask keeps row order, so the scores can be assigned positionally.
    text_by_file_df.loc[lang_mask, "sentiment"] = scores.values


In [None]:
# Peek at the first German files (the language is the third index level).
is_german = text_by_file_df.index.get_level_values(2) == "German"
text_by_file_df[is_german].head()

In [None]:
# Mean per-file sentiment for each of the four scored languages.
in_scored_language = text_by_file_df.index.get_level_values(2).isin(
    ["German", "Italian", "French", "Spanish"]
)
mean_by_language = (
    text_by_file_df[in_scored_language]
    .dropna()
    .groupby("language")["sentiment"]
    .mean()
)
ax = mean_by_language.plot(kind="bar", title="Sentiment Analysis")
ax.set_ylabel("Mean Sentiment Score")
ax.set_xlabel("Language")
plt.tight_layout()
plt.show()
#plt.savefig("sentiment.pdf")

In [None]:
# Mean per-file sentiment for each author, restricted to the four scored languages.
in_scored_language = text_by_file_df.index.get_level_values(2).isin(
    ["German", "Italian", "French", "Spanish"]
)
mean_by_author = (
    text_by_file_df[in_scored_language]
    .dropna()
    .groupby("author")["sentiment"]
    .mean()
)
ax = mean_by_author.plot(kind="bar", title="Sentiment per Author", figsize=(20, 10))
ax.set_ylabel("Mean Sentiment Score")
ax.set_xlabel("Author")
plt.tight_layout()
plt.show()
#plt.savefig("sentiment.pdf")

In [None]:
# All files attributed to this author (the author is the second index level).
by_this_author = text_by_file_df.index.get_level_values(1) == "Anonym {Eliza Fowler Haywood}"
text_by_file_df[by_this_author]

In [None]:
# Two groups of XML file names that the following cells compare against the
# Italian mean sentiment.
# NOTE(review): the criterion by which files were put into either list is not
# visible in this notebook — confirm with the author.
positive_list = [
    "mws-096-297.xml", "mws-119-1239.xml", "mws-117-1024.xml", "mws-099-374.xml",
    "mws-099-375.xml", "mws-099-378.xml", "mws-099-396.xml", "mws-117-1170.xml",
    "mws.3474.xml", "mws.3480.xml", "mws.5513.xml", "mws.5519.xml",
    "mws.5520.xml", "mws.5528.xml", "mws.5533.xml", "mws.5542.xml",
    "mws.5549.xml", "mws.5553.xml", "mws.5554.xml", "mws.5556.xml",
    "mws.5571.xml", "mws-099-369.xml", "mws-103-473.xml", "mws-103-474.xml",
    "mws-103-481.xml", "mws-103-487.xml", "mws-103-489.xml", "mws-103-491.xml",
]

negative_list = [
    "mws.3464.xml", "mws.3466.xml", "mws.3468.xml", "mws.3470.xml",
    "mws.3478.xml", "mws.3482.xml", "mws.5512.xml", "mws.5514.xml",
    "mws.5517.xml", "mws.5518.xml", "mws.5521.xml", "mws.5526.xml",
    "mws.5530.xml", "mws.5534.xml", "mws.5536.xml", "mws.5537.xml",
    "mws.5540.xml", "mws.5544.xml", "mws.5545.xml", "mws.5555.xml",
    "mws.5587.xml", "mws.5590.xml", "mws.6347.xml", "mws.6349.xml",
    "mws.6351.xml", "mws-111-817.xml", "mws.2304.xml", "mws.7058.xml",
    "mws-099-357.xml", "mws-099-358.xml", "mws-099-363.xml", "mws-099-365.xml",
    "mws-103-463.xml", "mws-103-465.xml", "mws-103-467.xml", "mws-103-488.xml",
    "mws-103-492.xml",
]

In [None]:
# Offset of each file in negative_list from the mean sentiment of all Italian files.
file_names = text_by_file_df.index.get_level_values(0)
file_languages = text_by_file_df.index.get_level_values(2)
italian_mean = text_by_file_df[file_languages == "Italian"]["sentiment"].mean()
text_by_file_df[file_names.isin(negative_list)]["sentiment"] - italian_mean

In [None]:
# Offset of each file in positive_list from the mean sentiment of all Italian files.
file_names = text_by_file_df.index.get_level_values(0)
file_languages = text_by_file_df.index.get_level_values(2)
italian_mean = text_by_file_df[file_languages == "Italian"]["sentiment"].mean()
text_by_file_df[file_names.isin(positive_list)]["sentiment"] - italian_mean

In [None]:
# Mean sentiment over all Italian files (the baseline used in the two cells above).
is_italian = text_by_file_df.index.get_level_values(2) == "Italian"
text_by_file_df[is_italian]["sentiment"].mean()

#### Per row

In [None]:
# Independent copy of the rows written in one of the scored languages.
in_scored_language = texts_df["language"].isin(languages)
per_row_df = texts_df.loc[in_scored_language].copy()

In [None]:
# Score every row with the lexicon of its own language.
# The column starts out as float so the scores are stored without a dtype change.
per_row_df["sentiment"] = 0.0
for lang in languages:
    lang_mask = per_row_df["language"] == lang
    neg_lexicon = sentiment_lexica[lang]["neg"]
    pos_lexicon = sentiment_lexica[lang]["pos"]
    scores = per_row_df.loc[lang_mask, "text"].progress_apply(
        analyze_sentiment, args=(neg_lexicon, pos_lexicon)
    )

    # Write through .loc on the frame itself: `df["sentiment"].update(...)`
    # updates a temporary Series and is a no-op under pandas Copy-on-Write.
    # The mask keeps row order, so the scores can be assigned positionally.
    per_row_df.loc[lang_mask, "sentiment"] = scores.values

In [None]:
# Preview the first rows, including the new "sentiment" column.
per_row_df.head()

In [None]:
# Mean per-row sentiment for each language.
mean_by_language = per_row_df.groupby("language")["sentiment"].mean()
mean_by_language.plot.bar()

#### NDE

In [None]:
# Mean per-row sentiment for each value of the "nde" column.
mean_by_nde = per_row_df.groupby("nde")["sentiment"].mean()
mean_by_nde.plot.bar()

#### NDF

In [None]:
# Mean per-row sentiment for each value of the "ndf" column.
mean_by_ndf = per_row_df.groupby("ndf")["sentiment"].mean()
mean_by_ndf.plot.bar()

#### Sentiment over time

In [None]:
def _strip_date_suffixes(raw_date):
    """Keep only the part of a date string before '-', then ' [', then ' bzw.'."""
    for separator in ("-", " [", " bzw."):
        raw_date = raw_date.split(separator)[0]
    return raw_date

per_row_df["date"] = per_row_df["date"].apply(_strip_date_suffixes)

In [None]:
# Mean sentiment per cleaned date value, drawn as a line.
sent_per_year = per_row_df.groupby("date")["sentiment"].mean()
date_labels = sent_per_year.index
plt.figure(figsize=(15, 5))
ax = sns.lineplot(x=date_labels, y=sent_per_year.values)
ax.set_xticklabels(date_labels, rotation=90)
plt.tight_layout()
plt.show()

#### Topics

In [None]:
# Spread each row's topics over columns, then melt to one (row index, topic) pair per line.
topics_wide = per_row_df["topics"].apply(lambda row_topics: pd.Series(list(row_topics)))
topics_long = topics_wide.reset_index().melt(id_vars="index").dropna()
topics = topics_long[["index", "value"]].set_index("index")

# Attach the sentiment of the originating row to every topic entry.
t_s_df = topics.merge(per_row_df[["sentiment"]], left_index=True, right_index=True)
#per_row_df.groupby("date")["topics"].unique().plot(kind="bar", figsize=(15, 5))

In [None]:
# Mean sentiment for each topic.
mean_by_topic = t_s_df.groupby("value")["sentiment"].mean()
ax = mean_by_topic.plot.bar(figsize=(15, 5))
ax.set_xlabel("Topics")

#### Sentiment per topic over time

In [None]:
# Topic entries together with the date and sentiment of their originating row.
date_and_sentiment = per_row_df[["date", "sentiment"]]
t_y_df = topics.merge(date_and_sentiment, left_index=True, right_index=True)

In [None]:
# Dates as rows, topics as columns, mean sentiment in the cells.
table = t_y_df.pivot_table(values="sentiment", index="date", columns="value")

In [None]:
# Sentiment over time for a hand-picked subset of topics, one line per topic.
selected_topics = [
    "Idea of Man",
    "Image of Women",
    "Image of Men",
    "Theatre Literature Arts",
    "Manners and Customs",
]
tick_positions = range(0, len(table.index))
ax = table[selected_topics].plot.line(legend=True, figsize=(15, 5), xticks=tick_positions)
ax.set_xticklabels(table.index, rotation=90)
plt.tight_layout()
plt.show()

#### Places

In [None]:
# Spread each row's places over columns, then melt to one (row index, place) pair per line.
places_wide = per_row_df["places"].apply(lambda row_places: pd.Series(list(row_places)))
places_long = places_wide.reset_index().melt(id_vars="index").dropna()
places = places_long[["index", "value"]].set_index("index")

# Attach sentiment and language of the originating row to every place entry.
p_s_df = places.merge(per_row_df[["sentiment", "language"]], left_index=True, right_index=True)

In [None]:
# Mean sentiment per language for rows that mention the place "Bristol".
bristol_rows = p_s_df.loc[p_s_df["value"].eq("Bristol")]
bristol_rows.groupby("language")["sentiment"].mean().plot.bar()

#### Persons

In [None]:
# Spread each row's persons over columns, then melt to one (row index, person) pair per line.
persons_wide = per_row_df["persons"].apply(lambda row_persons: pd.Series(list(row_persons)))
persons_long = persons_wide.reset_index().melt(id_vars="index").dropna()
persons = persons_long[["index", "value"]].set_index("index")

# Attach sentiment and language of the originating row to every person entry.
pe_s_df = persons.merge(per_row_df[["sentiment", "language"]], left_index=True, right_index=True)

In [None]:
# Mean sentiment per language for rows that mention the person "Adonis".
adonis_rows = pe_s_df.loc[pe_s_df["value"].eq("Adonis")]
adonis_rows.groupby("language")["sentiment"].mean().plot.bar()

#### Works

In [None]:
# Spread each row's works over columns, then melt to one (row index, work) pair per line.
works_wide = per_row_df["works"].apply(lambda row_works: pd.Series(list(row_works)))
works_long = works_wide.reset_index().melt(id_vars="index").dropna()
works = works_long[["index", "value"]].set_index("index")

# Attach sentiment and language of the originating row to every work entry.
w_s_df = works.merge(per_row_df[["sentiment", "language"]], left_index=True, right_index=True)

In [None]:
# Mean sentiment per language for rows that mention the work "Macbeth".
macbeth_rows = w_s_df.loc[w_s_df["value"].eq("Macbeth")]
macbeth_rows.groupby("language")["sentiment"].mean().plot.bar()