# Recurrent Words Connotation Analysis

We want to establish which recurring words in the corpus drive the sentiment score over time. More specifically, we are interested in the words that contribute most to the positive and negative sentiment scores in each of the phases identified by the topic modeling analysis.

In [1]:
import warnings
from collections import Counter

import pandas as pd
import spacy

In [2]:
def extract_tokens(texts, pipeline=None):
    """Return the lemmas of every token in ``texts`` as one flat list.

    Args:
        texts: Iterable of strings to process.
        pipeline: Object exposing ``pipe(texts)`` that yields iterables of
            tokens carrying a ``lemma_`` attribute. When omitted, the
            notebook-level ``nlp`` pipeline is used.

    Returns:
        list: Lemmas in document order, then token order; duplicates are kept.
    """
    if pipeline is None:
        # Resolved at call time, so the cell that loads `nlp` only has to run
        # before the first call, not before this definition.
        pipeline = nlp
    # Consume the docs as a stream instead of materialising them all first:
    # only the lemma strings need to stay in memory.
    return [token.lemma_ for doc in pipeline.pipe(texts) for token in doc]

In [3]:
# Ignore warnings in libraries
warnings.filterwarnings("ignore")

# Import Italo Svevo corpus dataset
data = pd.read_csv('datasets/carteggio.svevo3.csv', sep=';', parse_dates=['date'])


def _read_lexicon(path):
    """Return the non-blank lines of a one-entry-per-line text file as a list."""
    # Plain line reading replaces pd.read_csv(..., sep='\n'): recent pandas
    # releases reject the line terminator as a separator with a ValueError.
    # 'utf-8-sig' also tolerates a leading byte-order mark.
    with open(path, encoding='utf-8-sig') as handle:
        return [line.strip() for line in handle if line.strip()]


# Import positive and negative italian lexicons
pos = _read_lexicon('datasets/nrc_positive_it.txt')
neg = _read_lexicon('datasets/nrc_negative_it.txt')

# Tokenization
# NOTE(review): the "it" shortcut name is resolvable on spaCy 2.x only; spaCy 3
# needs the full package name (e.g. "it_core_news_sm") - confirm target version.
nlp = spacy.load("it", disable=['parser', 'ner'])

In [4]:
# Filter italian language letters only
data_it = data[data['mainLanguage'] == 'ITA']

# Split in groups based on years and topics
d8501 = list(data_it.query('year > 1884 and year < 1902')['text'])
d0208 = list(data_it.query('year > 1901 and year < 1909')['text'])
d0922 = list(data_it.query('year > 1908 and year < 1923')['text'])
d2328 = list(data_it.query('year > 1922 and year < 1929')['text'])

In [5]:
# Lemmatise the letters of each phase, in chronological order
t8501, t0208, t0922, t2328 = [
    extract_tokens(phase_texts)
    for phase_texts in (d8501, d0208, d0922, d2328)
]

In [6]:
# Lemmas of each phase that appear in the positive lexicon. Order and
# duplicates are preserved so that occurrences can be counted afterwards.
# A set makes each membership test constant-time; scanning the `pos` list
# for every token costs (number of tokens) x (lexicon size).
pos_lookup = set(pos)

pos_8501 = [x for x in t8501 if x in pos_lookup]
pos_0208 = [x for x in t0208 if x in pos_lookup]
pos_0922 = [x for x in t0922 if x in pos_lookup]
pos_2328 = [x for x in t2328 if x in pos_lookup]

In [7]:
# Lemmas of each phase that appear in the negative lexicon. Order and
# duplicates are preserved so that occurrences can be counted afterwards.
# A set makes each membership test constant-time; scanning the `neg` list
# for every token costs (number of tokens) x (lexicon size).
neg_lookup = set(neg)

neg_8501 = [x for x in t8501 if x in neg_lookup]
neg_0208 = [x for x in t0208 if x in neg_lookup]
neg_0922 = [x for x in t0922 if x in neg_lookup]
neg_2328 = [x for x in t2328 if x in neg_lookup]

In [8]:
# Unfiltered top-10 lemma frequencies per phase. `Counter` comes from the
# import cell at the top of the notebook.
raw_positive_report = (
    ("10 most common positive words for 1885-1901:", pos_8501),
    ("10 most common positive words for 1902-1908:", pos_0208),
    ("10 most common positive words for 1909-1922:", pos_0922),
    ("10 most common positive words for 1923-1928:", pos_2328),
)
raw_negative_report = (
    ("10 most common negative words for 1885-1901:", neg_8501),
    ("10 most common negative words for 1902-1908:", neg_0208),
    ("10 most common negative words for 1909-1922:", neg_0922),
    ("10 most common negative words for 1923-1928:", neg_2328),
)

for heading, lemmas in raw_positive_report:
    print(heading)
    print(Counter(lemmas).most_common(10))
print()  # blank line between the positive and the negative section
for heading, lemmas in raw_negative_report:
    print(heading)
    print(Counter(lemmas).most_common(10))

10 most common positive words for 1885-1901:
[('primo', 227), ('caro', 208), ('parlare', 171), ('bene', 169), ('parola', 122), ('vero', 77), ('bello', 64), ('affare', 58), ('trattare', 53), ('solito', 50)]
10 most common positive words for 1902-1908:
[('caro', 194), ('bene', 155), ('primo', 124), ('parlare', 78), ('pregare', 52), ('vero', 51), ('solito', 46), ('affare', 46), ('bello', 42), ('parola', 41)]
10 most common positive words for 1909-1922:
[('bene', 143), ('caro', 139), ('primo', 105), ('parlare', 70), ('abbastanza', 53), ('solito', 46), ('parola', 37), ('bello', 36), ('pregare', 35), ('vero', 35)]
10 most common positive words for 1923-1928:
[('primo', 120), ('caro', 105), ('parlare', 85), ('signore', 78), ('bene', 64), ('articolare', 58), ('vero', 51), ('parola', 48), ('saluto', 37), ('amico', 35)]

10 most common negative words for 1885-1901:
[('caro', 208), ('partire', 159), ('piccolo', 87), ('dispiacere', 79), ('trattare', 53), ('fabbricare', 53), ('madre', 47), ('dolore

In [9]:
# Now filtering out too common/insignificant words
bad_pos = ['primo', 'caro', 'parlare', 'bene', 'parola', 'vero', 'bello', 'solito', 'saluto', 'signore', 
       'abbastanza', 'salutare', 'affare', 'trattare']

gpos_8501 = [x for x in t8501 if x in pos and x not in bad_pos]
gpos_0208 = [x for x in t0208 if x in pos and x not in bad_pos]
gpos_0922 = [x for x in t0922 if x in pos and x not in bad_pos]
gpos_2328 = [x for x in t2328 if x in pos and x not in bad_pos]

In [10]:
# Top-10 positive lemmas per phase after removing the manually excluded words
filtered_positive_report = (
    ("10 most common positive words for 1885-1901:", gpos_8501),
    ("10 most common positive words for 1902-1908:", gpos_0208),
    ("10 most common positive words for 1909-1922:", gpos_0922),
    ("10 most common positive words for 1923-1928:", gpos_2328),
)
for heading, lemmas in filtered_positive_report:
    print(heading)
    print(Counter(lemmas).most_common(10))

10 most common positive words for 1885-1901:
[('vicino', 49), ('madre', 47), ('pregare', 45), ('sicuro', 41), ('dolce', 37), ('lieto', 37), ('pieno', 33), ('evidente', 32), ('porto', 32), ('mangiare', 30)]
10 most common positive words for 1902-1908:
[('pregare', 52), ('mangiare', 39), ('insieme', 32), ('vicino', 22), ('dolce', 19), ('bambino', 17), ('pieno', 16), ('sterlina', 16), ('invitare', 15), ('sicuro', 15)]
10 most common positive words for 1909-1922:
[('pregare', 35), ('mangiare', 30), ('insieme', 23), ('stato', 21), ('sicuro', 19), ('lira', 19), ('pieno', 17), ('madre', 17), ('pagare', 17), ('contare', 15)]
10 most common positive words for 1923-1928:
[('articolare', 58), ('amico', 35), ('grazia', 33), ('ricordo', 28), ('pregare', 26), ('cordiale', 24), ('ringraziamento', 24), ('gentile', 22), ('sicuro', 22), ('grato', 19)]


In [11]:
# Negative-lexicon words judged too common/insignificant to be informative
bad_neg = ['caro', 'partire', 'signore', 'appena', 'trattare', 'piccolo', 'fabbricare', 'dispiacere', 'aspettare']

# Negative vocabulary minus the manual exclusions, built once as a set so each
# token needs a single constant-time lookup instead of two list scans.
good_neg = set(neg) - set(bad_neg)

gneg_8501 = [x for x in t8501 if x in good_neg]
gneg_0208 = [x for x in t0208 if x in good_neg]
gneg_0922 = [x for x in t0922 if x in good_neg]
gneg_2328 = [x for x in t2328 if x in good_neg]

In [12]:
# Top-10 negative lemmas per phase after removing the manually excluded words
filtered_negative_report = (
    ("10 most common negative words for 1885-1901:", gneg_8501),
    ("10 most common negative words for 1902-1908:", gneg_0208),
    ("10 most common negative words for 1909-1922:", gneg_0922),
    ("10 most common negative words for 1923-1928:", gneg_2328),
)
for heading, lemmas in filtered_negative_report:
    print(heading)
    print(Counter(lemmas).most_common(10))

10 most common negative words for 1885-1901:
[('madre', 47), ('dolore', 45), ('dubbio', 39), ('soffrire', 37), ('gelosia', 35), ('partenza', 31), ('paura', 30), ('finora', 30), ('dimenticare', 30), ('bagnare', 29)]
10 most common negative words for 1902-1908:
[('peggio', 26), ('dimenticare', 24), ('servire', 22), ('bagnare', 20), ('partenza', 20), ('paura', 19), ('soffrire', 19), ('perdere', 18), ('sterlina', 16), ('cuocere', 15)]
10 most common negative words for 1909-1922:
[('bagnare', 41), ('colera', 31), ('malattia', 23), ('paura', 22), ('madre', 17), ('perdere', 17), ('soffrire', 16), ('finora', 15), ('servire', 14), ('dimenticare', 14)]
10 most common negative words for 1923-1928:
[('copia', 32), ('critico', 30), ('criticare', 25), ('perdere', 24), ('difficile', 21), ('grato', 19), ('dimenticare', 18), ('spesa', 18), ('taglio', 14), ('ristampare', 13)]
