In [44]:
from sentiment_analysis_1 import *
import pandas as pd

In [47]:
articles = load_titles()

In [48]:
print(len(articles), "articles found")

4604 articles found


In [49]:
content = load_html(articles)

In [50]:
set(content[k][0] for k in content.keys())

{'\n'}

load sentiment data

In [51]:
sentiment_lexicon_path = "../sentiwordnet/SentiWordNet_3.0.0.txt"
df = pd.read_csv(sentiment_lexicon_path, sep="\t", skiprows=25, skipfooter=1, engine="python")

In [52]:
df

Unnamed: 0,# POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.000,able#1,(usually followed by `to') having the necessar...
1,a,2098,0.000,0.750,unable#1,(usually followed by `to') not having the nece...
2,a,2312,0.000,0.000,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...
3,a,2527,0.000,0.000,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...
4,a,2730,0.000,0.000,acroscopic#1,facing or on the side toward the apex
...,...,...,...,...,...,...
117654,v,2771756,0.000,0.000,run_dry#1 dry_out#2,"become empty of water; ""The river runs dry in ..."
117655,v,2771888,0.000,0.125,fog_up#1,"get foggy; ""The windshield fogged up"""
117656,v,2771997,0.000,0.000,coal#1 char#1,"burn to charcoal; ""Without a drenching rain, t..."
117657,v,2772202,0.125,0.250,haze#1,"become hazy, dull, or cloudy"


In [53]:
def parse_synset_terms(row):
    words = row.split()
    words = [word.split('#')[0] for word in words]
    return words

df['SynsetTerms'] = df['SynsetTerms'].apply(parse_synset_terms)
df = df.explode('SynsetTerms').reset_index(drop=True)
df

Unnamed: 0,# POS,ID,PosScore,NegScore,SynsetTerms,Gloss
0,a,1740,0.125,0.000,able,(usually followed by `to') having the necessar...
1,a,2098,0.000,0.750,unable,(usually followed by `to') not having the nece...
2,a,2312,0.000,0.000,dorsal,facing away from the axis of an organ or organ...
3,a,2312,0.000,0.000,abaxial,facing away from the axis of an organ or organ...
4,a,2527,0.000,0.000,ventral,nearest to or facing toward the axis of an org...
...,...,...,...,...,...,...
206936,v,2771888,0.000,0.125,fog_up,"get foggy; ""The windshield fogged up"""
206937,v,2771997,0.000,0.000,coal,"burn to charcoal; ""Without a drenching rain, t..."
206938,v,2771997,0.000,0.000,char,"burn to charcoal; ""Without a drenching rain, t..."
206939,v,2772202,0.125,0.250,haze,"become hazy, dull, or cloudy"


In [54]:
positiveness = dict(zip(df.SynsetTerms, round(df.PosScore - df.NegScore,5)))     # positiveness/negativeness score on [-1,1]
extremeness = dict(zip(df.SynsetTerms, (df.PosScore + df.NegScore)/2))  # extremeness score on [0,1]

In [32]:
set(positiveness.values()), set(extremeness.values())

({-1.0,
  -0.875,
  -0.75,
  -0.625,
  -0.556,
  -0.5,
  -0.4,
  -0.375,
  -0.334,
  -0.272,
  -0.25,
  -0.125,
  -0.112,
  0.0,
  0.112,
  0.125,
  0.25,
  0.334,
  0.375,
  0.4,
  0.5,
  0.556,
  0.625,
  0.75,
  0.875,
  1.0},
 {0.0, 0.0625, 0.125, 0.1875, 0.25, 0.3125, 0.375, 0.4375, 0.5})

load validation data

In [91]:
validation = pd.read_csv("../validation.txt", sep="\t", header=None, names=["article", "sentiment"])
validation = validation.set_index("article")
validation.head()

Unnamed: 0_level_0,sentiment
article,Unnamed: 1_level_1
1755_Lisbon_earthquake,-1
1896_Summer_Olympics,1
1997_Pacific_hurricane_season,-1
Actinium,0
Barracuda,-1


In [92]:
validation.describe()

Unnamed: 0,sentiment
count,42.0
mean,-0.02381
std,0.840676
min,-1.0
25%,-1.0
50%,0.0
75%,1.0
max,1.0


model - since there is no training we can go straight to validation data

In [93]:
def model(articles_content, lexicon):
    """
    Infer the sentiment prediction for articles on their content, according to lexicon of sentiments
    Args:
        articles_content: dict(), dictionary of (article-title: article-content) pairs
        lexicon: dict(), dictionary of (word: sentiment) pairs, with sentiment a sentiment score
    Returns:
        dict(), dictionary of (article-title: average-score) pairs
    """
    article_scores = {}

    for title, content in articles_content.items():
        words = content.split()
        word_scores = [lexicon[word] for word in words if word in lexicon]
        average_score = sum(word_scores) / len(word_scores) if len(word_scores) > 0 else None
        article_scores[title] = average_score

    return article_scores

In [95]:
validation_content = dict(zip(list(validation.index), [None]*len(validation)))
for k in validation_content.keys():
    validation_content[k] = content[k]

In [96]:
scores = model(validation_content, positiveness)

In [99]:
validation["prediction"] = scores

In [102]:
validation.head(), validation.describe()

(                               sentiment  prediction
 article                                             
 1755_Lisbon_earthquake                -1   -0.050156
 1896_Summer_Olympics                   1   -0.006281
 1997_Pacific_hurricane_season         -1   -0.037893
 Actinium                               0   -0.005826
 Barracuda                             -1   -0.002747,
        sentiment  prediction
 count  42.000000   42.000000
 mean   -0.023810   -0.006757
 std     0.840676    0.015840
 min    -1.000000   -0.051758
 25%    -1.000000   -0.011054
 50%     0.000000   -0.005780
 75%     1.000000    0.004052
 max     1.000000    0.017951)

score predictions by the sign

In [113]:
print(sum((validation["sentiment"] * validation["prediction"]) > 0), "/", len(validation))

17 / 42


prediction corrected by validation average (but better if we correct with training average)

In [118]:
print(sum((validation["sentiment"] * (validation["prediction"] + validation["prediction"].mean())) > 0), "/", len(validation))

17 / 42
