## Imports

In [1]:
import nltk
# Fetch the VADER lexicon used by nltk.sentiment.vader (imported in the next cell);
# nltk reports the package as up-to-date when it was already downloaded.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/elibol/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer # For sentiment analysis
import cPickle as pickle # For loaded dataset from pickle file
import tqdm # Progress bar
from collections import Counter # Handy addon
from pprint import pprint # Useful to print JSON objects
import numpy as np



## Load the dataset of articles with introductions 

In [3]:
# This is internal, to generate the dataset, feel free to remove it in your file
ntopics = {15664: "Brexit", 14723: "ISIS War"}
match = {'$match': {'ntopic': {'$in': ntopics.keys()}}}
project = {'$project': {'_id': 0, 'pubtime': 1, "title": 1, "source": 1, "url": 1, "introductions": 1, 'ntopic': 1}}

articles = list(db.article.aggregate([match, project]))


for article in articles:
    article['news_topic'] = ntopics[article['ntopic']]
    del article['ntopic']

with open("news_sentiment.pickle", "w") as f:
    pickle.dump(articles, f)

NameError: name 'db' is not defined

In [4]:
# This loads the file that you want, might take several seconds (up to a minute)

with open("news_sentiment.pickle", "r") as f:
    articles = pickle.load(f)
print len(articles), "articles were loaded"
print "Example article:"
pprint(articles[1040])


57767 articles were loaded
Example article:
{u'introductions': [{u'person': u'Bashar al-Assad',
                     u'text': u'President',
                     u'wdid': u'Q44329'},
                    {u'person': u'Emile Hokayem',
                     u'text': u'in Foreign Policy'},
                    {u'person': u'Ahrar al Sham',
                     u'text': u'the most important groups',
                     u'wdid': u'Q860943'},
                    {u'person': u'Vladimir Putin',
                     u'text': u'Russian President',
                     u'wdid': u'Q7747'},
                    {u'person': u'Barack Obama',
                     u'text': u'U.S. President',
                     u'wdid': u'Q76'},
                    {u'person': u'Osama Abu Zeid',
                     u'text': u'a senior adviser to the moderate Free Syrian Army'},
                    {u'person': u'Op-Ed',
                     u'text': u'for The Washington Post',
                     u'wdid': u'Q2602337'},
 

In [7]:
# separate articles from the two stories
ISIS_articles = []
Brexit_articles = []
for a in articles:
    if a["news_topic"] == 'ISIS War':
        ISIS_articles.append(a)
    else:
        Brexit_articles.append(a)
        
print len(ISIS_articles), " articles from ISIS War and ", len(Brexit_articles), "articles from Brexit were loaded"

 39206  articles from ISIS War and  18561 articles from Brexit were loaded


In [8]:
# Restrict the rest of the analysis to a single story; you can change this
# (e.g. to Brexit_articles). Note that this rebinds `articles`, which until
# this cell held the articles of both stories.
articles = ISIS_articles

## Extract introductions, and obtain their sentiment

In [9]:
analyzer = SentimentIntensityAnalyzer()

# Flatten the introductions of every article into a single list, tagging each
# introduction (in place) with the source of the article it was found in.
total_introductions = []
for article in articles:
    article_intros = article.get('introductions', [])  # articles without introductions contribute nothing
    for mention in article_intros:
        mention['source'] = article['source']
    total_introductions.extend(article_intros)

# Score the text of each introduction with VADER and keep the 'compound' value
# under the 'sentiment' key of the introduction.
for mention in tqdm.tqdm_notebook(total_introductions):
    scores = analyzer.polarity_scores(mention['text'])
    mention['sentiment'] = scores['compound']

HBox(children=(IntProgress(value=0, max=214880), HTML(value=u'')))




In [10]:
# Example some sentiment for some of the introductions

subsample = np.random.choice(total_introductions, 100)
for intro in subsample:
    if intro['sentiment'] != 0: #print a few <entity,apposition,sentiment> values where sentiment !=0
        print "---------------"
        print "Entity mentionned:", intro['person']
        print intro['text']
        print "Sentiment:", intro['sentiment']

---------------
Entity mentionned: Asaad Hanna
a spokesman for the Free Syrian Army
Sentiment: 0.5106
---------------
Entity mentionned: Gulen
who has denied any involvement in the failed putsch
Sentiment: -0.7351
---------------
Entity mentionned: Al-Naim
where the jihadists once displayed the severed heads of their enemies
Sentiment: -0.6908
---------------
Entity mentionned: Moammar Gadhafi
Libyan strongman
Sentiment: 0.1779
---------------
Entity mentionned: Hama
a center of resistance where activists said dozens had been killed in new attacks
Sentiment: -0.8126
---------------
Entity mentionned: Fethullah Gulen
the US - based preacher who is accused of masterminding the failed July coup aimed at ousting President Recep Tayyip Erdogan
Sentiment: -0.6705
---------------
Entity mentionned: Ake Sellstrom
a former U.N. weapons inspector in Iraq
Sentiment: -0.4404
---------------
Entity mentionned: Davutoglu
a more mild - mannered academic and former diplomat who lacks Erdogan 's natura

## Build a 2-dimensional object containing sentiment per entity, per source

In [11]:
# ent_source_sent[entity][source] is the list of sentiment values of all the
# introductions of `entity` found in articles from `source`.
ent_source_sent = {}

for intro in total_introductions:
    per_source = ent_source_sent.setdefault(intro['person'], {})  # create the entity entry on first sight
    per_source.setdefault(intro['source'], []).append(intro['sentiment'])

In [12]:
# An example of how one entity (a city) is described by different sources

print ent_source_sent['Aleppo'] # sentiments across sources for the single entity 'Aleppo'

{u'nytimes.com': [0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, -0.5574, 0.0, 0.0, 0.0, 0.0], u'allafrica.com': [-0.5994], u'bloomberg.com': [-0.5994, 0.0, 0.0, -0.2023, 0.0, -0.4404, -0.1531, -0.1531, 0.0, 0.0], u'bbc.co.uk': [0.0516, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.1531, -0.3182, -0.5994, -0.5994, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.1531, 0.0, 0.0, 0.0], u'theguardian.com': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.7096, 0.0, -0.1531, 0.0], u'telegraph.co.uk': [0.4019, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3182, 0.4404, -0.296, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3818, -0.1531, 0.0, -0.1531, 0.0, 0.0, 0.0, -0.3182, -0.1531, 0.0, 0.0, 0.0, 0.0, 0.0, -0.3612, 0.2023, -0.1531, 0.0, 0.0, -0.1531, -0.1531, 

In [13]:
# We get rid of entities that don't contain enough data

entities_kept = []

for entity in ent_source_sent.keys():
    sentiments = ent_source_sent[entity] # collect sentiments across all sources for certain entity
    total_size = sum([len(sentiments[source]) for source in sentiments.keys()])
    if total_size >= 3: # only keep entities that 3 or more sources mention
        entities_kept.append(entity)
print "We will keep a total of", len(entities_kept), " / ", len(ent_source_sent.keys()) ," entities in our dataset"

sources = set([])
for entity in entities_kept:
    sources|= set(ent_source_sent[entity].keys())
sources = list(sources)

print "We have ", len(sources), "sources: ", sources

We will keep a total of 7852  /  25128  entities in our dataset
We have  22 sources:  [u'telegraph.co.uk', u'foxnews.com', u'ap.org', u'businessinsider.in', u'independent.co.uk', u'reuters.com', u'wikinews.org', u'cnn.com', u'techcrunch.com', u'aa.com.tr', u'allafrica.com', u'nytimes.com', u'bloomberg.com', u'bbc.co.uk', u'latimes.com', u'rt.com', u'france24.com', u'chinadaily.com.cn', u'theguardian.com', u'washingtonpost.com', u'middleeasteye.net', u'aljazeera.com']


## We create the array we will use in our sparse model

In [14]:
# this converts sentiments for same actor for same source into aggregate sentiment (which is discrete and is 1,0,or-1)
# Parameters: changing these affects the results you get
Pos_neg_ratio = 2.0
overall_ratio = 0.15
pos_threshold = 0.15
neg_threshold = -0.15

N = len(entities_kept)
M = len(sources)
A = np.zeros((N, M))

sentiment_counts = Counter()

source2j = {source: j for j, source in enumerate(sources)}

for i, entity in enumerate(entities_kept):
    for source in ent_source_sent[entity].keys():
        sent_array = np.array(ent_source_sent[entity][source])
        N_pos = float(len(np.where(sent_array > pos_threshold)[0])) # count sentiments that are positive enough
        N_neg = float(len(np.where(sent_array < neg_threshold)[0]))
        T = float(len(sent_array))
        aggregate_sentiment = 0
        if N_pos > Pos_neg_ratio*N_neg and N_pos > overall_ratio*T:
            aggregate_sentiment = 1 
        elif N_neg > Pos_neg_ratio*N_pos and N_neg > overall_ratio*T:
            aggregate_sentiment = -1
        j = source2j[source]
        
        A[i,j] = aggregate_sentiment
        
        sentiment_counts[aggregate_sentiment] += 1 #keeps track of #1,0,-1s assigned

print "We allocated some sentiment in this matrix, the repartition is:", sentiment_counts

We allocated some sentiment in this matrix, the repartition is: Counter({0: 19061, 1: 3650, -1: 2670})


## Model source similarity

In [9]:
# Write code that uses this matrix (entities, sources) to compute
# source similarity visible in bias of the way they describe entities

In [16]:
# reference on sklearn's graph lasso: http://scikit-learn.org/stable/modules/generated/sklearn.covariance.GraphLasso.html
from sklearn.covariance import GraphLasso # our Algo code should replace this and input/output the same thing
graph_lasso = GraphLasso(alpha=0.00001) # alpha =  regularization parameter: the higher alpha, the more regularization, the sparser the inverse covariance.
graph_lasso.fit(A) # A is the aggregated sentiment matrix, an arrray of (n_samples, n_features)
np.mean(graph_lasso.get_precision() > 0) #calculates avg of the precision matrix elements that are >0

# print pairs of sources for which precision matrix has pos val
# when precision matrix is pos, source pairs are likely to have same sentiment
# when precision matrix is neg, source pairs are likely to have opposit sentiment
for (i, j) in zip(*np.where(graph_lasso.get_precision() > 0)):
    if i > j: #since precision matrix is symmetric, only need to print upper half
        print sources[i], sources[j]

businessinsider.in foxnews.com
reuters.com ap.org
wikinews.org ap.org
wikinews.org businessinsider.in
wikinews.org independent.co.uk
wikinews.org reuters.com
cnn.com ap.org
techcrunch.com foxnews.com
techcrunch.com ap.org
techcrunch.com independent.co.uk
techcrunch.com reuters.com
aa.com.tr techcrunch.com
allafrica.com businessinsider.in
allafrica.com reuters.com
nytimes.com ap.org
nytimes.com allafrica.com
bbc.co.uk ap.org
bbc.co.uk techcrunch.com
latimes.com telegraph.co.uk
latimes.com reuters.com
latimes.com techcrunch.com
latimes.com nytimes.com
latimes.com bloomberg.com
rt.com ap.org
rt.com businessinsider.in
rt.com latimes.com
france24.com ap.org
france24.com wikinews.org
france24.com techcrunch.com
france24.com allafrica.com
france24.com latimes.com
chinadaily.com.cn cnn.com
chinadaily.com.cn nytimes.com
theguardian.com techcrunch.com
theguardian.com allafrica.com
theguardian.com france24.com
washingtonpost.com chinadaily.com.cn
middleeasteye.net foxnews.com
middleeasteye.net te