# 05-measure-novelty-from-aggregated-topics

In [1]:
import numpy as np
import pandas as pd
from itertools import *

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Global matplotlib defaults for every figure drawn in this notebook.
# Figure geometry and resolution (on-screen dpi vs. dpi used by savefig).
mpl.rcParams['figure.figsize'] = [7,8]
mpl.rcParams['figure.dpi'] = 80
mpl.rcParams['savefig.dpi'] = 200

# Text, line and marker sizes.
mpl.rcParams['font.size'] = 17
mpl.rcParams['legend.fontsize'] = 'large'
mpl.rcParams['figure.titlesize'] = 'medium'
mpl.rcParams['lines.linewidth'] = 2.5
mpl.rcParams['lines.markersize'] = 10

# Applied after the rcParams above, so seaborn's 'talk' context may override
# some of the sizes set there.
sns.set_context('talk')

from pprint import pprint

# NOTE(review): `imp` is deprecated and no longer ships with Python 3.12+;
# nothing in the visible cells uses it - confirm before running on a newer kernel.
import imp

from scipy.stats import entropy
# Conversion factor log2(e): multiplying a quantity measured in nats by it
# yields bits. Used by rel_entr() to rescale scipy's entropy() result.
entropy_fix = np.log2(np.e)

In [2]:
years = range(1947,2017)

infiles = [ 'data/%d.csv' % d for d in years ]

# Load the per-year article tables, keep only rows that have a text, and
# drop the text column itself (only the remaining columns are used below).
# The yearly frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and growing a frame inside a
# loop re-copies all previous rows on every iteration.
yearly_frames = []

for infile in infiles:

    df_year = pd.read_csv(infile, index_col=0)
    df_year = df_year[pd.notnull(df_year['text'])]
    df_year = df_year.drop(columns='text')

    # uncomment for short run
    #df_year = df_year.head(50)

    yearly_frames.append(df_year)

# create DataFrame for all articles (original row index of each file is kept)
df_alldocs = pd.concat(yearly_frames)

In [18]:
# Size of the topic model to analyse.
# Values noted by the author as alternatives: 10, 15, 20, 25, 50.
n_topics = 15

# Both tables are keyed by their first CSV column.
word_dist_path = 'data/topic_word_distributions_'+str(n_topics)+'topics.csv'
doc_dist_path  = 'data/document_topic_distributions_'+str(n_topics)+'topics.csv'

words_per_topic     = pd.read_csv(word_dist_path, index_col=0)
topics_per_document = pd.read_csv(doc_dist_path, index_col=0)

# An earlier version of these files lives under 'data/olddata/' with the
# same file names.

In [4]:
# Preview the first rows of the document-topic table (one row per document id,
# one column per topic).
topics_per_document.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10
d-41122662,0.0125,0.0125,0.0125,0.0125,0.6375,0.0125,0.0125,0.0125,0.2625,0.0125
d-41122648,0.008333,0.008333,0.008333,0.008333,0.091667,0.008333,0.341667,0.008333,0.508333,0.008333
d-41122673,0.000826,0.009091,0.091736,0.389256,0.000826,0.000826,0.009091,0.496694,0.000826,0.000826
d-41122667,0.000935,0.000935,0.12243,0.000935,0.000935,0.000935,0.000935,0.76729,0.103738,0.000935
d-41122630,0.059494,0.000422,0.308439,0.464557,0.000422,0.000422,0.063713,0.093249,0.008861,0.000422


## Aggregate topic distributions per period

In [5]:
from datetime import datetime
import time

def to_date(date):
    """Return the Monday of a (year, week) pair as a naive datetime.

    Parameters
    ----------
    date : sequence
        ``date[0]`` is the year and ``date[1]`` the week number; both are
        formatted with ``str.format`` and parsed with ``'%Y %W %w'``, i.e.
        the week is interpreted as a Monday-based week of the year and the
        weekday is fixed to 1 (Monday).

    Returns
    -------
    datetime.datetime
        Midnight of the Monday of that week.

    Notes
    -----
    Parsing is done directly with ``datetime.strptime``. The previous
    round-trip through ``time.mktime``/``datetime.fromtimestamp`` depended on
    the local timezone and on the platform's ``mktime`` range, which is not
    guaranteed to cover pre-1970 dates (this corpus starts in 1947).
    """
    return datetime.strptime('{} {} 1'.format(date[0], date[1]), '%Y %W %w')

# Map every article's `filename` to (Monday of its publication week, week number).
# The three columns are zipped directly instead of using iterrows(), which
# builds a full Series object for each of the several hundred thousand rows.
doc_to_date = { filename: (to_date((year, week)), week)
                for filename, year, week in zip(df_alldocs['filename'],
                                                df_alldocs['year'],
                                                df_alldocs['week']) }

In [19]:
# Distinct (week-start datetime, week number) pairs over all articles.
# The result is a set, so any ordering would be discarded anyway; the
# previous sorted() call was wasted work. Use sorted(all_dates) at the point
# of use if a chronological sequence is needed.
all_dates = set(doc_to_date.values())
# Document ids that have a topic distribution.
all_documents = topics_per_document.index.values

In [20]:
# Sum the per-document topic distributions into one vector per period, at
# four granularities. agg_topics[period_name][period_label] -> summed vector.
period_names = ['week','month','quarter','year']
agg_topics = { pn:{} for pn in period_names }

n_docs = len(doc_to_date)
for i, (doc_id, dates) in enumerate(doc_to_date.items(), start=1):
    if i % 20000 == 0:
        print('{}/{} done'.format(i,n_docs))

    # Only the lookup can legitimately fail (article without a topic row),
    # so only the lookup is guarded.
    try:
        doc_topics = topics_per_document.loc[doc_id]
    except KeyError as e:
        print(e, 'not found')
        continue

    # A duplicated document id makes .loc return a DataFrame (one row per
    # duplicate); keep the first row in that case.
    if doc_topics.ndim > 1:
        doc_topics = doc_topics.iloc[0]
    topics = doc_topics.values

    # dates = (Monday of the publication week, week number)
    year   = dates[0].year
    quarter= (dates[0].month-1)//3     # zero-based: 0..3
    month  = dates[0].month
    week   = dates[1]

    year_s    = str(year)
    quarter_s = str(year)+'-'+str(quarter).zfill(2)
    month_s   = str(year)+'-'+str(month).zfill(2)
    week_s    = str(year)+'-'+str(week).zfill(2)

    periods = [week_s,month_s,quarter_s,year_s]
    for period_name, period in zip(period_names, periods):
        period_totals = agg_topics[period_name]
        if period not in period_totals:
            # Each accumulator must own its memory. Storing `topics` itself
            # would share one array between the week/month/quarter/year
            # dicts (and possibly with the DataFrame), so the in-place `+=`
            # below would add later documents several times over.
            period_totals[period] = topics.copy()
        else:
            period_totals[period] += topics

20000/308456 done
'd-41762153' not found
'd-42623174' not found
'd-43063223' not found
40000/308456 done
'd-45139792' not found
'd-45123645' not found
'd-45137470' not found
'd-46171586' not found
'd-46174488' not found
60000/308456 done
80000/308456 done
'd-41392837' not found
100000/308456 done
120000/308456 done
140000/308456 done
160000/308456 done
180000/308456 done
200000/308456 done
220000/308456 done
'd-16098353' not found
'd-15876127' not found
'd-15930865' not found
240000/308456 done
'd-26109937' not found
260000/308456 done
'd-47209198' not found
280000/308456 done
'd-81015385' not found
'd-80165638' not found
'd-80362966' not found
'd-80362855' not found
'd-85157643' not found
'd-100591164' not found
300000/308456 done


In [21]:
newspaper = 'Spiegel'
period_names = ['week','month','quarter','year']

# One CSV per granularity: rows are period labels, columns are the summed
# topic weights of that period.
for pn in period_names:
    period_totals = agg_topics[pn]
    outfile = '../Null-models/data/aggregate_{}topics_per_{}_{}.csv'.format(n_topics,pn,
                                                                            newspaper)
    labels  = list(period_totals.keys())
    vectors = list(period_totals.values())
    pd.DataFrame(vectors, index=labels).to_csv(outfile)

## Read the aggregated topic distributions back from disk

In [22]:
newspaper = 'Spiegel'
period_names = ['week','month','quarter','year']

# agg_dfs[period_name] -> DataFrame of summed topic weights, one row per
# period, rows sorted by period label.
# This cell reads only from the CSV files, so it can be run without
# recomputing the in-memory aggregation.
agg_dfs = {}

for pn in period_names:
    infile = '../Null-models/data/aggregate_{}topics_per_{}_{}.csv'.format(n_topics,pn,
                                                                           newspaper)
    df = pd.read_csv(infile, index_col=0)
    agg_dfs[pn] = df.sort_index()

In [23]:
# Novelty measures

from numpy.linalg import norm
from scipy.stats import entropy
from scipy.spatial.distance import hamming
from scipy.spatial.distance import euclidean
from sklearn.metrics import mutual_info_score

def rel_entr(P, Q):
    """Relative entropy of P with respect to Q, rescaled by `entropy_fix`.

    `scipy.stats.entropy(P, Q)` is computed first and then multiplied by the
    module-level factor `entropy_fix` (log2(e)), converting the value to bits.
    """
    divergence = entropy(P, Q)
    return divergence * entropy_fix

def JSD(P, Q):
    """Jensen-Shannon divergence between P and Q.

    Both inputs are first scaled by their L1 norm; the result is the mean of
    the two relative entropies (via `rel_entr`) against the midpoint
    distribution.
    """
    p_unit = P / norm(P, ord=1)
    q_unit = Q / norm(Q, ord=1)
    midpoint = 0.5 * (p_unit + q_unit)
    p_term = rel_entr(p_unit, midpoint)
    q_term = rel_entr(q_unit, midpoint)
    return 0.5 * (p_term + q_term)

def BCD(P,Q):
    """Bhattacharyya distance between P and Q, using a base-2 logarithm.

    Parameters
    ----------
    P, Q : array-like of non-negative weights, same length
        Each is scaled by its L1 norm before use.

    Returns
    -------
    float
        ``-log2(sum(sqrt(P_i * Q_i)))``. The result is ``inf`` when the two
        inputs have no overlapping support (coefficient 0).

    Notes
    -----
    The computation is done in float64; the earlier float32 cast limited the
    result to about seven significant digits. The coefficient is clamped to
    at most 1 because rounding can push it marginally above 1 for (near-)
    identical inputs, which would produce a small negative "distance".
    """
    _P = np.asarray(P, dtype=np.float64)
    _Q = np.asarray(Q, dtype=np.float64)
    _P = _P / norm(_P, ord=1)
    _Q = _Q / norm(_Q, ord=1)
    BC = min(np.dot(np.sqrt(_P), np.sqrt(_Q)), 1.0)
    return -np.log2(BC)

def MI(_P,_Q):
    """Return sklearn's `mutual_info_score` of the two arguments, unchanged.

    NOTE(review): sklearn documents `mutual_info_score` as a score between
    two label assignments of the same samples; passing probability vectors
    here may not measure what is intended - confirm before relying on the
    'MI' metric.
    """
    return mutual_info_score(_P,_Q)

def novelty(p,q, metric='KL'):
    """Score how much distribution `p` differs from reference `q`.

    Parameters
    ----------
    p, q : array-like
        Topic weight vectors of equal length.
    metric : str, default 'KL'
        'KL'        relative entropy of p w.r.t. q (`rel_entr`)
        'hamming'   Hamming distance between the supports (p>0 vs. q>0)
        'euclidean' Euclidean distance
        'JSD'       Jensen-Shannon divergence (`JSD`)
        'BCD'       Bhattacharyya distance (`BCD`)
        'MI'        mutual information score (`MI`)

    Returns
    -------
    The value of the selected measure. For an unrecognised `metric` the
    function still returns 0 (as it always has), but now emits a
    `UserWarning` so that a misspelt metric name does not silently produce
    an all-zero novelty series.
    """
    import warnings

    # Lambdas keep the lookup lazy: only the selected measure is evaluated.
    measures = {
        'KL':        lambda: rel_entr(p,q),
        'hamming':   lambda: hamming(p>0,q>0),
        'euclidean': lambda: euclidean(p,q),
        'JSD':       lambda: JSD(p,q),
        'BCD':       lambda: BCD(p,q),
        'MI':        lambda: MI(p,q),
    }

    measure = measures.get(metric)
    if measure is None:
        warnings.warn('unknown novelty metric {!r}; returning 0'.format(metric),
                      stacklevel=2)
        return 0
    return measure()

In [24]:
# KLs[period_name] -> list of (period label, novelty of that period relative
# to the preceding row), using novelty()'s default metric.
KLs = {}

for pn in period_names:
    df = agg_dfs[pn]
    scores = []
    # Walk over consecutive rows: (previous label, current label).
    for tm1, t in zip(df.index[:-1], df.index[1:]):
        # Rescale each aggregated vector so that it sums to 1.
        topics_t   = df.loc[t]/df.loc[t].sum()
        topics_tm1 = df.loc[tm1]/df.loc[tm1].sum()
        scores.append((t, novelty(topics_t, topics_tm1)))
    KLs[pn] = scores

In [25]:
newspaper = 'Spiegel'
period_names = ['week','month','quarter','year']

# Write one header-less two-column CSV (period label, novelty) per granularity.
for pn in period_names:
    outfile = '../Null-models/data/agg_novelty_{}topics_per_{}_{}.csv'.format(n_topics,pn,
                                                                              newspaper)
    print('Printing', outfile)
    with open(outfile, 'w') as f:
        for period, score in KLs[pn]:
            f.write('{},{}\n'.format(period, score))

Printing ../Null-models/data/agg_novelty_15topics_per_week_Spiegel.csv
Printing ../Null-models/data/agg_novelty_15topics_per_month_Spiegel.csv
Printing ../Null-models/data/agg_novelty_15topics_per_quarter_Spiegel.csv
Printing ../Null-models/data/agg_novelty_15topics_per_year_Spiegel.csv
