# 05-plotting-novelty-GLES

In [1]:
import numpy as np
import pandas as pd
from itertools import *

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# Global matplotlib defaults shared by every figure in this notebook.
mpl.rcParams.update({
    # on-screen figure geometry and resolution
    'figure.figsize': [7,8],
    'figure.dpi': 80,
    # resolution used by savefig when no explicit dpi is given
    'savefig.dpi': 200,
    # text and line sizing
    'font.size': 17,
    'legend.fontsize': 'large',
    'figure.titlesize': 'medium',
    'lines.linewidth': 2.5,
    'lines.markersize': 10,
})

# NOTE(review): seaborn's 'poster' context rescales font and line sizes, so it
# likely overrides some of the values set just above -- confirm against the
# seaborn documentation before relying on them.
sns.set_context('poster')

from pprint import pprint

import imp

from scipy.stats import entropy
# log2(e) as a constant.
# NOTE(review): presumably the factor that converts an entropy computed in
# nats into bits; it is not used in the visible cells -- confirm with callers.
entropy_fix = np.log2(np.e)

In [33]:
from datetime import datetime

# Load the tab-separated table of raw texts; its first column is the index.
df_all_raw_texts = pd.read_csv(
    'data/C8_all_raw_texts_and_labels.csv',
    index_col=0,
    sep='\t',
    encoding='utf-8',
)
# Keep only the rows whose 'text' field is not null.
df_all_raw_texts = df_all_raw_texts[df_all_raw_texts['text'].notnull()]

# One datetime per distinct time window, in sorted label order; only the
# leading 'YYYY-MM-DD' part (first 10 characters) of each label is parsed.
all_dates = [
    datetime.strptime(window_label[:10], '%Y-%m-%d')
    for window_label in sorted(df_all_raw_texts.time_window.unique())
]

# Novelty plots for different numbers of topics (`n_topics`)

In [106]:
# Number of topics; it selects which per-quarter novelty CSV files are read
# below and is embedded in the names of the saved images.
n_topics = 30

In [107]:
# Indices of the per-quarter novelty files to load (1 through 31 inclusive).
all_quarts = range(1,32)

# Read every per-quarter file first and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously loaded rows on each iteration.
quarterly_frames = []
for q in all_quarts:
    infile = 'data/df_%d_topics_novelty_quarterly_%d.csv' % (n_topics, q)
    df = pd.read_csv(infile, index_col=0)
    quarterly_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame, which is
# what the original append-based loop produced when there was nothing to load.
if quarterly_frames:
    df_all_novelty = pd.concat(quarterly_frames, ignore_index=True)
else:
    df_all_novelty = pd.DataFrame()

df_all_novelty.head()

Unnamed: 0,date_idx,doc_idx1,doc_idx0,KL,jensen,bhatta,mutual
0,1,1153,1,0.864858,0.175855,0.207842,0.007867
1,1,1153,3,0.864858,0.175855,0.207842,0.007867
2,1,1153,6,0.0,0.0,-0.0,1.0
3,1,1153,8,0.0,0.0,-0.0,1.0
4,1,1153,9,0.864858,0.175855,0.207842,0.007867


In [108]:
# The novelty measures are the last four columns of the combined table.
measure_names = df_all_novelty.columns[-4:]

# For each date index (in order of first appearance) collect one row holding
# the mean, respectively the standard deviation, of every measure.
mean_rows = []
std_rows = []
for date_idx in df_all_novelty.date_idx.unique():
    rows_for_date = df_all_novelty.loc[df_all_novelty.date_idx == date_idx]
    mean_rows.append([ rows_for_date[measure].mean() for measure in measure_names ])
    std_rows.append([ rows_for_date[measure].std() for measure in measure_names ])

# Stack into 2-D arrays: one row per date index, one column per measure.
mean_novelty_over_time = np.array(mean_rows)
std_novelty_over_time  = np.array(std_rows)

In [109]:
# Time series of every novelty measure: one stacked panel per measure showing
# the mean and the mean +/- one standard deviation for each date.
sns.set_context('poster')

# x-axis values: every date except the first.
# NOTE(review): presumably the first time window has no novelty value because
# there is nothing earlier to compare it against -- confirm.
time = all_dates[1:]
fig, axs = plt.subplots(4,1,figsize=(20,16))
for i,ax in enumerate(axs):
    mean_i  = mean_novelty_over_time[:,i]
    upper_i = mean_i + std_novelty_over_time[:,i]
    lower_i = mean_i - std_novelty_over_time[:,i]

    ax.plot(time, mean_i,  'o-', alpha=0.95)
    ax.plot(time, upper_i, 'o-', c='dodgerblue', alpha=0.65)
    ax.plot(time, lower_i, 'o-', c='dodgerblue', alpha=0.65)
    # nanmax instead of the builtin max: a date with a single row has a NaN
    # standard deviation, and builtin max gives order-dependent results with NaN.
    ax.set_ylim(0, np.ceil(np.nanmax(upper_i)) )
    ax.set_ylabel(measure_names[i])

plt.tight_layout()
# The figure contains all measures, so the file name lists all of them. The
# previous code formatted the name with `measure`, which this cell never
# defines: it only resolved through leftover kernel state and raises NameError
# on a fresh Python 3 run.
measures_tag = '-'.join(measure_names)
fig.savefig('images/novelty_over_time_%s_%dtopics.png' % (measures_tag, n_topics), dpi=200)
plt.close(fig)

In [110]:
# Ridge ("joy") plots: for every novelty measure, draw one kernel-density curve
# per date index, stacked vertically with overlap, and save one image per measure.

sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)}, font_scale=1.2)


# Defined once, before the loop, instead of being re-created for every measure.
def label(x, color, label):
    """Write the facet's label in bold near the left edge of the current axes.

    Intended to be passed to ``FacetGrid.map``, which supplies ``color`` and
    ``label`` for the facet being drawn.

    x     -- the mapped data column (unused).
    color -- colour used for the text.
    label -- text to display.
    """
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)


for measure in measure_names:

    df = df_all_novelty.filter([ 'date_idx', measure ])

    # One palette colour per facet row. This used to be overwritten with a
    # hard-coded 31, which only matches when exactly 31 date indices are loaded.
    n_dates = len(df['date_idx'].unique())
    pal = sns.cubehelix_palette(n_dates, rot=-.25, light=.7)

    # Initialize the FacetGrid object: one thin row per date index.
    # NOTE(review): `size`, `shade` and `bw` are the older seaborn spellings
    # (`height`, `fill`, `bw_method`/`bw_adjust` in newer releases); they are
    # left unchanged to match the seaborn version this notebook targets.
    g = sns.FacetGrid(df, row="date_idx", hue="date_idx",
                      aspect=10, size=.5, palette=pal,
                      xlim=(-0.5, np.ceil(df[measure].max())))

    # Draw the densities in a few steps: filled curve, white outline on top,
    # then a baseline. The bandwidth is 5% of the measure's value range.
    bw = 0.05*(df[measure].max() - df[measure].min())
    g.map(sns.kdeplot, measure, clip_on=False, shade=True, alpha=1, lw=1.5, bw=bw)
    g.map(sns.kdeplot, measure, clip_on=False, color="w",           lw=2.5, bw=bw)
    g.map(plt.axhline, y=0, lw=2, clip_on=False)

    # Label each row with its date index, in axes coordinates.
    g.map(label, measure)

    # Set the subplots to overlap
    g.fig.subplots_adjust(hspace=-.5)

    # Remove axes details that don't play well with overlap
    g.set_titles("")
    g.set(yticks=[])
    g.despine(bottom=True, left=True)

    g.fig.set_figheight(6.0)
    g.fig.set_figwidth(10.0)

    plt.subplots_adjust(left=0.1)
    plt.savefig('images/joyplots_%dtopics_%s.png' % (n_topics, measure), dpi=200)
    plt.close()