In [1]:
import bz2
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from tqdm.notebook import tqdm
import pickle
import plotly.graph_objects as go

In [26]:
def _top_keywords_by_quarter(df, top_n=10):
    """Count the most frequent keywords for every calendar quarter present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``date`` column (anything ``pd.to_datetime`` accepts) and a
        ``keywords`` column in which each row is an iterable of entries whose
        first element is the keyword.
    top_n : int
        Maximum number of keywords kept per quarter.

    Returns
    -------
    list of (int, pandas.Series)
        ``(quarter_index, counts)`` pairs ordered by quarter, where
        ``quarter_index`` is 0 for January-March up to 3 for October-December
        and ``counts`` maps keyword -> number of occurrences.
    """
    if df.empty:
        return []

    # Zero-based quarter of every row, kept as a standalone array so the
    # caller's DataFrame is never modified. Grouping by a plain array also
    # sidesteps index alignment, which matters for frames with duplicate
    # index labels (e.g. the result of a plain pd.concat).
    months = pd.to_datetime(df['date']).dt.month
    quarter_of_row = ((months - 1) // 3).to_numpy()

    per_quarter = []
    # Rows whose date is missing get a NaN key and are dropped by groupby.
    for quarter, keyword_lists in df['keywords'].groupby(quarter_of_row):
        keywords = [entry[0] for row in keyword_lists for entry in row]
        counts = pd.Series(keywords, dtype=object).value_counts().iloc[:top_n]
        # int() because the key is a float when the date column contained
        # missing values, and a float cannot index the quarter-name list.
        per_quarter.append((int(quarter), counts))
    return per_quarter


def generate_word_count_plot(df, year,
                             output_dir="C:/Users/jozef/Desktop/quotebank/word_count_plots"):
    """Write an interactive bar chart of the top 10 keywords per quarter to HTML.

    One bar trace is created for each quarter that occurs in ``df``; a slider
    switches which trace is visible and updates the figure title accordingly.
    The figure is saved as ``<output_dir>/word_count_plot_<year>.html``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column and a ``keywords`` column (each row an
        iterable of entries whose first element is the keyword). The frame is
        not modified.
    year : str
        Only used to build the output file name and the warning message.
    output_dir : str
        Directory the HTML file is written to. It must already exist.

    Returns
    -------
    None
        If ``df`` contains no row with a usable date, a warning is issued and
        no file is written.
    """
    import warnings

    quarter_names = ['first', 'second', 'third', 'fourth']

    per_quarter = _top_keywords_by_quarter(df)
    if not per_quarter:
        warnings.warn(f"No dated quotes found for {year}; no word count plot was written.")
        return

    fig = go.Figure()

    # One trace per quarter; only the first one is shown initially.
    for position, (_, counts) in enumerate(per_quarter):
        fig.add_trace(
            go.Bar(visible=(position == 0), name='Most frequent words',
                   x=counts.index, y=counts.values)
        )

    titles = [
        f"Top 10 most frequent keywords in the {quarter_names[quarter]} quarter"
        for quarter, _ in per_quarter
    ]

    # One slider step per trace: show exactly that trace and set its title.
    steps = []
    for position, (quarter, _) in enumerate(per_quarter):
        visibility = [False] * len(per_quarter)
        visibility[position] = True
        steps.append(dict(
            method="update",
            # Without an explicit label the slider displays "step-<n>".
            label=quarter_names[quarter],
            args=[{"visible": visibility},
                  {"title": titles[position]}],  # layout attribute
        ))

    fig.update_layout(
        sliders=[dict(
            active=0,
            pad={"t": 50},
            currentvalue={"prefix": "Quarter: "},
            steps=steps,
        )],
        title=titles[0],
    )

    fig.write_html(f"{output_dir}/word_count_plot_{year}.html")

## Load data and generate the word-count plots

In [27]:
# Years that are stored as one file each. 2019 is left out on purpose: its
# data is split across several files and is loaded separately.
years = [str(y) for y in range(2008, 2021) if y != 2019]

for year in years:
    # read_json infers the bz2 compression from the file extension.
    quotes_path = f'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_{year}.json.bz2'
    yearly_quotes = pd.read_json(quotes_path)
    generate_word_count_plot(yearly_quotes, year)

In [28]:
# The 2019 data is stored in four files suffixed -1 to -4. Read each one, then
# combine them with a single concat: growing a DataFrame inside the loop
# re-copies all accumulated rows on every pass, and starting from an empty
# placeholder frame is a pattern recent pandas versions warn about.
frames_2019 = []
for i in range(1, 5):
    path_to_out = f'C:/Users/jozef/Desktop/quotebank/processed_western_quotes/processed_western_quotes_2019-{i}.json.bz2'
    frames_2019.append(pd.read_json(path_to_out))

# Index labels are kept exactly as read from the files.
df_19 = pd.concat(frames_2019)
generate_word_count_plot(df_19, '2019')