# Plot figures for the website

We want to present our RDD results on the website in a more suitable, interactive way. To do so, we replot the time-series figures, together with the fitted regression lines, using the **plotly** package.

## 1. Load data

In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
from analysis.helpers.load import load_interventions, load_aggregated, load_topics
import pickle
import seaborn as sns
from scipy import stats
import math

import matplotlib
import statsmodels.formula.api as smf
import matplotlib.pyplot as mpl

import plotly.graph_objects as go
import plotly.express as px
import statsmodels.formula.api as smf
import statsmodels.api as sm
# %matplotlib inline

# Output directory for the generated HTML figures; every write_html call below is prefixed with it.
path = '../Covidpageviews/_includes/'

In [128]:
# Read the weekly keyword pageview data (only the weekly series is plotted in this notebook).
# NOTE(review): pickle.load can execute arbitrary code -- only open files from a trusted source.
with open('./data/keyword_pageviews_allcountry_weekly.pickle', 'rb') as pickle_file:
    all_pageviews = pickle.load(pickle_file)

# Sanity check: display the first row of the 'ja' non-covid pageview table.
all_pageviews['ja']['pageviews']['noncovid'].head(1)

Unnamed: 0,Hepatitis_C,Tuberculous_meningitis,Colorectal_cancer,Ovarian_cancer,Ulcerative_colitis,Type_2_diabetes,Cancer_pain,Hepatitis_E,Viral_hepatitis,Food_allergy,Cataract,Cancer_vaccine,Seborrhoeic_dermatitis,Diabetes,World_AIDS_Day,Nummular_dermatitis,Hepatitis_A_vaccine,Low_back_pain,Multi-drug-resistant_tuberculosis,Gestational_diabetes,Sepsis,Skin_allergy_test,Cancer_immunotherapy,Milk_allergy,University_of_Texas_MD_Anderson_Cancer_Center,Rhinitis,Appendicitis,Type_1_diabetes,Flatulence,Asepsis,Head_and_neck_cancer,Shivering,Allergic_rhinitis,Stasis_dermatitis,Oral_cancer,Penile_cancer,Pancreatic_cancer,Hepatitis_B_vaccine,Abdominal_pain,Tuskegee_syphilis_experiment,Endometrial_cancer,Hypertension,Gestational_hypertension,National_Cancer_Institute,Dana–Farber_Cancer_Institute,Breast_cancer,Prostate_cancer,Malnutrition,Dermatitis,HIV_vaccine,World_Cancer_Day,Colitis,Hives,Lymphocytic_choriomeningitis,Gallbladder_cancer,Atopic_dermatitis,Rheumatoid_arthritis,National_Institute_of_Allergy_and_Infectious_Diseases,Cervical_cancer,Laryngeal_cancer,Pulmonary_hypertension,Hepatitis_B,Testicular_cancer,Hepatitis,Inflammatory_breast_cancer,Varicose_veins,Lung_cancer,White_coat_hypertension,Ischemic_colitis,Rash,Viral_meningitis,Hearing_loss,Squamous_cell_skin_cancer,Bladder_cancer,American_Cancer_Society,Gastroenteritis,Conjunctivitis,Osteoarthritis,Athlete's_foot,Hereditary_nonpolyposis_colorectal_cancer,Liver_cancer,History_of_syphilis,Gout,Guatemala_syphilis_experiments,Neurosyphilis,Diabetes_management,Thyroid_cancer,Inflammation,Complications_of_diabetes,Frequent_urination,Systemic_lupus_erythematosus,Chickenpox,Contact_dermatitis,Paralysis,Skin_cancer,Dehydration,Hepatitis_D,Constipation,Oral_allergy_syndrome,Neonatal_sepsis,Meningitis,Hyperhidrosis,Hepatitis_B_virus,Irritant_diaper_dermatitis,Esophageal_cancer,Kidney_cancer,Portal_hypertension,HIV,Tuberculosis_management,Sleep_paralysis,Bruise,Vaginal_flatulence,Mycobacterium_tuberculosis,Prediabetes,Childhood
_cancer,Discoid_lupus_erythematosus,Tuberculosis,Allergy,Stomach_cancer,Cancer_stem_cell,Latex_allergy,Hypothyroidism,Cancer,Asthma,Noise-induced_hearing_loss,Hepatitis_A,Syphilis,Autoimmune_hepatitis,Appendix_cancer,Diabetes_insipidus
2019-01-06,686,78,935,318,3453,478,284,314,171,663,846,89,1738,2526,51,543,13.0,453,20.0,250,4464,0.0,188,418,0.0,455,2408,671,3466,18.0,72,878,242,2.0,180,304,1606,41,772,348,187,705,540,49,34,961,867,491,501,119,12.0,401,1730,30.0,326,1692,1455,10.0,601,482,102,858,741,980,72.0,297,939,117,562,410,212,905,255,352,6.0,2570,483,83,933,88,337,170,2817,190,0.0,105,295,1740,107,122,2614,4343,561,955,87,842,83,1216,306,52,1966,468,299,86,522,109,177,4541,0.0,4247,625,93,684,154,149,82,4956,2315,839,92,226,1235,3039,1431,663,367,6315,93,1120,506


In [129]:
# Name the index of the transposed 'it' covid pageview table 'index'.
# NOTE(review): this assigns to a temporary `.T` object; presumably it only has a lasting effect
# because the Index is shared with the original frame's columns -- confirm on your pandas version.
all_pageviews['it']['pageviews']['covid'].T.index.names = ['index']
# Load the intervention table; it is used below as interventions[language][event].
interventions = load_interventions('data/interventions.csv')

In [130]:
# The correlation results for Italy are consistently good, so we also plot the Italian results here.
language = 'it'

# Show all the events available for this language.
# Not every country/language has all of them, e.g. "English" only has "Mobility" & "Normalcy".
interventions[language].keys()

dict_keys(['1st case', '1st death', 'School closure', 'Public events banned', 'Lockdown', 'Mobility', 'Normalcy'])

## 2. Combine all the data in one dataframe without the detailed page names

In [131]:
# Global variables: the language editions and the top-level page categories processed below.

languages = ['sr', 'it', 'fr', 'en', 'nl', 'ko', 'de', 'fi', 'ja', 'no', 'sv', 'da']
categories = ['covid', 'noncovid','mental']

In [132]:
# Build one long-format table (time, category, pageview, language, year) covering every
# language and top-level category, then add one 0/1 indicator column per intervention event.
#
# Per-group frames are collected in lists and concatenated once: growing a DataFrame with
# pd.concat inside a loop is quadratic and, when seeded with an empty frame, depends on
# pandas' (deprecated) handling of empty entries when inferring result dtypes.
long_frames = []

for l in languages:
    for c in categories:
        pageviews = all_pageviews[l]['pageviews'][c]

        # Pages as rows, weeks as columns. The row index is named 'index' so that the merge
        # below can join it against the 'index' column of the category table. rename_axis
        # returns a new object, so this no longer relies on mutating a temporary `.T` frame.
        pageviews_by_page = pageviews.T.rename_axis('index')
        d1 = all_pageviews[l]['category'][c].merge(pageviews_by_page, left_on='index', right_on='index')

        # compute the mean for each category (one column per sub-category, one row per week)
        d2 = d1.groupby('category').mean().T
        d2 = d2.rename_axis(index='time', columns='')

        # mean over all pages of the top-level category `c`
        pages = pageviews.mean(axis=1).to_frame(c).rename_axis('time')

        d3 = pd.concat([pages, d2], axis=1).reset_index()

        # drop time after 9.30.2019 for year 2019: with weekly rows these are positions 39..51,
        # so the remaining 2019 rows cover the same part of the year as the 2020 rows
        d4 = d3.drop(range(39, 52))
        d4 = d4.set_index('time').stack().reset_index()
        d4.columns = ['time', 'category', 'pageview']
        d4['language'] = l
        d4['year'] = d4['time'].dt.year
        # put both years on a common 2020 calendar axis so they can be overlaid
        d4['time'] = d4['time'].apply(lambda x: x.replace(year=2020))
        long_frames.append(d4)

df = pd.concat(long_frames)

# Add the intervention indicators: 0 before the event date, 1 from the event date onward.
# Events that do not exist for a language end up as NaN in that language's rows.
flagged_frames = []
for l in languages:
    dftmp = df[df.language == l].copy()
    for e, event_date in interventions[l].items():
        dftmp[e] = (dftmp['time'] >= event_date).astype('int64')
    flagged_frames.append(dftmp)

final = pd.concat(flagged_frames)


In [170]:
# Reshape the per-event indicator columns into long format: one row per
# (time, category, language, year, event) with the 0/1 flag in `intervention_flag`.
df = final
weekly_data = final.melt(
    id_vars=["time", "category", "pageview", "language", "year"],
    var_name="event",
    value_name="intervention_flag",
)

# The regressions and plots below use log pageviews.
weekly_data["pageview"] = np.log(weekly_data["pageview"])
weekly_data.tail(1)

Unnamed: 0,time,category,pageview,language,year,event,intervention_flag
85175,2020-09-27,violence,3.109061,da,2020,Normalcy,1.0


In [191]:
# Half-width of the regression window: only rows with -80 < k < 80 days are fitted.
RDD_BANDWIDTH_DAYS = 80


def _add_bestfit(frame, bandwidth=RDD_BANDWIDTH_DAYS):
    """Fit ``pageview ~ k*intervention_flag`` inside the window and store the fitted values.

    Parameters
    ----------
    frame : DataFrame with columns ``pageview``, ``k`` (days relative to the event)
        and ``intervention_flag``. It is modified in place.
    bandwidth : rows with ``-bandwidth < k < bandwidth`` are used for the fit.

    Returns
    -------
    The same frame with a ``bestfit`` column; rows outside the window stay NaN.
    """
    in_window = (frame['k'] > -bandwidth) & (frame['k'] < bandwidth)
    model = smf.ols(formula="pageview ~ k*intervention_flag", data=frame[in_window])
    frame.loc[in_window, 'bestfit'] = model.fit().fittedvalues
    return frame


# One regression per (language, category, event, year). The pieces are collected in a list
# and concatenated once instead of re-copying `weekly` on every iteration.
fitted_frames = []
all_categories = weekly_data.category.unique()

for l in weekly_data.language.unique():
    language_rows = weekly_data[weekly_data['language'] == l]
    for c in all_categories:
        category_rows = language_rows[language_rows['category'] == c]
        for e, event_date in interventions[l].items():
            tmp = category_rows[category_rows['event'] == e].copy()
            # running variable: days elapsed since the event (negative before it)
            tmp['k'] = (tmp['time'] - pd.Timestamp(event_date)).dt.days
            # 2019 and 2020 are fitted separately, in that order
            for year in (2019, 2020):
                fitted_frames.append(_add_bestfit(tmp[tmp['year'] == year].copy()))

weekly = pd.concat(fitted_frames)


## 3. Time series plot

Here we will plot 4 different figures:
 - Figure of 'Covid' for Italy over all events: it_time.html
 - Figure of 'Mental' for Italy over all events: mental_it_time.html
 - Figure of all categories and event 'Mobility' for Italy: cate_time_rdd.html
 - Figure of 'Covid' and event 'Mobility' for all countries: coun_time_rdd.html

### 3.1 Figure of 'Covid' for Italy over all events: 

**it_time.html**



In [200]:
# Animated figure: one frame per intervention event, showing the fitted lines for
# 2019 and 2020 on both sides of the event plus the observed weekly log pageviews.
category = 'covid'
language = 'it'

# `weekly` is the fitted table built in section 2. The name `daily` used previously is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
df = weekly[(weekly['language'] == language) & (weekly['category'] == category)].copy()
df['year'] = df['year'].astype(str)

# Event dates indexed like the animation frames.
# NOTE(review): assumes the frames appear in the order of interventions[language], which is
# the order in which `weekly` was assembled -- confirm if that construction changes.
event_dates = list(interventions[language].values())

fig = px.line(df, x='time', y="bestfit", color='year', animation_frame='event', line_group='intervention_flag')
for year in ('2019', '2020'):
    year_rows = df[df['year'] == year]
    fig.add_scatter(x=year_rows['time'], y=year_rows["pageview"], mode='markers', name=year)

# Dashed marker for the event of the first row, i.e. the one shown in the initial frame.
fig.add_vline(x=interventions[language][df['event'].iloc[0]], line_dash='dash')

# Give every frame its own dashed event line and a date label.
for k in range(len(fig.frames)):
    event_date = event_dates[k]
    fig.frames[k]['layout'].update(
        annotations=[{'text': str(event_date.date()), 'x': event_date, 'y': 1, 'yref': 'paper'}],
        shapes=[{'line': {'dash': 'dash'},
                 'type': 'line',
                 'x0': event_date,
                 'x1': event_date,
                 'xref': 'x',
                 'y0': 0,
                 'y1': 1,
                 'yref': 'y domain'}],
    )

# Monthly ticks and a thin range slider under the plot.
fig.update_xaxes(dtick="M1", tickformat="%b")
fig.update_xaxes(rangeslider_visible=True, rangeslider_thickness=0.05)

# Play / stop buttons for the animation.
play_button = {'args': [None, {'frame': {'duration': 500, 'redraw': True},
                               'mode': 'immediate',
                               'fromcurrent': True,
                               'transition': {'duration': 150, 'easing': 'linear'}}],
               'label': '&#9654;',
               'method': 'animate'}
stop_button = {'args': [[None], {'frame': {'duration': 500, 'redraw': False},
                                 'mode': 'immediate',
                                 'fromcurrent': True,
                                 'transition': {'duration': 150, 'easing': 'linear'}}],
               'label': '&#9724;',
               'method': 'animate'}

fig.update_layout(xaxis_title='Time',
                  yaxis_title='Pageviews',
                  updatemenus=[{'buttons': [play_button, stop_button]}])

fig.write_html(path+'it_time.html')

fig.show()


### 3.2 Figure of 'Mental' for Italy over all events: 

**mental_it_time.html**

In [201]:
# Animated figure: one frame per intervention event, showing the fitted lines for
# 2019 and 2020 on both sides of the event plus the observed weekly log pageviews.
category = 'mental'
language = 'it'

# `weekly` is the fitted table built in section 2. The name `daily` used previously is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
df = weekly[(weekly['language'] == language) & (weekly['category'] == category)].copy()
df['year'] = df['year'].astype(str)

# Event dates indexed like the animation frames.
# NOTE(review): assumes the frames appear in the order of interventions[language], which is
# the order in which `weekly` was assembled -- confirm if that construction changes.
event_dates = list(interventions[language].values())

fig = px.line(df, x='time', y="bestfit", color='year', animation_frame='event', line_group='intervention_flag')
for year in ('2019', '2020'):
    year_rows = df[df['year'] == year]
    fig.add_scatter(x=year_rows['time'], y=year_rows["pageview"], mode='markers', name=year)

# Dashed marker for the event of the first row, i.e. the one shown in the initial frame.
fig.add_vline(x=interventions[language][df['event'].iloc[0]], line_dash='dash')

# Give every frame its own dashed event line and a date label.
for k in range(len(fig.frames)):
    event_date = event_dates[k]
    fig.frames[k]['layout'].update(
        annotations=[{'text': str(event_date.date()), 'x': event_date, 'y': 1, 'yref': 'paper'}],
        shapes=[{'line': {'dash': 'dash'},
                 'type': 'line',
                 'x0': event_date,
                 'x1': event_date,
                 'xref': 'x',
                 'y0': 0,
                 'y1': 1,
                 'yref': 'y domain'}],
    )

# Monthly ticks and a thin range slider under the plot.
fig.update_xaxes(dtick="M1", tickformat="%b")
fig.update_xaxes(rangeslider_visible=True, rangeslider_thickness=0.05)

# Play / stop buttons for the animation.
play_button = {'args': [None, {'frame': {'duration': 500, 'redraw': True},
                               'mode': 'immediate',
                               'fromcurrent': True,
                               'transition': {'duration': 150, 'easing': 'linear'}}],
               'label': '&#9654;',
               'method': 'animate'}
stop_button = {'args': [[None], {'frame': {'duration': 500, 'redraw': False},
                                 'mode': 'immediate',
                                 'fromcurrent': True,
                                 'transition': {'duration': 150, 'easing': 'linear'}}],
               'label': '&#9724;',
               'method': 'animate'}

fig.update_layout(xaxis_title='Time',
                  yaxis_title='Pageviews',
                  updatemenus=[{'buttons': [play_button, stop_button]}])

fig.write_html(path+'mental_it_time.html')

fig.show()


### 3.3  Figure of all categories and event 'Mobility' for Italy

**cate_time_rdd.html**

In [204]:
# Dropdown figure: fitted lines and observed points around the 'Mobility' event for Italy,
# with one dropdown entry per category.
plot = go.Figure()
event = 'Mobility'
language = 'it'
default_category = 'covid'  # category whose fitted lines are visible when the page loads

# update categories (top-level categories plus the sub-categories present in `final`)
categories = final.category.unique()

# (category, first trace index, one-past-last trace index) for every category, so the
# dropdown masks follow the real trace layout instead of assuming 6 traces x 13 categories.
trace_ranges = []

for category in categories:
    # `weekly` is the fitted table built in section 2; the name `daily` used previously is
    # not defined anywhere in this notebook.
    df = weekly[(weekly['language'] == language)
                & (weekly['category'] == category)
                & (weekly['event'] == event)].copy()
    df['year'] = df['year'].astype(str)

    fig = px.line(df, x='time', y="bestfit", color='year', line_group='intervention_flag')
    if category != default_category:
        # only the default category's fitted lines start visible
        fig.update_traces(visible=False)
    # observed points start hidden for every category; the dropdown turns them on
    for year in ('2019', '2020'):
        year_rows = df[df['year'] == year]
        fig.add_scatter(x=year_rows['time'], y=year_rows["pageview"], mode='markers', name=year, visible=False)

    first_trace = len(plot.data)
    plot.add_traces(fig.data)
    trace_ranges.append((category, first_trace, len(plot.data)))

n_traces = len(plot.data)

none_button = dict(label="None",
                   method="update",
                   args=[{"visible": [False] * n_traces},
                         {"title": "No Selection"}])
category_buttons = [
    dict(label=str(name).capitalize(),
         method='update',
         # show exactly the traces that belong to this category
         args=[{'visible': [start <= idx < stop for idx in range(n_traces)]},
               {'title': str(name).capitalize()}])
    for name, start, stop in trace_ranges
]

plot.update_layout(
    updatemenus=[
        dict(
            # button 0 is "None", so the default category's button is shifted by one
            active=1 + list(categories).index(default_category),
            buttons=[none_button] + category_buttons,
            showactive=True,
            x=1.08,
            xanchor='right',
            y=1.2,
            yanchor='top',
        )
    ])

# Dashed line and date label at the event date.
plot.add_vline(x=interventions[language][event], line_dash='dash')
plot.add_annotation(x=interventions[language][event], y=1, yref="paper", text=str(interventions[language][event].date()))

# Monthly ticks and a thin range slider under the plot.
plot.update_xaxes(dtick="M1", tickformat="%b")
plot.update_xaxes(rangeslider_visible=True, rangeslider_thickness=0.05)
plot.update_layout(xaxis_title='Time',
                   yaxis_title='Pageviews(Weekly)-Log',)

plot.write_html(path+'cate_time_rdd.html')
plot.show()



### 3.4  Figure of 'Covid' and event 'Mobility' for all countries

**coun_time_rdd.html**

In [208]:
# Dropdown figure: fitted lines and observed points for the 'covid' category around the
# 'Mobility' event, with one dropdown entry per language.
plot = go.Figure()
event = 'Mobility'
category = 'covid'
default_language = 'it'  # language whose fitted lines are visible when the page loads

# (language, first trace index, one-past-last trace index) for every language. The previous
# masks were copied from the category figure and had 78 entries for 12 languages.
trace_ranges = []

for language in languages:
    # `weekly` is the fitted table built in section 2; the name `daily` used previously is
    # not defined anywhere in this notebook.
    df = weekly[(weekly['language'] == language)
                & (weekly['category'] == category)
                & (weekly['event'] == event)].copy()
    df['year'] = df['year'].astype(str)

    fig = px.line(df, x='time', y="bestfit", color='year', line_group='intervention_flag')
    if language != default_language:
        # only the default language's fitted lines start visible
        fig.update_traces(visible=False)
    # observed points start hidden for every language; the dropdown turns them on
    for year in ('2019', '2020'):
        year_rows = df[df['year'] == year]
        fig.add_scatter(x=year_rows['time'], y=year_rows["pageview"], mode='markers', name=year, visible=False)

    first_trace = len(plot.data)
    plot.add_traces(fig.data)
    trace_ranges.append((language, first_trace, len(plot.data)))

n_traces = len(plot.data)

none_button = dict(label="None",
                   method="update",
                   args=[{"visible": [False] * n_traces},
                         {"title": "No Selection"}])
language_buttons = [
    dict(label=str(name).capitalize(),
         method='update',
         # show exactly the traces that belong to this language
         args=[{'visible': [start <= idx < stop for idx in range(n_traces)]},
               {'title': str(name).capitalize()}])
    for name, start, stop in trace_ranges
]

plot.update_layout(
    updatemenus=[
        dict(
            # button 0 is "None"; highlight the language that is actually displayed
            # (the previous hard-coded active=1 highlighted the first language instead)
            active=1 + list(languages).index(default_language),
            buttons=[none_button] + language_buttons,
            showactive=True,
            x=1.08,
            xanchor='right',
            y=1.2,
            yanchor='top',
        )
    ])

# Monthly ticks and a thin range slider under the plot.
plot.update_xaxes(dtick="M1", tickformat="%b")
plot.update_xaxes(rangeslider_visible=True, rangeslider_thickness=0.05)
plot.update_layout(xaxis_title='Time',
                   yaxis_title='Pageviews(Weekly)-Log',)

plot.write_html(path+'coun_time_rdd.html')
plot.show()

