# Milestone 3 research and data visualisation

### Table of Contents

* [Digital Propagation](#chapter1)
    * [Loading the data](#section_1_1)
    * [Overview of the data](#Section_1_2)
    * [Time series analysis](#Section_1_3)
    
        * [Check for stationarity](#section_1_3_1)
        * [Autocorrelation](#section_1_3_2)
        * [Decomposition](#section_1_3_3)
        
    * [Google mobility data](#Section_1_4) 
    
        * [Data processing](#section_1_4_1)
        * [Analysis per country](#section_1_4_2)
        
        
* [COVID-19 dataset](#chapter2)
    * [Downloading the data](#section_2_1)
    * [Overview of the data](#section_2_2)
    * [Time series analysis](#Section_2_3)
        * [Check for stationarity](#section_2_3_1)
        * [Autocorrelation](#section_2_3_2)
        * [Decomposition](#section_2_3_3)
        
        
* [Pearson Correlation](#chapter3)      


* [Trust dataset](#chapter4)  
    * [Visualizing Government trust](#section_4_1)
    * [Visualizing Trust in Journalists](#section_4_2)
    * [Visualizing Trust in Science](#section_4_3)
    * [Analysis](#section_4_4)


* [Clustering](#chapter5) 

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from scipy import stats
from helper import *
from scipy.stats.mstats import gmean
#Importation of all the packages
import datetime
import math
import json
import zipfile  
import ssl
from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.express as px

#To download data
import requests
import io
import gzip

#To create the mapchart
import iso3166
import plotly
from iso3166 import countries
import plotly.graph_objects as go

## Digital propagation <a class="anchor" id="chapter1"></a>
### Loading the data <a class="anchor" id="section_1_1"></a>

TODO: Describe what we are going to do in this section

In [58]:
# Read the raw page-view counts and the population table from disk.
pageview_df = pd.read_csv("page_views_covid_related.csv.gz")
population_df = pd.read_csv("Population_countries.csv")

# Date window passed to both helper calls below.
period_start, period_end = '2020-01-22', '2022-07-31'

# Cleaned, cumulative and per-100k-inhabitant frames for the page views ...
(
    df_pageviews,
    df_pageviews_cumul,
    df_pageviews100k,
    df_pageviews_cumul100k,
) = get_pageviews_df(
    pageview_df,
    population_df,
    get_country_dict('original'),
    period_start,
    period_end,
)

# ... and the same family of frames for COVID-19 deaths and cases.
(
    deaths,
    cases,
    deaths_cumul,
    cases_cumul,
    deaths100k,
    deaths100k_cumul,
    cases100k,
    cases100k_cumul,
) = get_cases_deaths_df(
    population_df,
    get_country_dict('original'),
    period_start,
    period_end,
)

In [59]:
# Country name -> language code lookup (as described by the helper's usage
# here), plus the reverse lookup used to relabel the data-frame columns.
o_country_dict = get_country_dict('original')
inv_o_country_dict = dict(zip(o_country_dict.values(), o_country_dict.keys()))
# Countries whose name in this project differs from the name used for the
# ISO 3166 lookup. South Korea is "Korea, Republic of" (alpha-3 KOR); the
# "Democratic People's Republic of" entry is North Korea (PRK) and would
# colour the wrong country on the maps.
other_country_name = {
    "Russia": "Russian Federation",
    "Turkey": "Türkiye",
    "Vietnam": "Viet Nam",
    "South Korea": "Korea, Republic of",
}

In [60]:
# Animated choropleth of COVID-19 deaths per 100k inhabitants.
#
# Step 1: build a long-format frame with one row per (country, date) holding
# the value to plot, the ISO alpha-3 code used to locate the country on the
# map, the date (animation frame) and the country name (hover label).

# Relabel the columns once, outside the loop (presumably language code ->
# country name; verify against get_country_dict).
deaths_by_country = deaths100k.rename(columns=inv_o_country_dict)
deaths_frames = []
for country in o_country_dict:
    df = pd.DataFrame(deaths_by_country[country])
    df = df.rename(columns={country: 'deaths'})
    # Some countries must be looked up under a different name in iso3166.
    iso_name = other_country_name.get(country, country)
    df['Country_code'] = countries.get(iso_name).alpha3
    df['date'] = df.index
    df['country'] = country

    # Keep every second row to reduce the number of animation frames.
    df = df.iloc[::2, :]
    deaths_frames.append(df)
# Concatenate once instead of re-copying the growing frame at each iteration.
deaths_mapchart = pd.concat(deaths_frames, axis=0)

# Step 2: draw the map. The colour range is capped at the 99th percentile of
# the plotted values.
fig = px.choropleth(
    deaths_mapchart,
    locations="Country_code",
    color=deaths_mapchart['deaths'],
    animation_frame='date',
    hover_name=deaths_mapchart['country'],  # column to add to hover information
    range_color=[0, np.percentile(deaths_mapchart['deaths'], 99)],
    color_continuous_scale='Reds',
    title=format_title('Number of death of COVID-19 per 100k inhabitants and per country','The colour of the country corresponds to how much death per 100k inhabitants happen at this date.'),
    width=700,
    height=700,
)

# Step 3: tune the animation timing on both buttons and on the slider.
animation_menu = fig.layout.updatemenus[0]
for button_idx in (0, 1):
    button_args = animation_menu.buttons[button_idx].args[1]
    button_args["frame"]["duration"] = 50
    button_args["transition"]["duration"] = 0
fig.layout.sliders[0].steps[0].args[1]["frame"]["duration"] = 0  # slider
animation_menu.buttons[0].args[1]["visible"] = False

fig.update_geos(
    center=dict(lon=80, lat=35),
    projection_type="mercator",
    lataxis_range=[-50, 80],
    lonaxis_range=[-10, 230],
)

fig.show()
# NOTE(review): absolute, machine-specific output path; it only works on the
# original author's computer. Consider a path relative to the repository.
fig.write_html("/Users/robindebalme/Desktop/ADAProj/ada-2022-project-thedatadiggers22/data/deaths_mapchart.html")

In [96]:
# Animated choropleth of COVID-19 related page views per 100k inhabitants.
#
# Step 1: build a long-format frame with one row per (country, date) holding
# the value to plot, the ISO alpha-3 code used to locate the country on the
# map, the date (animation frame) and the country name (hover label).

# Relabel the columns once, outside the loop (presumably language code ->
# country name; verify against get_country_dict).
pageviews_by_country = df_pageviews100k.rename(columns=inv_o_country_dict)
pageviews_frames = []
for country in o_country_dict:
    df = pd.DataFrame(pageviews_by_country[country])
    df = df.rename(columns={country: 'COVID-19 pageviews'})
    # Some countries must be looked up under a different name in iso3166.
    iso_name = other_country_name.get(country, country)
    df['Country_code'] = countries.get(iso_name).alpha3
    df['date'] = df.index
    df['country'] = country

    # Keep every second row to reduce the number of animation frames.
    df = df.iloc[::2, :]
    pageviews_frames.append(df)
# Concatenate once instead of re-copying the growing frame at each iteration.
pageviews_mapchart = pd.concat(pageviews_frames, axis=0)

# Step 2: draw the map. The colour range is capped at the 99th percentile of
# the plotted values.
fig = px.choropleth(
    pageviews_mapchart,
    locations="Country_code",
    color=pageviews_mapchart['COVID-19 pageviews'],
    animation_frame='date',
    hover_name=pageviews_mapchart['country'],  # column to add to hover information
    range_color=[0, np.percentile(pageviews_mapchart['COVID-19 pageviews'], 99)],
    color_continuous_scale='ylorbr',
)

fig.update_layout(
    title={
        'text': 'Number of pageviews per 100000 inhabitants and per country',
        'font': {'size': 18, 'color': 'black'},
        'x': 0.5,
        'y': 0.95,
    },
    transition={'duration': 10},
    width=700,
    height=700,
)

# Step 3: tune the animation timing on both buttons and on the slider.
animation_menu = fig.layout.updatemenus[0]
for button_idx in (0, 1):
    button_args = animation_menu.buttons[button_idx].args[1]
    button_args["frame"]["duration"] = 50
    button_args["transition"]["duration"] = 0
fig.layout.sliders[0].steps[0].args[1]["frame"]["duration"] = 0  # slider
animation_menu.buttons[0].args[1]["visible"] = False

fig.update_geos(
    center=dict(lon=80, lat=35),
    projection_type="mercator",
    lataxis_range=[-50, 80],
    lonaxis_range=[-10, 230],
)

fig.show()
# NOTE(review): absolute, machine-specific output path; it only works on the
# original author's computer. Consider a path relative to the repository.
fig.write_html("/Users/robindebalme/Desktop/ADAProj/ada-2022-project-thedatadiggers22/data/pageviews_mapchart.html")

In [None]:
# Animated choropleth of COVID-19 cases per 100k inhabitants.
#
# This cell builds and plots its own `cases_mapchart` frame. It must not
# append to `pageviews_mapchart` (that would mix cases with page views) and
# it writes to its own HTML file so the page-views map is not overwritten.
#
# Step 1: build a long-format frame with one row per (country, date) holding
# the value to plot, the ISO alpha-3 code used to locate the country on the
# map, the date (animation frame) and the country name (hover label).

# Relabel the columns once, outside the loop (presumably language code ->
# country name; verify against get_country_dict).
cases_by_country = cases100k.rename(columns=inv_o_country_dict)
cases_frames = []
for country in o_country_dict:
    df = pd.DataFrame(cases_by_country[country])
    df = df.rename(columns={country: 'COVID-19 cases'})
    # Some countries must be looked up under a different name in iso3166.
    iso_name = other_country_name.get(country, country)
    df['Country_code'] = countries.get(iso_name).alpha3
    df['date'] = df.index
    df['country'] = country

    # Keep every second row to reduce the number of animation frames.
    df = df.iloc[::2, :]
    cases_frames.append(df)
# Concatenate once instead of re-copying the growing frame at each iteration.
cases_mapchart = pd.concat(cases_frames, axis=0)

# Step 2: draw the map. The colour range is capped at the 99th percentile of
# the plotted values.
fig = px.choropleth(
    cases_mapchart,
    locations="Country_code",
    color=cases_mapchart['COVID-19 cases'],
    animation_frame='date',
    hover_name=cases_mapchart['country'],  # column to add to hover information
    range_color=[0, np.percentile(cases_mapchart['COVID-19 cases'], 99)],
    color_continuous_scale='ylorbr',
)

fig.update_layout(
    title={
        'text': 'Number of COVID-19 cases per 100000 inhabitants and per country',
        'font': {'size': 18, 'color': 'black'},
        'x': 0.5,
        'y': 0.95,
    },
    transition={'duration': 10},
    width=700,
    height=700,
)

# Step 3: tune the animation timing on both buttons and on the slider.
animation_menu = fig.layout.updatemenus[0]
for button_idx in (0, 1):
    button_args = animation_menu.buttons[button_idx].args[1]
    button_args["frame"]["duration"] = 50
    button_args["transition"]["duration"] = 0
fig.layout.sliders[0].steps[0].args[1]["frame"]["duration"] = 0  # slider
animation_menu.buttons[0].args[1]["visible"] = False

fig.update_geos(
    center=dict(lon=80, lat=35),
    projection_type="mercator",
    lataxis_range=[-50, 80],
    lonaxis_range=[-10, 230],
)

fig.show()
# NOTE(review): absolute, machine-specific output path; it only works on the
# original author's computer. Consider a path relative to the repository.
fig.write_html("/Users/robindebalme/Desktop/ADAProj/ada-2022-project-thedatadiggers22/data/cases_mapchart.html")