In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

# Use the matplotlib style sheet named after the FiveThirtyEight website
plt.style.use('fivethirtyeight') 
mpl.rcParams['lines.linewidth'] = 2

# Predefined figure size; not applied in this cell
# (presumably passed as `figsize=` by later plotting code -- TODO confirm)
figsize=(12,9)

# Silence every Python warning from this point on
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from scipy.stats import pearsonr

%config InlineBackend.figure_format = 'retina' #set 'png' here when working on notebook
%matplotlib inline

In [2]:
# Region labels used as keys throughout the analysis
# (the two 'P.A.' entries are the autonomous provinces of Bolzano and Trento)
region_names = (
    'Abruzzo',
    'Basilicata',
    'Calabria',
    'Campania',
    'Emilia-Romagna',
    'Friuli Venezia Giulia',
    'Lazio',
    'Liguria',
    'Lombardia',
    'Marche',
    'Molise',
    'P.A. Bolzano',
    'P.A. Trento',
    'Piemonte',
    'Puglia',
    'Sardegna',
    'Sicilia',
    'Toscana',
    'Umbria',
    "Valle d'Aosta",
    'Veneto',
)

# Columns kept from each daily CSV ('data' is the date column)
column_names = [
    'data',
    'lat',
    'long',
    'ricoverati_con_sintomi',
    'terapia_intensiva',
    'totale_ospedalizzati',
    'totale_positivi',
    'variazione_totale_positivi',
    'nuovi_positivi',
    'dimessi_guariti',
    'deceduti',
    'tamponi',
]

# Folder holding the per-region daily CSV files
# NOTE: this name shadows the built-in `dir` for the rest of the notebook
dir = 'drive/MyDrive/homework3/COVID-19/dati-regioni'

In [3]:
class COVID19_parser():
  """Build one time-indexed DataFrame per region from a folder of daily CSV files.

  The folder is expected to contain files named
  ``dpc-covid19-ita-regioni-YYYYMMDD.csv``; each file must provide a ``data``
  column (date string), a ``denominazione_regione`` column (region label) and
  every column listed in ``column_names``.

  Attributes:
    dir: folder that is scanned for the daily CSV files.
    region_names, column_names: stored exactly as passed to the constructor.
    region_dict: maps each region name to its DataFrame (empty until parsed).
    csv_names, dates: sorted daily filenames and the YYYYMMDD strings they
      encode; both are None until `get_names` runs.
    min_date: first day to load, in YYYYMMDD form.
    vax: whether `parse` also merges vaccination totals.
  """

  # Daily files are named <prefix><YYYYMMDD><suffix>
  _FILE_PREFIX = 'dpc-covid19-ita-regioni-'
  _FILE_SUFFIX = '.csv'

  def __init__(self, directory, region_names, column_names, min_date = '20200224', vax=True):
    """Store the configuration and create an empty table for every region.

    Args:
      directory: folder containing the daily CSV files.
      region_names: iterable of region labels, as found in `denominazione_regione`.
      column_names: columns to keep from each daily file.
      min_date: first day to load, YYYYMMDD (a str, or anything `str()` turns into one).
      vax: if True, `parse` also calls `add_vaccines`.
    """
    self.dir = directory
    self.region_names, self.column_names = region_names, column_names
    # A separate empty frame per region, so no object is shared between regions
    self.region_dict = { name : pd.DataFrame(columns=column_names) for name in region_names }
    self.csv_names, self.dates, self.min_date = None, None, min_date
    self.vax = vax

  @classmethod
  def _date_from_filename(cls, name):
    """Return the YYYYMMDD part of a daily CSV filename, or None for any other file."""
    if not (name.startswith(cls._FILE_PREFIX) and name.endswith(cls._FILE_SUFFIX)):
      return None
    stamp = name[len(cls._FILE_PREFIX):-len(cls._FILE_SUFFIX)]
    return stamp if len(stamp) == 8 and stamp.isdigit() else None

  def get_names(self):
    """Fill `self.csv_names` and `self.dates` from the daily files found in `self.dir`.

    Only files matching ``dpc-covid19-ita-regioni-<8 digits>.csv`` are kept, so
    other files in the folder (for instance a ``-latest.csv`` copy or the
    undated ``dpc-covid19-ita-regioni.csv``) are skipped by name rather than by
    their position in the sorted listing.

    Raises:
      FileNotFoundError: if `self.dir` is not an existing directory.
    """
    if not os.path.isdir(self.dir):
      raise FileNotFoundError(f"Data directory not found: {self.dir!r}")

    # Files directly inside the directory (sub-folders are not descended into)
    filenames = next(os.walk(self.dir), (self.dir, [], []))[2]

    self.csv_names = sorted(name for name in filenames if self._date_from_filename(name) is not None)
    self.dates = [self._date_from_filename(name) for name in self.csv_names]

    return

  def _rows_for_day(self, df):
    """Split one daily table into {region name: rows of that region}, with 'data' as a Timestamp."""
    # The first 10 characters of the date string are the calendar day (YYYY-MM-DD)
    day = pd.to_datetime(df['data'].iloc[0][:10])
    wanted = list(self.column_names)
    return {
      name: df.loc[df['denominazione_regione'] == name, wanted].assign(data=day)
      for name in self.region_dict
    }

  @staticmethod
  def _concat_rows(frames):
    """Stack frames row-wise, ignoring empty ones; return the first frame if all are empty."""
    filled = [frame for frame in frames if not frame.empty]
    if not filled:
      return frames[0]
    return pd.concat(filled, ignore_index=True, axis = 0)

  def parse_day(self, df):
    """Append the rows of one daily table to every region's DataFrame.

    Args:
      df: one day's data with a 'data' date-string column, a
        'denominazione_regione' column and all of `self.column_names`.
    """
    for key, new_line in self._rows_for_day(df).items():
      self.region_dict[key] = self._concat_rows([self.region_dict[key], new_line])

    return

  def parse_directory(self):
    """Load every daily file from `min_date` onwards into `self.region_dict`.

    Afterwards each region's DataFrame is indexed by 'data' and its columns
    are converted with `pd.to_numeric`. `get_names` is called first if it has
    not been run yet.

    Raises:
      ValueError: if there is no daily file for `min_date`.
    """
    if self.csv_names is None:
      self.get_names()

    # Check min date
    if not isinstance(self.min_date, str):
      self.min_date = str(self.min_date)

    # Compose filename of min date
    date_name = self._FILE_PREFIX + self.min_date + self._FILE_SUFFIX
    try:
      index = self.csv_names.index(date_name)
    except ValueError:
      raise ValueError(f"No daily file {date_name!r} in {self.dir!r}") from None

    # Gather each region's daily rows and concatenate once at the end,
    # instead of re-copying the growing table on every day
    collected = { key : [frame] for key, frame in self.region_dict.items() }
    for name in tqdm(self.csv_names[index:]):
      df = pd.read_csv(os.path.join(self.dir, name))
      for key, new_line in self._rows_for_day(df).items():
        collected[key].append(new_line)

    for key, frames in collected.items():
      table = self._concat_rows(frames).set_index('data')
      self.region_dict[key] = table.apply(pd.to_numeric)

    return

  def add_vaccines(self):
    """Download the vaccination summary and add it to every region as column 'vaccini'.

    The CSV is read from the italia/covid19-opendata-vaccini GitHub repository
    and must provide the columns 'reg', 'data' (YYYY-MM-DD) and 'totale'. Rows
    before `min_date` are discarded; the rest is outer-joined on the date index.

    Raises:
      KeyError: if a region of `self.region_dict` has no rows in the vaccine data.
    """
    url = "https://raw.githubusercontent.com/italia/covid19-opendata-vaccini/master/dati/somministrazioni-vaccini-summary-latest.csv"
    vaccines = pd.read_csv(url)

    # Region labels of the vaccine file that differ from the ones used in region_dict
    dict_names = {'Valle d\'Aosta / Vallée d\'Aoste':'Valle d\'Aosta','Provincia Autonoma Bolzano / Bozen':'P.A. Bolzano','Provincia Autonoma Trento':'P.A. Trento','Friuli-Venezia Giulia':'Friuli Venezia Giulia'}
    # Whole-column relabelling; element-wise chained assignment is not guaranteed to write through
    vaccines['reg'] = vaccines['reg'].replace(dict_names)

    # Timestamps (not datetime.date) so the dates align with the Timestamp index made by parse_day
    vaccines['data'] = pd.to_datetime(vaccines['data'], format='%Y-%m-%d')
    first_day = pd.to_datetime(str(self.min_date), format='%Y%m%d')

    vaccines_per_region = vaccines.groupby(by = 'reg')

    for key in self.region_dict.keys():
      reg = vaccines_per_region.get_group(key)[['data','totale']]
      reg = reg[reg['data'] >= first_day].sort_values(by = 'data').set_index('data')

      merged = pd.concat([self.region_dict[key], reg], axis = 1, join = 'outer')
      self.region_dict[key] = merged.rename(columns = {'totale':'vaccini'})

  def display_corr(self, region1, region2, relevant_cols, to_diff = [] ):
    """Plot the columns of `region1` (x axes) against those of `region2` (y axes).

    Every panel shows a scatter plot with a LOWESS fit and is annotated with
    the Pearson correlation of the two series.

    Args:
      region1, region2: region names present in `self.region_dict`.
      relevant_cols: columns to compare.
      to_diff: subset of `relevant_cols` to replace by day-to-day differences
        (the default list is never modified).
    """
    df1 = self.create_dataframe(region1, relevant_cols, to_diff)
    df2 = self.create_dataframe(region2, relevant_cols, to_diff)
    data = pd.concat([df1,df2], axis = 1)

    def corrfunc(x,y, ax=None, **kws):
      """Plot the correlation coefficient in the top left hand corner of a plot."""
      xs, ys = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
      # Use only days where both series have a value, as the fitted line does
      valid = ~(np.isnan(xs) | np.isnan(ys))
      r = pearsonr(xs[valid], ys[valid])[0] if valid.sum() >= 2 else float('nan')
      ax = ax or plt.gca()
      # Unicode for lowercase rho (ρ)
      rho = '\u03C1'
      ax.annotate(f'{rho} = {r:.2f}', xy=(.1, .9), xycoords=ax.transAxes, fontsize = 30)

    g = sns.PairGrid(data, aspect=2.4, diag_sharey=False, x_vars = df1.columns, y_vars = df2.columns)
    g.map(corrfunc)
    g.map(sns.regplot, lowess=True, ci=False, line_kws={'color': 'Black','linewidth':1})
    plt.show()

  def create_dataframe(self, region1, relevant_cols, to_diff, drop_dates = ('2021-10-02',)):
    """Return a copy of one region's selected columns, prepared for comparison.

    Args:
      region1: region name; must be a key of `self.region_dict`.
      relevant_cols: list of columns to keep.
      to_diff: columns to replace by their day-to-day difference; they are
        renamed with a 'Variazione_' prefix. May be empty or None.
      drop_dates: days removed from the result; days missing from the index
        are ignored. The default keeps the date that used to be hard-coded
        (the reason for excluding it is not visible here -- TODO confirm).

    Returns:
      A new DataFrame without all-NaN rows, whose column names end with
      '_<region>' ('_ER' for Emilia-Romagna).

    Raises:
      ValueError: if `region1` is not a known region.
    """
    if region1 not in self.region_dict.keys():
      raise ValueError("Nome regione errato")

    to_diff = list(to_diff or [])

    df1 = self.region_dict[region1][relevant_cols].copy()
    if to_diff:
      df1[to_diff] = df1[to_diff].diff()
    df1 = df1.rename(columns = lambda x: "Variazione_"+ x if x in to_diff else x)

    # Timestamp labels match the date index directly; absent days are skipped
    excluded = [pd.Timestamp(day) for day in drop_dates]
    df1 = df1.drop(labels = excluded, errors = 'ignore')
    df1 = df1.dropna(axis = 0, how = 'all')

    suffix = 'ER' if region1 == 'Emilia-Romagna' else region1
    return df1.rename(columns = lambda x: x + "_" + suffix)

  def parse(self):
    """Run the whole pipeline: list files, load them, then optionally add vaccines."""
    self.get_names()
    self.parse_directory()
    if self.vax:
      self.add_vaccines()
    return