In [32]:
'''libraries list with help showing the version of the libraries being used in this nodebook'''
libraries = []
import random
'''Datasets, arrays and json files'''

import pandas as pd
import numpy as np
import pickle

libraries.append('pandas')
libraries.append('numpy')
libraries.append('pickle')

'''Following progress'''
from tqdm.notebook import tqdm
libraries.append('tqdm')

'''Utilities'''
import collections
from pathlib import Path
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os
#import powerlaw 
# Names registered here are matched against the installed distributions when the
# version report is printed.
libraries.append('collections')
libraries.append('pathlib')
libraries.append('datetime')
libraries.append('os')
# NOTE(review): 'powerlaw' is still registered for the version report although its
# import above is commented out -- confirm whether this notebook actually uses it.
libraries.append('powerlaw')

'''Plots and figures'''
%matplotlib notebook

import matplotlib.pyplot as plt
import plotly.express as px
libraries.append('matplotlib')
libraries.append('plotly')

plt.rcParams["figure.facecolor"] = "w"

'''Pthon and library version'''
import types
import pkg_resources
import sys
from platform import python_version

In [2]:
# Report the interpreter version and the version of every installed
# distribution whose (lower-cased) name was registered in `libraries`.
print('Version of python installed: {}'.format(sys.version))
print('Version of python being used: {}'.format(python_version()))
print('\nNon-built in libraries being used:')

registered_distributions = (
    dist for dist in pkg_resources.working_set
    if dist.project_name.lower() in libraries
)
for dist in registered_distributions:
    print('{}, version {}'.format(dist.project_name, dist.version))

Version of python installed: 3.8.10 (default, May 19 2021, 11:01:55) 
[Clang 10.0.0 ]
Version of python being used: 3.8.10

Non-built in libraries being used:
tqdm, version 4.62.3
powerlaw, version 1.4.6
plotly, version 5.1.0
pandas, version 1.3.3
numpy, version 1.19.2
matplotlib, version 3.4.3


In [3]:
def to_check_duplicates(df, col):
    '''Return every row of `df` whose value in column `col` occurs more than once.

    All occurrences are returned (the first one included), in their original
    order. The result is empty when `col` holds no repeated value.'''
    column_values = df[col]
    # Values seen at least twice (duplicated() marks the 2nd, 3rd, ... occurrence)
    repeated_values = column_values[column_values.duplicated()]
    is_repeated = column_values.isin(repeated_values)
    return df[is_repeated]

def print_file_names():
    '''Print the current working directory and the names of the entries it contains.'''
    current_dir = os.getcwd()
    entries = os.listdir(current_dir)  # names only, in the order the OS returns them
    print("Files in %r: %s" % (current_dir, entries))

In [6]:
'''Filepath and folders that are going to be used'''
# Root directory of the downloaded raw data, relative to the notebook. It already
# ends with '/', so paths built as filepath+'/...' contain a double slash.
filepath = '../../data/covid19/downloaded_data/'
# Presumably one sub-folder of `filepath` per region.
# NOTE(review): the Brazil entry is spelled 'Brasil/' here, while the Brazil file is read
# from filepath+'/Brazil/...' further down -- confirm which spelling exists on disk.
folder = ['Brasil/', 'Canada/','Chile/','NSW/','USA/']

In [7]:
saving_path = '../_generated_files/'
!ls {saving_path}

AllScales24Oct.pkl            results_covid19_brazil.pickle
Brazil24Oct.csv               results_covid19_chile.pickle
Chile_26Oct.csv               x_covid19_NSW.pickle
Chile_wdates23Sept.csv        x_covid19_USA.pickle
NSW_SUA_SA2_LGA24Oct.pkl      x_covid19_brazil.pickle
USA14Oct.csv                  x_covid19_chile.pickle
USA14Octstate.csv             y_covid19_NSW.pickle
dataframe_empty_USA.pikle     y_covid19_USA.pickle
postcodeCases24Oct.csv        y_covid19_brazil.pickle
postcodeCaseswDates24Oct.csv  y_covid19_chile.pickle
results_covid19_NSW.pickle


# Brazil

In [11]:
brazil_dates = pd.read_csv(filepath+'/Brazil/caso.csv.gz', compression='gzip')

Filter cities (don't use states).

In [12]:
np.unique(brazil_dates.place_type.values)

array(['city', 'state'], dtype=object)

In [13]:
brazil_dates = brazil_dates[brazil_dates['place_type'] != 'state']
brazil_dates

Unnamed: 0,date,state,city,place_type,confirmed,deaths,order_for_place,is_last,estimated_population_2019,estimated_population,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
588,2021-10-30,AP,Amapá,city,1417,13,542,True,9109.0,9187.0,1600105.0,15423.96865,0.0092
589,2021-10-29,AP,Amapá,city,1415,13,541,False,9109.0,9187.0,1600105.0,15402.19876,0.0092
590,2021-10-28,AP,Amapá,city,1413,13,540,False,9109.0,9187.0,1600105.0,15380.42887,0.0092
591,2021-10-27,AP,Amapá,city,1413,13,539,False,9109.0,9187.0,1600105.0,15380.42887,0.0092
592,2021-10-26,AP,Amapá,city,1413,13,538,False,9109.0,9187.0,1600105.0,15380.42887,0.0092
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2621513,2020-06-23,SP,Óleo,city,1,0,5,False,2496.0,2471.0,3533809.0,40.46945,0.0000
2621514,2020-06-22,SP,Óleo,city,1,0,4,False,2496.0,2471.0,3533809.0,40.46945,0.0000
2621515,2020-06-21,SP,Óleo,city,1,0,3,False,2496.0,2471.0,3533809.0,40.46945,0.0000
2621516,2020-06-20,SP,Óleo,city,1,0,2,False,2496.0,2471.0,3533809.0,40.46945,0.0000


Select the information that will be useful in the analysis: city, cases and population.

In [14]:
brazil_dates = brazil_dates.loc[:,['date','state','city_ibge_code','city', 'confirmed', 'estimated_population_2019']]
brazil_dates['city'] = brazil_dates['city']+'('+brazil_dates['state']+')'
brazil_dates.head()

Unnamed: 0,date,state,city_ibge_code,city,confirmed,estimated_population_2019
588,2021-10-30,AP,1600105.0,Amapá(AP),1417,9109.0
589,2021-10-29,AP,1600105.0,Amapá(AP),1415,9109.0
590,2021-10-28,AP,1600105.0,Amapá(AP),1413,9109.0
591,2021-10-27,AP,1600105.0,Amapá(AP),1413,9109.0
592,2021-10-26,AP,1600105.0,Amapá(AP),1413,9109.0


In [15]:
brazil_dates['city_ibge_code'].isnull().values.any() #Null values

True

In [16]:
brazil_dates[brazil_dates['city_ibge_code'].isnull()] #no information about city

Unnamed: 0,date,state,city_ibge_code,city,confirmed,estimated_population_2019
57993,2021-10-29,DF,,Importados/Indefinidos(DF),59486,
57994,2021-10-28,DF,,Importados/Indefinidos(DF),59469,
57995,2021-10-27,DF,,Importados/Indefinidos(DF),59452,
57996,2021-10-26,DF,,Importados/Indefinidos(DF),59444,
57997,2021-10-25,DF,,Importados/Indefinidos(DF),59417,
...,...,...,...,...,...,...
2419960,2020-03-23,SP,,Importados/Indefinidos(SP),0,
2419961,2020-03-22,SP,,Importados/Indefinidos(SP),0,
2419962,2020-03-21,SP,,Importados/Indefinidos(SP),0,
2419963,2020-03-20,SP,,Importados/Indefinidos(SP),8,


Drop the rows without city information.

In [17]:
brazil_dates = brazil_dates[brazil_dates['city_ibge_code'].notnull()]

In [18]:
brazil_dates.confirmed.sum() #Sum over every daily row: 4,614,712,201.
#'confirmed' holds accumulated numbers, so this sum counts the same cases once per
#reported day and is not the number of cases.
#According to worldometer (2Nov) Brazil COVID: 21,814,693 Cases 

4614712201

In [19]:
brazil_dates[brazil_dates['city_ibge_code'] == 1100015.]

Unnamed: 0,date,state,city_ibge_code,city,confirmed,estimated_population_2019
1783706,2021-09-30,RO,1100015.0,Alta Floresta D'Oeste(RO),4089,22945.0
1783707,2021-09-29,RO,1100015.0,Alta Floresta D'Oeste(RO),4085,22945.0
1783708,2021-09-24,RO,1100015.0,Alta Floresta D'Oeste(RO),4079,22945.0
1783709,2021-09-18,RO,1100015.0,Alta Floresta D'Oeste(RO),4075,22945.0
1783710,2021-09-05,RO,1100015.0,Alta Floresta D'Oeste(RO),4065,22945.0
...,...,...,...,...,...,...
1784156,2020-05-06,RO,1100015.0,Alta Floresta D'Oeste(RO),1,22945.0
1784157,2020-05-05,RO,1100015.0,Alta Floresta D'Oeste(RO),1,22945.0
1784158,2020-05-04,RO,1100015.0,Alta Floresta D'Oeste(RO),1,22945.0
1784159,2020-05-03,RO,1100015.0,Alta Floresta D'Oeste(RO),1,22945.0


In [20]:
#Deal with datatypes
#The builtin int/str are used instead of np.int/np.str: those aliases were
#deprecated in NumPy 1.20 and removed in NumPy 1.24.
brazil_dates['estimated_population_2019'] = brazil_dates['estimated_population_2019'].astype(int)
#The code is loaded as a float (e.g. 1600105.0); cast through int first so that the
#resulting string has no trailing '.0'.
brazil_dates['city_ibge_code'] = brazil_dates['city_ibge_code'].astype(int).astype(str)

In [21]:
brazil_dates.date.min()

'2020-02-25'

In [22]:
brazil_dates.date.max()

'2021-10-31'

To analyse the data we need, for each city, the latest record (accumulated cases) available up to a given date.

In [23]:
brazil_dates['date']= pd.to_datetime(brazil_dates['date'])

In [24]:
print(type(brazil_dates['date']))

<class 'pandas.core.series.Series'>


In [25]:
brazil_dates.head()

Unnamed: 0,date,state,city_ibge_code,city,confirmed,estimated_population_2019
588,2021-10-30,AP,1600105,Amapá(AP),1417,9109
589,2021-10-29,AP,1600105,Amapá(AP),1415,9109
590,2021-10-28,AP,1600105,Amapá(AP),1413,9109
591,2021-10-27,AP,1600105,Amapá(AP),1413,9109
592,2021-10-26,AP,1600105,Amapá(AP),1413,9109


Build a new dataset with unique city_ibge_code, city, and population. 
For each city, add the latest record on or before the cut date. If the city has no record in the sliced dataset, its confirmed cases up to that date are set to 0.

In [26]:
new_brazil = brazil_dates[['city_ibge_code','city','estimated_population_2019']].drop_duplicates().reset_index(drop=True)
new_brazil.to_pickle(saving_path+"/dataframe_empty_brazil.pikle")
new_brazil.head()

Unnamed: 0,city_ibge_code,city,estimated_population_2019
0,1600105,Amapá(AP),9109
1,1600204,Calçoene(AP),11117
2,1600212,Cutias(AP),5983
3,1600238,Ferreira Gomes(AP),7780
4,1600253,Itaubal(AP),5503


In [173]:
#To check duplicates
len(to_check_duplicates(new_brazil, 'city_ibge_code')) #None
to_check_duplicates(new_brazil, 'city').sort_values(by='city') 

Unnamed: 0,city_ibge_code,city,estimated_population_2019


In [27]:
brazil_dates.date.max()

Timestamp('2021-10-31 00:00:00')

In [28]:
brazil_dates.date.min()

Timestamp('2020-02-25 00:00:00')

In [42]:
def cut_by_num_days(df, start_delay, end_delay, cities=None):
    '''Accumulated cases per city at the end of a window of days.

    The window opens `start_delay` days after the earliest date in `df` and
    closes `end_delay` days later (clipped to the latest date in `df`). For each
    city the most recent record inside the window is kept; a city without any
    record in the window is reported with 0 cases.

    Parameters
    ----------
    df : DataFrame
        Daily records with at least the columns 'date' (datetime-like),
        'city_ibge_code' and 'confirmed'.
    start_delay : int
        Days between the first date in `df` and the start of the window.
    end_delay : int
        Length of the window in days.
    cities : DataFrame, optional
        One row per city with the columns 'city_ibge_code', 'city' and
        'estimated_population_2019'. When omitted, it is read from
        saving_path+"/dataframe_empty_brazil.pikle".

    Returns
    -------
    DataFrame
        One row per row of `cities`, with the columns 'city', 'population'
        and 'cases' (integer).
    '''
    start_date = df['date'].min() + pd.Timedelta(days=start_delay)
    end_date = start_date + pd.Timedelta(days=end_delay)
    # Never look past the last date available in the data
    last_date = df['date'].max()
    if end_date > last_date:
        end_date = last_date

    # Boolean indexing returns a new frame, so the input does not need to be copied first
    in_window = df.loc[(df['date'] >= start_date) & (df['date'] <= end_date)]
    # Latest record of each city inside the window; only the key and the case count
    # are kept, so the merge below produces no suffixed duplicate columns
    latest = in_window.sort_values('date').drop_duplicates('city_ibge_code', keep='last')
    latest = latest[['city_ibge_code', 'confirmed']].reset_index(drop=True)

    if cities is None:
        cities = pd.read_pickle(saving_path+"/dataframe_empty_brazil.pikle")
    cities = cities[['city_ibge_code', 'city', 'estimated_population_2019']]

    merged = cities.merge(latest, how='left', on='city_ibge_code')
    #Fill NaN values with zeros (not in the sliced dataframe means no cases).
    #Builtin int replaces np.int, which was removed in NumPy 1.24.
    merged['confirmed'] = merged['confirmed'].fillna(0).astype(int)

    #Rename to simpler names and keep the relevant columns
    merged = merged.rename(columns={'estimated_population_2019': 'population',
                                    'confirmed': 'cases'})
    return merged[['city', 'population', 'cases']]

In [43]:
df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=30)
print(df.population.sum())
print(df.cases.sum())
df.shape

210147125
2891


(5570, 3)

In [44]:
df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=100)
print(df.population.sum())
print(df.cases.sum())
df.shape

210147125
616223


(5570, 3)

In [45]:
df.head()

Unnamed: 0,city,population,cases
0,Amapá(AP),9109,119
1,Calçoene(AP),11117,142
2,Cutias(AP),5983,159
3,Ferreira Gomes(AP),7780,123
4,Itaubal(AP),5503,77


In [46]:
df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=700)
df.cases.sum()  #21.495.588 cases in total. close to the actual value.

21495588

In [47]:
df.sort_values(by=['population'])

Unnamed: 0,city,population,cases
2835,Serra da Saudade(MG),781,57
5001,Borá(SP),837,127
1433,Araguainha(MT),935,29
4355,Engenho Velho(RS),1034,182
4872,Oliveira de Fátima(TO),1112,147
...,...,...,...
261,Fortaleza(CE),2669342,256441
1044,Salvador(BA),2872347,237137
100,Brasília(DF),3015268,455247
3037,Rio de Janeiro(RJ),6718903,484433


Select some interesting points in time to analyse how well the models fit.

In [52]:
#Cut-off days (counted from the first date in the data) whose snapshots are exported
list_delays = [25, 30, 35, 38]
#to_csv does not create missing folders
os.makedirs(saving_path+'Brazil_by_dates/', exist_ok=True)
for end_delay in list_delays:
    df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=end_delay)
    file_name = saving_path+'Brazil_by_dates/Brazil_'+str(end_delay)
    df.to_csv(file_name+'.csv' , header=['cities', 'population', 'cases'],index = False)
    #NOTE(review): there is no '.' before 'txt', so the file is written as e.g. 'Brazil_25txt'.
    #The name is kept unchanged in case other code reads it; confirm whether '.txt' was intended.
    txt_file = file_name+'txt'
    #Same table with a '#city' header. Written in the default write mode (not mode='a'),
    #so re-running the cell replaces the file instead of appending a second copy of the table.
    df.to_csv(txt_file, header=["#city", "population", "cases"], index=False, sep=',')

In [54]:
#Every 10th day from 45 up to 295
n = list(range(45, 300, 10))
#to_csv does not create missing folders
os.makedirs(saving_path+'Brazil_by_dates/', exist_ok=True)
for end_delay in n:
    df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=end_delay)
    file_name = saving_path+'Brazil_by_dates/Brazil_'+str(end_delay)
    df.to_csv(file_name+'.csv' , header=['cities', 'population', 'cases'],index = False)
    #NOTE(review): there is no '.' before 'txt', so the file is written as e.g. 'Brazil_45txt'.
    #The name is kept unchanged in case other code reads it; confirm whether '.txt' was intended.
    txt_file = file_name+'txt'
    #Same table with a '#city' header. Written in the default write mode (not mode='a'),
    #so re-running the cell replaces the file instead of appending a second copy of the table.
    df.to_csv(txt_file, header=["#city", "population", "cases"], index=False, sep=',')

In [59]:
#Register the delays exported above. Values already present are skipped, so running
#this cell more than once does not duplicate entries in list_delays.
list_delays += [d for d in n if d not in list_delays]

In [53]:
#Every 10th day from 300 up to 490
n = list(range(300, 500, 10))
#to_csv does not create missing folders
os.makedirs(saving_path+'Brazil_by_dates/', exist_ok=True)
for end_delay in n:
    df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=end_delay)
    file_name = saving_path+'Brazil_by_dates/Brazil_'+str(end_delay)
    df.to_csv(file_name+'.csv' , header=['cities', 'population', 'cases'],index = False)
    #NOTE(review): there is no '.' before 'txt', so the file is written as e.g. 'Brazil_300txt'.
    #The name is kept unchanged in case other code reads it; confirm whether '.txt' was intended.
    txt_file = file_name+'txt'
    #Same table with a '#city' header. Written in the default write mode (not mode='a'),
    #so re-running the cell replaces the file instead of appending a second copy of the table.
    df.to_csv(txt_file, header=["#city", "population", "cases"], index=False, sep=',')

In [61]:
#Register the delays exported above. Values already present are skipped, so running
#this cell more than once does not duplicate entries in list_delays.
list_delays += [d for d in n if d not in list_delays]

In [65]:
#Persist the list of exported delays
delays_pickle_path = saving_path+"/list_delays_Brazil.pikle"
with open(delays_pickle_path, 'wb') as delays_file:
    pickle.dump(list_delays, delays_file)

From which day on does every city in Brazil have at least one confirmed case?

In [66]:
df = cut_by_num_days(brazil_dates, start_delay=0, end_delay=293)
df[df['cases']==0]

Unnamed: 0,city,population,cases
