Skip to content

Commit

Permalink
ENH: save the dataset locally so it is not (slowly) re-read from the web each time WIP: …
Browse files Browse the repository at this point in the history
…save historic datasets
  • Loading branch information
garciaguevara committed Jun 9, 2020
1 parent b75d84b commit ca80395
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
34 changes: 27 additions & 7 deletions covidmx/dge.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@
from covidmx.dge_plot import DGEPlot
pd.options.mode.chained_assignment = None

import wget# import urllib
import os
import zipfile
import shutil



# Zip archive with the current open-data COVID-19 dataset.
URL_DATA = 'http://187.191.75.115/gobmx/salud/datos_abiertos/datos_abiertos_covid19.zip'
# Zip archive with the data dictionary (description) for the dataset.
URL_DESCRIPTION = 'http://187.191.75.115/gobmx/salud/datos_abiertos/diccionario_datos_covid19.zip'
# Template for historical snapshots; `{}` is filled with the requested date
# formatted as '%d.%m.%Y' in read_data.
URL_HISTORICAL = 'http://187.191.75.115/gobmx/salud/datos_abiertos/historicos/datos_abiertos_covid19_{}.zip'
Expand All @@ -21,7 +28,8 @@ def __init__(
return_catalogo=False,
return_descripcion=False,
date=None,
date_format='%d-%m-%Y'):
date_format='%d-%m-%Y',
data_path=None):
"""
Returns COVID19 data from the Direccion General de Epidemiología
Expand All @@ -30,6 +38,7 @@ def __init__(
self.clean = clean
self.return_catalogo = return_catalogo
self.return_descripcion = return_descripcion
self.data_path = data_path


self.date = date
Expand Down Expand Up @@ -61,28 +70,39 @@ def get_data(self, preserve_original=None):

return df

def get_encoded_data(self, path, encoding='UTF-8'):
    """Read a CSV into a DataFrame, retrying as ISO-8859-1 on decode errors.

    Parameters
    ----------
    path : str
        Location handed straight to ``pd.read_csv``; ``read_data`` passes
        either a local zip file path or a remote URL.
    encoding : str, default 'UTF-8'
        Encoding tried first.

    Returns
    -------
    pandas.DataFrame
        The parsed data.

    Raises
    ------
    RuntimeError
        If the data cannot be read with ``encoding`` nor, after a
        ``UnicodeDecodeError``, with ISO-8859-1. The underlying error is
        chained as ``__cause__``.
    """
    fallback_encoding = 'ISO-8859-1'

    try:
        return pd.read_csv(path, encoding=encoding)
    except UnicodeDecodeError:
        # Wrong encoding guess: retry once below with the fallback encoding.
        pass
    except Exception as err:
        # Catch Exception (not BaseException) so KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a read failure.
        raise RuntimeError('Cannot read the data.') from err

    try:
        return pd.read_csv(path, encoding=fallback_encoding)
    except Exception as err:
        raise RuntimeError('Cannot read the data.') from err

def read_data(self, encoding='UTF-8'):

# data_file= os.path.join( data_path, os.path.split( url_data )[1].replace('.','-').replace('-zip','.zip') )
if self.date is None:
url_data = URL_DATA
data_file= os.path.join( self.data_path, os.path.split( url_data )[1] )

if not os.path.exists(data_file):
wget.download(url_data, data_file)
with ZipFile(data_file) as myzip:
myzip.infolist()
df_filename=myzip.infolist()[0].filename.split('.')[0]; myzip.close()
shutil.copyfile( data_file, data_file.replace('.zip',df_filename+'.zip') )

data_path=data_file

else:
date_f = self.date.strftime('%d.%m.%Y')
url_data = URL_HISTORICAL.format(date_f)
data_path = URL_HISTORICAL.format(date_f)

data = self.get_encoded_data(url_data)
data = self.get_encoded_data(data_path)

try:
r_url = requests.get(URL_DESCRIPTION, stream=True)
Expand Down
3 changes: 2 additions & 1 deletion covidmx/dge_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def prepare_data(self, df):
df = pd.concat([df, pd.get_dummies(df['resultado'])], axis=1)

int_vars = list(replace_resultado.values()) + ['muertos']
df[int_vars] = df[int_vars].astype(int)
df[int_vars] = df[int_vars].astype(int) #19764 09-07
#confL=df['resultado']=='confirmados'; df[confL]['muertos'].sum()

return df

Expand Down

0 comments on commit ca80395

Please sign in to comment.