In [38]:
import os
import pandas as pd
import requests
from zipfile import ZipFile
from io import BytesIO

In [39]:
# Define headers for CSV files without headers
DEFAULT_HEADERS = [
"AÑO","FECHA_DEF","SEXO_NOMBRE","EDAD_TIPO","EDAD_CANT","COD_COMUNA","COMUNA","NOMBRE_REGION","DIAG1","CAPITULO_DIAG1","GLOSA_CAPITULO_DIAG1","CODIGO_GRUPO_DIAG1","GLOSA_GRUPO_DIAG1","CODIGO_CATEGORIA_DIAG1","GLOSA_CATEGORIA_DIAG1","CODIGO_SUBCATEGORIA_DIAG1","GLOSA_SUBCATEGORIA_DIAG1","DIAG2","CAPITULO_DIAG2","GLOSA_CAPITULO_DIAG2","CODIGO_GRUPO_DIAG2","GLOSA_GRUPO_DIAG2","CODIGO_CATEGORIA_DIAG2","GLOSA_CATEGORIA_DIAG2","CODIGO_SUBCATEGORIA_DIAG2","GLOSA_SUBCATEGORIA_DIAG2","LUGAR_DEFUNCION"

In [40]:
def get_data_paths():
    # Check if running in the GitHub Actions environment
    if 'GITHUB_ACTIONS' in os.environ:
        base_path = os.getcwd()
    else:
        # Assuming your notebook is in the 'scripts' directory
        base_path = os.path.abspath(os.path.join(os.getcwd(), '../../../data'))

    source_path = os.path.join(base_path, "source/minsal/deis")
    processed_path = os.path.join(base_path, "processed/minsal/deis")
    
    return source_path, processed_path

In [41]:
source_dir, processed_dir = get_data_paths()

print(f"Source dir: {source_dir}")
print(f"Processed dir: {processed_dir}")


Source dir: /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/source/minsal/deis
Processed dir: /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/minsal/deis


In [42]:
os.makedirs(source_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)


In [43]:
# List of file URLs and corresponding CSV filenames to extract
file_info = [
    ("https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip", "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv"),
    ("https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip", "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv")
]


In [44]:
# Function to download a file from a URL
def download_file(url, save_path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download file from {url}")

In [45]:
def read_csv_file(file_path, delimiter=';', encoding='utf-8', header='infer'):
    try:
        return pd.read_csv(file_path, sep=delimiter, encoding=encoding, index_col=False, low_memory=False, header=header)
    except UnicodeDecodeError:
        print(f"Encoding error with {file_path}, trying ISO-8859-1")
        return pd.read_csv(file_path, sep=delimiter, encoding='ISO-8859-1', index_col=False, low_memory=False, header=header)



In [46]:
def extract_csv_from_zip(zip_path, extract_filename, delimiter=';', encoding='utf-8', header='infer'):
    with ZipFile(zip_path, 'r') as zip_file:
        if extract_filename in zip_file.namelist():
            with zip_file.open(extract_filename) as f:
                return read_csv_file(f, delimiter=delimiter, encoding=encoding, header=header)
        else:
            print(f"{extract_filename} not found in the zip archive.")
            return None


In [47]:
def process_zip_file(url, extract_filename, source_dir, processed_dir, delimiter=';', header='infer'):
    # Extract filename from URL
    zip_filename = url.split('/')[-1]
    zip_path = os.path.join(source_dir, zip_filename)

    # Check if the file already exists
    if not os.path.exists(zip_path):
        # Download the zip file if it doesn't exist
        download_file(url, zip_path)
        print(f"Downloaded {zip_filename}")
    else:
        print(f"{zip_filename} already exists. Skipping download.")
    
    # Extract and process the CSV file
    df = extract_csv_from_zip(zip_path, extract_filename, delimiter=delimiter, header=header)
    if df is not None:
        parquet_filename = extract_filename.replace('.csv', '.parquet')
        parquet_path = os.path.join(processed_dir, parquet_filename)
        df.to_parquet(parquet_path)
        print(f"Processed {extract_filename} and saved to {parquet_path}")


In [48]:
# Function to convert CSV to Parquet
def convert_csv_to_parquet(csv_path, parquet_path):
    df = read_csv_file(csv_path)
    df.to_parquet(parquet_path)


In [49]:
# Process each zip file
for url, extract_filename in file_info:
    header_option = None if '1990_2021' in extract_filename else 'infer'
    process_zip_file(url, extract_filename, source_dir, processed_dir, header=header_option, delimiter=';')


DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip already exists. Skipping download.
Encoding error with <zipfile.ZipExtFile name='DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv' mode='r' compress_type=deflate>, trying ISO-8859-1
Processed DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv and saved to /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/minsal/deis/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet
DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip already exists. Skipping download.
Encoding error with <zipfile.ZipExtFile name='DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv' mode='r' compress_type=deflate>, trying ISO-8859-1


  return pd.read_csv(file_path, sep=delimiter, encoding='ISO-8859-1', index_col=False, low_memory=False, header=header)


ParserError: Error tokenizing data. C error: Expected 19 fields in line 2, saw 27


In [None]:
# Read and merge both files using the common structure
df1 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet"))
df2 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet"))

# Concatenate dataframes
df_combined = pd.concat([df1, df2], ignore_index=True)

# Save combined dataframe
combined_parquet_path = os.path.join(processed_dir, "combined_defunciones.parquet")
df_combined.to_parquet(combined_parquet_path)
print(f"Combined DataFrame saved to {combined_parquet_path}")

In [55]:
import os
import pandas as pd
import requests
from zipfile import ZipFile
from io import BytesIO

# Define headers for CSV files without headers
DEFAULT_HEADERS = [
    "AÑO", "FECHA_DEF", "SEXO_NOMBRE", "EDAD_TIPO", "EDAD_CANT", "COD_COMUNA", "COMUNA", "NOMBRE_REGION",
    "DIAG1", "CAPITULO_DIAG1", "GLOSA_CAPITULO_DIAG1", "CODIGO_GRUPO_DIAG1", "GLOSA_GRUPO_DIAG1",
    "CODIGO_CATEGORIA_DIAG1", "GLOSA_CATEGORIA_DIAG1", "CODIGO_SUBCATEGORIA_DIAG1", "GLOSA_SUBCATEGORIA_DIAG1",
    "DIAG2", "CAPITULO_DIAG2", "GLOSA_CAPITULO_DIAG2", "CODIGO_GRUPO_DIAG2", "GLOSA_GRUPO_DIAG2",
    "CODIGO_CATEGORIA_DIAG2", "GLOSA_CATEGORIA_DIAG2", "CODIGO_SUBCATEGORIA_DIAG2", "GLOSA_SUBCATEGORIA_DIAG2",
    "LUGAR_DEFUNCION"
]

def get_data_paths():
    # Check if running in the GitHub Actions environment
    if 'GITHUB_ACTIONS' in os.environ:
        base_path = os.getcwd()
    else:
        # Assuming your notebook is in the 'scripts' directory
        base_path = os.path.abspath(os.path.join(os.getcwd(), '../../../'))

    source_path = os.path.join(base_path, "data/source/vitales")
    processed_path = os.path.join(base_path, "data/processed/vitales")
    
    return source_path, processed_path

def download_file(url, save_path):
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download file from {url}")

def read_csv_file(file_path, delimiter=';', encoding='utf-8', header='infer', names=None):
    try:
        df = pd.read_csv(file_path, sep=delimiter, encoding=encoding, index_col=False, low_memory=False, header=header, names=names)
        print(f"Successfully read file with encoding {encoding}")
        return df
    except UnicodeDecodeError:
        print(f"Encoding error with {file_path}, trying ISO-8859-1")
        return pd.read_csv(file_path, sep=delimiter, encoding='ISO-8859-1', index_col=False, low_memory=False, header=header, names=names)

def extract_csv_from_zip(zip_path, extract_filename, delimiter=';', header='infer', names=None):
    with ZipFile(zip_path, 'r') as zip_file:
        if extract_filename in zip_file.namelist():
            with zip_file.open(extract_filename) as f:
                try:
                    first_lines = f.read(2048).decode('utf-8').split('\n')
                except UnicodeDecodeError:
                    first_lines = f.read(2048).decode('ISO-8859-1').split('\n')
                
                num_columns = len(first_lines[0].split(delimiter))
                print(f"Number of columns detected: {num_columns}")
                if names and len(names) != num_columns:
                    print("Mismatch in number of columns between headers and data.")
                    names = None  # Fall back to automatic handling
                
                # Reset the file pointer and read the entire file
                f.seek(0)
                try:
                    df = read_csv_file(f, delimiter=delimiter, encoding='utf-8', header=header, names=names)
                except UnicodeDecodeError:
                    df = read_csv_file(f, delimiter=delimiter, encoding='ISO-8859-1', header=header, names=names)
                
                return df
        else:
            print(f"{extract_filename} not found in the zip archive.")
            return None

def process_zip_file(url, extract_filename, source_dir, processed_dir, delimiter=';', header='infer', names=None):
    # Extract filename from URL
    zip_filename = url.split('/')[-1]
    zip_path = os.path.join(source_dir, zip_filename)

    # Check if the file already exists
    if not os.path.exists(zip_path):
        # Download the zip file if it doesn't exist
        download_file(url, zip_path)
        print(f"Downloaded {zip_filename}")
    else:
        print(f"{zip_filename} already exists. Skipping download.")
    
    # Extract and process the CSV file
    df = extract_csv_from_zip(zip_path, extract_filename, delimiter=delimiter, header=header, names=names)
    if df is not None:
        parquet_filename = extract_filename.replace('.csv', '.parquet')
        parquet_path = os.path.join(processed_dir, parquet_filename)
        df.to_parquet(parquet_path)
        print(f"Processed {extract_filename} and saved to {parquet_path}")

# Create directories if they don't exist
source_dir, processed_dir = get_data_paths()
os.makedirs(source_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# List of file URLs and corresponding CSV filenames to extract
file_info = [
    ("https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip", "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv"),
    ("https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip", "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv")
]

# Process each zip file
for url, extract_filename in file_info:
    header_option = 'infer' if '2022_2024' in extract_filename else None
    names_option = None if header_option == 'infer' else DEFAULT_HEADERS
    process_zip_file(url, extract_filename, source_dir, processed_dir, header=header_option, names=names_option)

# Read and merge both files using the common structure
df1 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet"))
df2 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet"))

# Concatenate dataframes
df_combined = pd.concat([df1, df2], ignore_index=True)

# Save combined dataframe
combined_parquet_path = os.path.join(processed_dir, "combined_defunciones.parquet")
df_combined.to_parquet(combined_parquet_path)
print(f"Combined DataFrame saved to {combined_parquet_path}")


DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip already exists. Skipping download.
Number of columns detected: 25
Encoding error with <zipfile.ZipExtFile name='DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv' mode='r' compress_type=deflate>, trying ISO-8859-1


  return pd.read_csv(file_path, sep=delimiter, encoding='ISO-8859-1', index_col=False, low_memory=False, header=header, names=names)


Processed DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv and saved to /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/vitales/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet
DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip already exists. Skipping download.
Number of columns detected: 26
Mismatch in number of columns between headers and data.
Encoding error with <zipfile.ZipExtFile name='DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv' mode='r' compress_type=deflate>, trying ISO-8859-1


ParserError: Error tokenizing data. C error: Expected 19 fields in line 2, saw 27


In [None]:
df = extract_csv_from_zip(zip_path, extract_filename, delimiter=delimiter, header=header, names=names)


In [56]:
zip_path = os.path.join(source_dir, "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip")

In [57]:
zip_path

'/Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/source/vitales/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip'

In [58]:
extract_filename = "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv"

In [59]:
extract_filename

'DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv'

In [60]:
df = extract_csv_from_zip(zip_path, extract_filename, delimiter=";", header="none", names=DEFAULT_HEADERS)


Number of columns detected: 25
Mismatch in number of columns between headers and data.


ValueError: header must be integer or list of integers