In [71]:
import os
import shutil
import tempfile
from zipfile import ZipFile

import chardet
import pandas as pd
import requests

In [72]:
# Column names supplied to the CSV reader for files that ship without a header row.
# The order here is the order in which the names are assigned to the file's columns.
DEFAULT_HEADERS = [
    # Columns that precede the diagnosis fields
    "AÑO",
    "FECHA_DEF",
    "SEXO_NOMBRE",
    "EDAD_TIPO",
    "EDAD_CANT",
    "COD_COMUNA",
    "COMUNA",
    "NOMBRE_REGION",
    # Fields suffixed DIAG1
    "DIAG1",
    "CAPITULO_DIAG1",
    "GLOSA_CAPITULO_DIAG1",
    "CODIGO_GRUPO_DIAG1",
    "GLOSA_GRUPO_DIAG1",
    "CODIGO_CATEGORIA_DIAG1",
    "GLOSA_CATEGORIA_DIAG1",
    "CODIGO_SUBCATEGORIA_DIAG1",
    "GLOSA_SUBCATEGORIA_DIAG1",
    # Fields suffixed DIAG2
    "DIAG2",
    "CAPITULO_DIAG2",
    "GLOSA_CAPITULO_DIAG2",
    "CODIGO_GRUPO_DIAG2",
    "GLOSA_GRUPO_DIAG2",
    "CODIGO_CATEGORIA_DIAG2",
    "GLOSA_CATEGORIA_DIAG2",
    "CODIGO_SUBCATEGORIA_DIAG2",
    "GLOSA_SUBCATEGORIA_DIAG2",
    # Final column
    "LUGAR_DEFUNCION",
]


In [73]:
def get_data_paths():
    """Return the ``(source_path, processed_path)`` directories for the DEIS data.

    The base directory is the current working directory when the
    ``GITHUB_ACTIONS`` environment variable is set; otherwise it is the
    directory three levels above the current working directory.

    Returns:
        tuple: ``(source_path, processed_path)``, both built by joining the
        base directory with the fixed ``data/.../minsal/deis`` sub-paths.
    """
    working_dir = os.getcwd()
    running_in_github_actions = 'GITHUB_ACTIONS' in os.environ

    # NOTE(review): the local branch climbs three levels from the working
    # directory; confirm this matches where the notebook actually lives.
    root = (
        working_dir
        if running_in_github_actions
        else os.path.abspath(os.path.join(working_dir, '../../../'))
    )

    return tuple(
        os.path.join(root, relative)
        for relative in ("data/source/minsal/deis", "data/processed/minsal/deis")
    )

def download_file(url, save_path, timeout=60, chunk_size=1024 * 1024):
    """Download ``url`` to ``save_path``, streaming the body to disk.

    The payload is first written to ``<save_path>.part`` and only renamed to
    ``save_path`` once the transfer has finished, so an interrupted download
    never leaves a truncated file at the final location (callers that skip
    the download when the file already exists would otherwise reuse it).

    Args:
        url: Address of the file to fetch.
        save_path: Destination path of the downloaded file.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block indefinitely.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        bool: ``True`` if the file was saved, ``False`` if the server answered
        with a status code other than 200 (a message is printed in that case).

    Raises:
        Any exception raised by ``requests`` or by the file system is
        propagated after the partial file has been removed.
    """
    partial_path = f"{save_path}.part"

    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download file from {url}")
            return False

        try:
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        except BaseException:
            # Do not leave a half-written file behind on any failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # Atomic on the same file system: the final path appears only when complete.
    os.replace(partial_path, save_path)
    return True

def detect_encoding(file_path, num_bytes=1000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        num_bytes: Maximum number of bytes read from the start of the file
            and handed to ``chardet.detect``.

    Returns:
        The value stored under the ``'encoding'`` key of chardet's result.
    """
    with open(file_path, 'rb') as handle:
        sample = handle.read(num_bytes)
    return chardet.detect(sample)['encoding']

def read_csv_file(file_path, delimiter=';', header='infer', names=None):
    """Load a delimited text file into a DataFrame using a detected encoding.

    The encoding comes from ``detect_encoding(file_path)``. Progress and
    failures are reported with ``print``.

    Args:
        file_path: Path of the CSV file.
        delimiter: Field separator forwarded to ``pd.read_csv`` as ``sep``.
        header: Forwarded unchanged to ``pd.read_csv``.
        names: Forwarded unchanged to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, or ``None`` if reading raised any ``Exception``
        (the error is printed rather than propagated).
    """
    encoding = detect_encoding(file_path)
    print(f"Detected encoding for {file_path}: {encoding}")

    reader_options = dict(
        sep=delimiter,
        encoding=encoding,
        index_col=False,
        low_memory=False,
        header=header,
        names=names,
    )

    try:
        frame = pd.read_csv(file_path, **reader_options)
        print(f"Successfully read file with encoding {encoding}")
    except Exception as exc:
        # Best effort by design: report the problem and signal failure with None.
        print(f"Error reading CSV file {file_path}: {exc}")
        return None
    return frame

def extract_csv_from_zip(zip_path, extract_filename, delimiter=';', header='infer', names=None):
    """Read one CSV member of a zip archive into a DataFrame.

    The member is copied to a file inside a private temporary directory
    (``read_csv_file`` needs a real path to detect the encoding), parsed, and
    the temporary directory is removed again before returning, so no
    extracted copy is left behind.

    Args:
        zip_path: Path of the zip archive.
        extract_filename: Exact member name inside the archive.
        delimiter: Forwarded to ``read_csv_file``.
        header: Forwarded to ``read_csv_file``.
        names: Forwarded to ``read_csv_file``.

    Returns:
        Whatever ``read_csv_file`` returns for the extracted file, or ``None``
        when ``extract_filename`` is not a member of the archive (a message is
        printed in that case).
    """
    with ZipFile(zip_path, 'r') as zip_file:
        if extract_filename not in zip_file.namelist():
            print(f"{extract_filename} not found in the zip archive.")
            return None

        # A per-call temporary directory works on every platform and is
        # deleted automatically, even if parsing raises.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Use only the final path component so members stored under a
            # sub-folder of the archive still map to a valid temp file.
            temp_csv_path = os.path.join(temp_dir, os.path.basename(extract_filename))
            with zip_file.open(extract_filename) as member, open(temp_csv_path, 'wb') as temp_f:
                # Stream in chunks rather than loading the whole member in memory.
                shutil.copyfileobj(member, temp_f)
            return read_csv_file(temp_csv_path, delimiter=delimiter, header=header, names=names)

def process_zip_file(url, extract_filename, source_dir, processed_dir, delimiter=';', header='infer', names=None):
    """Fetch a zip archive if needed, parse one CSV member and save it as Parquet.

    The archive is stored in ``source_dir`` under the last path segment of
    ``url`` and is only downloaded when no file with that name exists yet.
    The Parquet file is written to ``processed_dir`` using the CSV name with
    its extension swapped for ``.parquet``.

    Args:
        url: Download address of the zip archive.
        extract_filename: Name of the CSV member to read from the archive.
        source_dir: Directory where the zip archive is kept.
        processed_dir: Directory where the Parquet file is written.
        delimiter: Forwarded to ``extract_csv_from_zip``.
        header: Forwarded to ``extract_csv_from_zip``.
        names: Forwarded to ``extract_csv_from_zip``.

    Returns:
        The path of the written Parquet file, or ``None`` when the archive
        could not be downloaded or the CSV could not be read.
    """
    # Extract filename from URL
    zip_filename = url.split('/')[-1]
    zip_path = os.path.join(source_dir, zip_filename)

    if os.path.exists(zip_path):
        print(f"{zip_filename} already exists. Skipping download.")
    else:
        download_file(url, zip_path)
        # download_file reports failures by printing, so verify the result on
        # disk instead of announcing success and crashing on a missing file.
        if not os.path.exists(zip_path):
            print(f"Could not download {zip_filename}. Skipping {extract_filename}.")
            return None
        print(f"Downloaded {zip_filename}")

    # Extract and process the CSV file
    df = extract_csv_from_zip(zip_path, extract_filename, delimiter=delimiter, header=header, names=names)
    if df is None:
        return None

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier ".csv" occurring inside the name.
    parquet_filename = os.path.splitext(extract_filename)[0] + '.parquet'
    parquet_path = os.path.join(processed_dir, parquet_filename)
    df.to_parquet(parquet_path)
    print(f"Processed {extract_filename} and saved to {parquet_path}")
    return parquet_path

In [74]:
# Resolve the source/processed folders and make sure both exist on disk.
source_dir, processed_dir = get_data_paths()
for data_dir in (source_dir, processed_dir):
    os.makedirs(data_dir, exist_ok=True)


In [75]:
# Each entry pairs a zip archive URL with the name of the CSV member to read from it.
file_info = [
    (
        "https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip",
        "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv",
    ),
    (
        "https://repositoriodeis.minsal.cl/DatosAbiertos/VITALES/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip",
        "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv",
    ),
]



In [76]:
# Download (if needed), parse and convert every configured archive.
# Files whose name contains '2022_2024' are read with their own header row;
# every other file is read without a header and labelled with DEFAULT_HEADERS.
for url, extract_filename in file_info:
    if '2022_2024' in extract_filename:
        header_option, names_option = 'infer', None
    else:
        header_option, names_option = None, DEFAULT_HEADERS
    process_zip_file(
        url,
        extract_filename,
        source_dir,
        processed_dir,
        header=header_option,
        names=names_option,
    )


Downloaded DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.zip
Detected encoding for /tmp/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv: ISO-8859-1
Successfully read file with encoding ISO-8859-1
Processed DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.csv and saved to /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/minsal/deis/DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet
Downloaded DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.zip
Detected encoding for /tmp/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv: ISO-8859-1
Successfully read file with encoding ISO-8859-1
Processed DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.csv and saved to /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/minsal/deis/DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet


In [77]:
# Read both converted files back from Parquet and stack them into one table.
df1 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_2022_2024_25062024.parquet"))
df2 = pd.read_parquet(os.path.join(processed_dir, "DEFUNCIONES_FUENTE_DEIS_1990_2021_CIFRAS_OFICIALES.parquet"))

# pd.concat aligns rows by column label. One frame takes its labels from the
# file's own header row and the other from DEFAULT_HEADERS, so any naming
# difference would silently produce extra, partly empty columns. Surface it.
mismatched_columns = set(df1.columns).symmetric_difference(df2.columns)
if mismatched_columns:
    print(f"Warning: column names differ between the two files: {sorted(map(str, mismatched_columns))}")

# Concatenate dataframes
df_combined = pd.concat([df1, df2], ignore_index=True)

# Save combined dataframe
combined_parquet_path = os.path.join(processed_dir, "combined_defunciones.parquet")
df_combined.to_parquet(combined_parquet_path)
print(f"Combined DataFrame saved to {combined_parquet_path}")


Combined DataFrame saved to /Users/ernestolaval/Documents/nodeJS/github/datos_abiertos/data/processed/minsal/deis/combined_defunciones.parquet
