In [1]:
import os
import sys
import pandas as pd
import requests
from collections import defaultdict

In [2]:
# Locate the project root. Under GitHub Actions the working directory is used
# as-is; otherwise the root is taken to be three directory levels above the
# current working directory.
project_path = (
    os.getcwd()
    if 'GITHUB_ACTIONS' in os.environ
    else os.path.abspath(os.path.join(os.getcwd(), '../../..'))
)

# Make the project root importable; skip the append when it is already listed
# so re-running this cell does not add duplicate entries.
if sys.path.count(project_path) == 0:
    sys.path.append(project_path)

# Now you can import your custom module
from data_utils.data_processing import download_file, process_zip_file


In [3]:
from data_utils.data_processing import download_file, read_csv_file


In [4]:
def get_data_paths():
    """Return the source and processed data directories for this dataset.

    The data root is ``<cwd>/data`` when the ``GITHUB_ACTIONS`` environment
    variable is present, and ``<cwd>/../../../data`` (made absolute) otherwise.

    Returns:
        tuple: ``(source_path, processed_path)``, i.e. the data root joined
        with ``source/ine/empleo`` and ``processed/ine/empleo`` respectively.
    """
    cwd = os.getcwd()
    if 'GITHUB_ACTIONS' not in os.environ:
        # Local run: the data folder is looked up three levels above the cwd.
        data_root = os.path.abspath(os.path.join(cwd, '../../../data'))
    else:
        data_root = os.path.join(cwd, 'data')

    return (
        os.path.join(data_root, "source/ine/empleo"),
        os.path.join(data_root, "processed/ine/empleo"),
    )


In [5]:
# Resolve both data directories once; later cells read these two names.
source_dir, processed_dir = get_data_paths()

# Echo the resolved locations, one per line.
print(f"Source dir: {source_dir}", f"Processed dir: {processed_dir}", sep="\n")


Source dir: /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo
Processed dir: /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo


In [6]:
# Create the source and processed directories (including parents) if missing;
# exist_ok=True makes this safe to re-run.
for _data_dir in (source_dir, processed_dir):
    os.makedirs(_data_dir, exist_ok=True)


In [9]:
# List of file URLs, newest first. Every file lives at
#   <base>/<year>/csv/ene-<year>-<MM>-<code>.csv
# where <code> is the three-letter tag that accompanies month number <MM>.
_ENE_BASE_URL = "https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd"

# Three-letter tag for month numbers 01..12, in that order.
_ENE_MONTH_CODES = (
    "def", "efm", "fma", "mam", "amj", "mjj",
    "jja", "jas", "aso", "son", "ond", "nde",
)

# Highest month number fetched for each year; months run from it down to 01.
# NOTE(review): only 2023 is complete, the other years stop at month 04 --
# confirm this is intentional. Add more years / raise the limits as needed.
_ENE_LAST_MONTH_BY_YEAR = {2024: 4, 2023: 12, 2022: 4, 2021: 4, 2020: 4, 2019: 4}

file_urls = [
    f"{_ENE_BASE_URL}/{year}/csv/ene-{year}-{month:02d}-{_ENE_MONTH_CODES[month - 1]}.csv"
    for year, last_month in _ENE_LAST_MONTH_BY_YEAR.items()
    for month in range(last_month, 0, -1)
]


In [10]:
# Field separator of the downloaded CSV files; passed to read_csv_file when
# each file is loaded.
delimiter = ";"


In [11]:
# Function to preprocess the "fact_cal" field
def preprocess_fact_cal(df):
    """Convert the "fact_cal" column of ``df`` to float.

    String values may use a comma as the decimal separator ("1,5"); commas
    are replaced with dots before the conversion. A column that is already
    numeric is simply cast to float instead of failing on the ``.str``
    accessor.

    Args:
        df: DataFrame that may contain a "fact_cal" column. The column is
            replaced on this same object (the frame is modified in place).

    Returns:
        The same DataFrame object. It is returned untouched when it has no
        "fact_cal" column.

    Raises:
        ValueError: if a value cannot be parsed as a float after the
            comma-to-dot replacement.
    """
    if 'fact_cal' not in df.columns:
        return df

    column = df['fact_cal']
    if pd.api.types.is_numeric_dtype(column):
        # Already parsed as numbers (the file had no decimal commas); the
        # .str accessor would raise AttributeError on such a column.
        df['fact_cal'] = column.astype(float)
    else:
        # regex=False: the comma is a literal character, not a pattern.
        df['fact_cal'] = column.str.replace(',', '.', regex=False).astype(float)
    return df



In [12]:
# Maps a trimester key taken from the file name (e.g. "04-mam") to the
# DataFrame holding the data of every file that shares that key. Looking up a
# missing key yields an empty DataFrame.
trimester_data = defaultdict(pd.DataFrame)


In [13]:
# Download files and group by trimester.
#
# Frames are first collected per trimester and concatenated once at the end.
# This avoids re-copying an ever-growing DataFrame on every iteration (and
# concatenating against an empty placeholder frame), and it means re-running
# this cell replaces each trimester's data instead of appending the same
# files a second time.
frames_by_trimester = {}

for url in file_urls:
    # Extract filename from URL
    filename = url.split('/')[-1]
    csv_path = os.path.join(source_dir, filename)

    print(f"Next {url}")
    # Only download files that are not already present locally
    if not os.path.exists(csv_path):
        download_file(url, csv_path)
        print(f"Downloaded {filename}")
    else:
        print(f"{filename} already exists. Skipping download.")

    # Read the CSV file and normalise the "fact_cal" field
    df = preprocess_fact_cal(read_csv_file(csv_path, delimiter=delimiter))

    # Extract trimester info (e.g., "04-mam" from "ene-2024-04-mam.csv").
    # The year is deliberately not part of the key, so files from different
    # years with the same trimester tag end up in the same group.
    trimester = '-'.join(filename.split('-')[2:4]).split('.')[0]

    frames_by_trimester.setdefault(trimester, []).append(df)

    print(f"Added {filename} to the trimester {trimester} DataFrame.")

# One concat per trimester, in the order the files were listed.
trimester_data.update(
    (key, pd.concat(frames, ignore_index=True))
    for key, frames in frames_by_trimester.items()
)

Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-04-mam.csv
ene-2024-04-mam.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2024-04-mam.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2024-04-mam.csv to the trimester 04-mam DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-03-fma.csv
ene-2024-03-fma.csv already exists. Skipping download.
Detected encoding for /Users/ernestolaval/Documents/Github Repositories/data_chile/data/source/ine/empleo/ene-2024-03-fma.csv: utf-8
Successfully read file with encoding utf-8
Added ene-2024-03-fma.csv to the trimester 03-fma DataFrame.
Next https://www.ine.gob.cl/docs/default-source/ocupacion-y-desocupacion/bbdd/2024/csv/ene-2024-02-efm.csv
Downloaded ene-2024-02-efm.csv
Detected encoding for /Users/ernestolaval/Documents/Github Repos

In [14]:
# Write every merged trimester frame to its own Parquet file inside
# processed_dir, named "ene-<trimester>.parquet".
for trimester in trimester_data:
    df = trimester_data[trimester]
    processed_path = os.path.join(processed_dir, f"ene-{trimester}.parquet")
    df.to_parquet(processed_path)
    print(f"Merged DataFrame for trimester {trimester} saved to {processed_path}")


Merged DataFrame for trimester 04-mam saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-04-mam.parquet
Merged DataFrame for trimester 03-fma saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-03-fma.parquet
Merged DataFrame for trimester 02-efm saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-02-efm.parquet
Merged DataFrame for trimester 01-def saved to /Users/ernestolaval/Documents/Github Repositories/data_chile/data/processed/ine/empleo/ene-01-def.parquet
