In [1]:
import os
import glob
import pandas as pd

pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.0f}'.format

RAW_DIR = "../data/raw"
MOL_2024_DIR = os.path.join(RAW_DIR, "molinetes", "2024")
FREQ_XLSX = os.path.join(RAW_DIR, "frecuencia", "frecuencia_subte.xlsx")

In [2]:
csv_paths = sorted(glob.glob(os.path.join(MOL_2024_DIR, "*.csv")))[:3]
csv_paths

[]

In [3]:
df_list = []
for p in csv_paths:
    try:
        df_tmp = pd.read_csv(p, encoding="utf-8")
    except UnicodeDecodeError:
        df_tmp = pd.read_csv(p, encoding="latin1")
    # Si el separador es ; en vez de ,
    if df_tmp.shape[1] == 1:  # probablemente sea ; o mal parseado
        df_tmp = pd.read_csv(p, sep=";", encoding=df_tmp.encoding if hasattr(df_tmp, "encoding") else "utf-8", engine="python")
    df_tmp["source_file"] = os.path.basename(p)
    df_list.append(df_tmp)

mol = pd.concat(df_list, ignore_index=True)
mol.head()

ValueError: No objects to concatenate

In [None]:
cols = [c.lower() for c in mol.columns]
mol.columns = cols

rename_map = {}
for c in mol.columns:
    if c in ["linea", "line", "línea"]:
        rename_map[c] = "line"
    if c in ["estacion", "estación", "station"]:
        rename_map[c] = "station"
    if c in ["fecha", "date"]:
        rename_map[c] = "date"
    if c in ["viajes", "pasajeros", "passengers", "conteo", "count"]:
        rename_map[c] = "passengers"

mol = mol.rename(columns=rename_map)

if "date" in mol.columns:
    mol["date"] = pd.to_datetime(mol["date"], errors="coerce", dayfirst=True)
    mol["year_month"] = mol["date"].dt.to_period("M").astype(str)

for col in ["line", "station"]:
    if col in mol.columns:
        mol[col] = mol[col].astype(str).str.strip().str.upper()

if "passengers" in mol.columns:
    mol["passengers"] = pd.to_numeric(mol["passengers"], errors="coerce")

mol.head()

In [None]:
agg_line = (mol
            .dropna(subset=["line","passengers"])
            .groupby("line", as_index=False)["passengers"].sum()
            .sort_values("passengers", ascending=False))
agg_line.head(10)

In [None]:
if "year_month" in mol.columns:
    trend = (mol
             .dropna(subset=["year_month","passengers"])
             .groupby(["year_month","line"], as_index=False)["passengers"].sum()
             .sort_values(["year_month","line"]))
    trend.head(12)

In [None]:
freq = pd.read_excel(FREQ_XLSX)
freq.columns = [c.lower() for c in freq.columns]

rename_freq = {}
for c in freq.columns:
    if c in ["linea", "line", "línea"]:
        rename_freq[c] = "line"
    if c in ["mes", "month", "periodo", "period"]:
        rename_freq[c] = "year_month"
    if c in ["frecuencia", "trenes", "despachos", "services", "dispatches"]:
        rename_freq[c] = "services"

freq = freq.rename(columns=rename_freq)

for col in ["line", "year_month"]:
    if col in freq.columns:
        freq[col] = freq[col].astype(str).str.strip().str.upper()

if "services" in freq.columns:
    freq["services"] = pd.to_numeric(freq["services"], errors="coerce")

freq.head()

In [None]:
if {"year_month","line","passengers"}.issubset(set(trend.columns)) and {"year_month","line","services"}.issubset(set(freq.columns)):
    kpi = (trend.merge(freq[["year_month","line","services"]], on=["year_month","line"], how="left")
                 .assign(pax_per_service=lambda d: d["passengers"] / d["services"]))
    kpi.sort_values(["year_month","line"]).head(12)
else:
    kpi = None
kpi.head() if kpi is not None else "KPI join not available yet (check column names)"

In [None]:
os.makedirs("../data/processed", exist_ok=True)
mol_sample = mol.copy()

keep_cols = [c for c in ["year_month","line","station","passengers"] if c in mol_sample.columns]
mol_sample = mol_sample[keep_cols].dropna()

mol_sample.to_csv("../data/processed/molinetes_sample.csv", index=False)
if kpi is not None:
    kpi.to_csv("../data/processed/kpi_pax_per_service.csv", index=False)

"Saved processed samples to ../data/processed/"