In [1]:
import os, glob, csv, pandas as pd, re
from collections import defaultdict

BASE_DIR = os.path.abspath("..")
RAW_DIR = os.path.join(BASE_DIR, "data", "raw")
MOL_DIR = os.path.join(RAW_DIR, "molinetes")

mol_csvs = sorted(glob.glob(os.path.join(MOL_DIR, "*.csv")))
print("CSV detectados:", len(mol_csvs))
for p in mol_csvs[:5]:
    print(" -", os.path.basename(p))

assert len(mol_csvs) > 0, "No se encontraron CSVs en data/raw/molinetes/."

ENC = "utf-8-sig"
SEP = ";"


def normalize_token(s: str) -> str:
    s = (s or "").strip().strip('"').strip()          # quita comillas/espacios
    s = re.sub(r"\s+", "_", s)                         # espacios -> _
    s = s.lower()
    if s == "":
        s = "col"
    return s

def make_unique(cols):
    seen = defaultdict(int)
    out = []
    for c in cols:
        base = normalize_token(c)
        seen[base] += 1
        if seen[base] > 1:
            out.append(f"{base}_{seen[base]-1}")       # col, col_1, col_2...
        else:
            out.append(base)
    return out

def read_header_names(path, enc="utf-8-sig", sep=";"):
    with open(path, "r", encoding=enc, errors="replace") as f:
        first_line = f.readline().rstrip("\n\r")
    # quita comillas de punta si aparecen
    if first_line.startswith('"') and first_line.endswith('"'):
        first_line = first_line[1:-1]
    raw_cols = [c for c in first_line.split(sep)]
    cols = make_unique(raw_cols)
    return cols

def read_molinetes_quoted(path, enc="utf-8-sig", sep=";"):
    cols = read_header_names(path, enc=enc, sep=sep)
    # leer con header=None y nuestros names únicos
    df = pd.read_csv(
        path,
        sep=sep,
        encoding=enc,
        engine="python",
        header=None,
        names=cols,
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip"
    )
    # limpiar comillas residuales en object
    for c in df.select_dtypes(include="object").columns:
        df[c] = df[c].astype(str).str.strip('"').str.strip()
    return df

# Cargar una muestra de 3 archivos
df_list = []
for p in mol_csvs[:3]:
    df = read_molinetes_quoted(p, enc=ENC, sep=SEP)
    df["source_file"] = os.path.basename(p)
    df_list.append(df)

mol = pd.concat(df_list, ignore_index=True)
mol.columns = [c.strip().lower() for c in mol.columns]

# Mapeo a nombres canónicos
rename_map = {}
for c in mol.columns:
    if c in {"fecha"}: rename_map[c] = "date"
    if c in {"desde", "desde_hora", "hora_desde"}: rename_map[c] = "time_from"
    if c in {"hasta", "hasta_hora", "hora_hasta"}: rename_map[c] = "time_to"
    if c in {"linea", "línea", "line"}: rename_map[c] = "line"
    if c in {"estacion", "estación", "station"}: rename_map[c] = "station"
    # campos de demanda comunes
    if c in {"pax_total","viajes","pasajeros","pax","passengers","conteo","count"}:
        rename_map[c] = "passengers"

mol = mol.rename(columns=rename_map)

# Parseos
if "date" in mol.columns:
    mol["date"] = pd.to_datetime(mol["date"], errors="coerce", dayfirst=True)
    mol["year_month"] = mol["date"].dt.to_period("M").astype(str)

for col in ["line","station","time_from","time_to"]:
    if col in mol.columns:
        mol[col] = mol[col].astype(str).str.strip().str.upper()

# Si no existe passengers, tratar de derivarlo
if "passengers" not in mol.columns:
    for alt in ["pax_total","viajes","pasajeros","pax","count"]:
        if alt in mol.columns:
            mol["passengers"] = pd.to_numeric(mol[alt], errors="coerce")
            break

# Normalizar línea "LineaB" -> "B"
if "line" in mol.columns:
    mol["line"] = (mol["line"]
                   .str.upper()
                   .str.replace(r"^LINEA", "", regex=True)
                   .str.strip())

mol.head()


CSV detectados: 24
 - 202401_PAX15min-ABC.csv
 - 202401_PAX15min-DEH.csv
 - 202402_PAX15min-ABC.csv
 - 202402_PAX15min-DEH.csv
 - 202403_PAX15min-ABC.csv


  mol["date"] = pd.to_datetime(mol["date"], errors="coerce", dayfirst=True)


Unnamed: 0,date,time_from,time_to,line,molinete,station,pax_pagos,pax_pases_pagos,pax_franq,passengers,source_file,col,col_1,year_month
0,NaT,DESDE,HASTA,,MOLINETE,ESTACION,pax_pagos,pax_pases_pagos,pax_franq,pax_TOTAL,202401_PAX15min-ABC.csv,,,NaT
1,2024-01-01,07:45:00,08:00:00,B,LineaB_Malabia_N_Turn01,MALABIA,3,0,0,3,202401_PAX15min-ABC.csv,,,2024-01
2,2024-01-01,07:45:00,08:00:00,B,LineaB_Tronador_Turn01,TRONADOR,1,0,0,1,202401_PAX15min-ABC.csv,,,2024-01
3,2024-01-01,07:45:00,08:00:00,B,LineaB_Pellegrini_E_Turn05,CARLOS PELLEGRINI,13,0,0,13,202401_PAX15min-ABC.csv,,,2024-01
4,2024-01-01,07:45:00,08:00:00,A,LineaA_Flores_Este_Turn03,FLORES,2,0,0,2,202401_PAX15min-ABC.csv,,,2024-01


In [2]:
import numpy as np
# 1) eliminar columnas basura si existen
for bad in ["col", "col_1", "unnamed: 1", "unnamed: 2"]:
    if bad in mol.columns:
        mol = mol.drop(columns=[bad], errors="ignore")

# 2) eliminar la fila de encabezado “incrustada” (detectable porque trae 'DESDE'/'HASTA')
mask_header_row = (
    mol.get("time_from", "").astype(str).str.upper().eq("DESDE") |
    mol.get("time_to", "").astype(str).str.upper().eq("HASTA")
)
mol = mol.loc[~mask_header_row].copy()

# 3) coerción de tipos
if "date" in mol.columns:
    mol["date"] = pd.to_datetime(mol["date"], errors="coerce", dayfirst=True)
    mol["year_month"] = mol["date"].dt.to_period("M").astype(str)

for col in ["time_from","time_to"]:
    if col in mol.columns:
        # mantener como string HH:MM:SS; si quisieras tipo time:
        # mol[col] = pd.to_datetime(mol[col], format="%H:%M:%S", errors="coerce").dt.time
        mol[col] = mol[col].astype(str).str.strip()

for col in ["line","station","molinete"]:
    if col in mol.columns:
        mol[col] = mol[col].astype(str).str.strip().str.upper()

# 4) pasajeros numérico (preferimos pax_total si existe)
if "passengers" not in mol.columns:
    for cand in ["pax_total","viajes","pasajeros","pax","count"]:
        if cand in mol.columns:
            mol["passengers"] = pd.to_numeric(mol[cand], errors="coerce")
            break
else:
    mol["passengers"] = pd.to_numeric(mol["passengers"], errors="coerce")

# 5) normalizar línea "LINEA B" o "LineaB" -> "B"
if "line" in mol.columns:
    mol["line"] = (mol["line"]
                   .str.replace(r"^LINEA\s*", "", regex=True)
                   .str.replace(r"^LINEA", "", regex=True)
                   .str.replace("_", " ")
                   .str.strip()
                   .str.upper())

mol = mol.reset_index(drop=True)
mol.head()


Unnamed: 0,date,time_from,time_to,line,molinete,station,pax_pagos,pax_pases_pagos,pax_franq,passengers,source_file,year_month
0,2024-01-01,07:45:00,08:00:00,B,LINEAB_MALABIA_N_TURN01,MALABIA,3,0,0,3,202401_PAX15min-ABC.csv,2024-01
1,2024-01-01,07:45:00,08:00:00,B,LINEAB_TRONADOR_TURN01,TRONADOR,1,0,0,1,202401_PAX15min-ABC.csv,2024-01
2,2024-01-01,07:45:00,08:00:00,B,LINEAB_PELLEGRINI_E_TURN05,CARLOS PELLEGRINI,13,0,0,13,202401_PAX15min-ABC.csv,2024-01
3,2024-01-01,07:45:00,08:00:00,A,LINEAA_FLORES_ESTE_TURN03,FLORES,2,0,0,2,202401_PAX15min-ABC.csv,2024-01
4,2024-01-01,07:45:00,08:00:00,B,LINEAB_DORREGO_N_TURN03,DORREGO,1,0,0,1,202401_PAX15min-ABC.csv,2024-01


In [3]:
# Pasajeros totales por línea (muestra 3 CSV)
agg_line = (mol
            .dropna(subset=["line","passengers"])
            .groupby("line", as_index=False)["passengers"].sum()
            .sort_values("passengers", ascending=False))
agg_line


Unnamed: 0,line,passengers
1,B,8300556
0,A,6361281
2,C,5201400
5,H,1659613
4,E,1411591
3,D,605572


In [4]:
# Tendencia mensual por línea (si hay fechas)
if "year_month" in mol.columns:
    trend = (mol
             .dropna(subset=["year_month","line","passengers"])
             .groupby(["year_month","line"], as_index=False)["passengers"].sum()
             .sort_values(["year_month","line"]))
    trend.head(12)


In [5]:
import plotly.express as px

# Barras por línea
fig_bar = px.bar(
    agg_line,
    x="line",
    y="passengers",
    title="Passengers by Line (sample of 3 CSVs)",
    text_auto=True
)
fig_bar.update_layout(xaxis_title="Line", yaxis_title="Passengers")
fig_bar.show()


In [6]:
# Línea temporal por línea (si existe trend)
if "year_month" in globals():
    fig_line = px.line(
        trend,
        x="year_month",
        y="passengers",
        color="line",
        markers=True,
        title="Monthly Passengers by Line (sample)"
    )
    fig_line.update_layout(xaxis_title="Year-Month", yaxis_title="Passengers")
    fig_line.show()


In [7]:
import os
os.makedirs("../data/processed", exist_ok=True)

agg_line.to_csv("../data/processed/agg_passengers_by_line.csv", index=False)
if "year_month" in mol.columns:
    trend.to_csv("../data/processed/trend_passengers_by_line_month.csv", index=False)

"Saved processed outputs to ../data/processed/"


'Saved processed outputs to ../data/processed/'