In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import os
from pathlib import Path

print("CWD (directorio actual):", os.getcwd())
print("CWD (Path):", Path.cwd())


CWD (directorio actual): c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system\notebooks
CWD (Path): c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system\notebooks


In [4]:
# ===============================
# 01. Configuración de rutas base
# ===============================

from pathlib import Path

# Detectar raíz del proyecto (un nivel arriba de /Notebooks)
PROJECT_ROOT = Path.cwd().parent

# Rutas del pipeline
RAW_PATH = PROJECT_ROOT / "Data" / "Raw" / "yellow_tripdata_2025-01.parquet"
PROCESSED_DIR = PROJECT_ROOT / "Data" / "processed"
REPORTS_DIR = PROJECT_ROOT / "reports"

# Crear directorios si no existen
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Logging de rutas
print("Rutas configuradas:")
print("PROJECT_ROOT  =", PROJECT_ROOT)
print("RAW_PATH      =", RAW_PATH)
print("PROCESSED_DIR =", PROCESSED_DIR)
print("REPORTS_DIR   =", REPORTS_DIR)

# Validación crítica del archivo raw
if not RAW_PATH.exists():
    raise FileNotFoundError(f" No existe el archivo raw en: {RAW_PATH}")

print(" Archivo raw encontrado. Continuando pipeline...")


Rutas configuradas:
PROJECT_ROOT  = c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system
RAW_PATH      = c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system\Data\Raw\yellow_tripdata_2025-01.parquet
PROCESSED_DIR = c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system\Data\processed
REPORTS_DIR   = c:\Users\ADMIN\Desktop\end_to_end_customer_risk_system\reports
 Archivo raw encontrado. Continuando pipeline...


In [5]:
cols = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "total_amount",
    "passenger_count",
    "payment_type",
]

df = pd.read_parquet(RAW_PATH, columns=cols)

print("Dataset cargado ")
print("Shape:", df.shape)
df.head()

Dataset cargado 
Shape: (3475226, 7)


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount,total_amount,passenger_count,payment_type
0,2025-01-01 00:18:38,2025-01-01 00:26:59,1.6,10.0,18.0,1.0,1
1,2025-01-01 00:32:40,2025-01-01 00:35:13,0.5,5.1,12.12,1.0,1
2,2025-01-01 00:44:04,2025-01-01 00:46:01,0.6,5.1,12.1,1.0,1
3,2025-01-01 00:14:27,2025-01-01 00:20:01,0.52,7.2,9.7,3.0,2
4,2025-01-01 00:21:34,2025-01-01 00:25:06,0.66,5.8,8.3,3.0,2


In [6]:
print("Tipos de datos:")
display(df.dtypes)

print("\nNulos por columna:")
display(df.isna().sum())

print("\nFilas duplicadas:", df.duplicated().sum())



Tipos de datos:


tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
trip_distance                   float64
fare_amount                     float64
total_amount                    float64
passenger_count                 float64
payment_type                      int64
dtype: object


Nulos por columna:


tpep_pickup_datetime          0
tpep_dropoff_datetime         0
trip_distance                 0
fare_amount                   0
total_amount                  0
passenger_count          540149
payment_type                  0
dtype: int64


Filas duplicadas: 0


In [7]:
critical = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "trip_distance", "total_amount"]
df = df.dropna(subset=critical).copy()

print("Después de dropna críticos ")
print("Shape:", df.shape)


Después de dropna críticos 
Shape: (3475226, 7)


In [8]:
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"], errors="coerce")
df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"], errors="coerce")

df = df.dropna(subset=["tpep_pickup_datetime", "tpep_dropoff_datetime"]).copy()

print("Fechas validadas ")
print(df[["tpep_pickup_datetime","tpep_dropoff_datetime"]].head())


df["trip_duration_min"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60.0

df["pickup_hour"] = df["tpep_pickup_datetime"].dt.hour
df["pickup_dayofweek"] = df["tpep_pickup_datetime"].dt.dayofweek  # 0=Lunes ... 6=Domingo
df["is_weekend"] = df["pickup_dayofweek"].isin([5, 6]).astype(int)

df["time_bucket"] = pd.cut(
    df["pickup_hour"],
    bins=[-1, 5, 11, 17, 23],
    labels=["madrugada", "mañana", "tarde", "noche"]
).astype(str)

df[["trip_duration_min","pickup_hour","pickup_dayofweek","is_weekend","time_bucket"]].head()


Fechas validadas 
  tpep_pickup_datetime tpep_dropoff_datetime
0  2025-01-01 00:18:38   2025-01-01 00:26:59
1  2025-01-01 00:32:40   2025-01-01 00:35:13
2  2025-01-01 00:44:04   2025-01-01 00:46:01
3  2025-01-01 00:14:27   2025-01-01 00:20:01
4  2025-01-01 00:21:34   2025-01-01 00:25:06


Unnamed: 0,trip_duration_min,pickup_hour,pickup_dayofweek,is_weekend,time_bucket
0,8.35,0,2,0,madrugada
1,2.55,0,2,0,madrugada
2,1.95,0,2,0,madrugada
3,5.566667,0,2,0,madrugada
4,3.533333,0,2,0,madrugada


In [9]:
# Reglas conservadoras para evitar basura:
# - Distancia >= 0
# - total_amount y fare_amount deben ser > 0 (o al menos no negativos)
# - duración debe ser > 0 (o al menos no negativa)
df = df[
    (df["trip_distance"] >= 0) &
    (df["total_amount"] > 0) &
    (df["fare_amount"] >= 0) &
    (df["trip_duration_min"] > 0)
].copy()

print("Después de reglas lógicas ")
print("Shape:", df.shape)



Después de reglas lógicas 
Shape: (3328229, 12)


In [10]:
# Evitar divisiones por 0 con np.where
df["cost_per_km"] = np.where(df["trip_distance"] > 0, df["total_amount"] / df["trip_distance"], np.nan)
df["cost_per_min"] = np.where(df["trip_duration_min"] > 0, df["total_amount"] / df["trip_duration_min"], np.nan)

# Proporción tarifa base vs total (sirve para entender recargos, fees, etc.)
df["fare_to_total_ratio"] = np.where(df["total_amount"] > 0, df["fare_amount"] / df["total_amount"], np.nan)

df[["trip_distance","trip_duration_min","fare_amount","total_amount","cost_per_km","cost_per_min","fare_to_total_ratio"]].head()


Unnamed: 0,trip_distance,trip_duration_min,fare_amount,total_amount,cost_per_km,cost_per_min,fare_to_total_ratio
0,1.6,8.35,10.0,18.0,11.25,2.155689,0.555556
1,0.5,2.55,5.1,12.12,24.24,4.752941,0.420792
2,0.6,1.95,5.1,12.1,20.166667,6.205128,0.421488
3,0.52,5.566667,7.2,9.7,18.653846,1.742515,0.742268
4,0.66,3.533333,5.8,8.3,12.575758,2.349057,0.698795


In [12]:
# Definimos umbrales razonables (conservadores).
# Nota: Ajustables luego según percentiles.
df["flag_zero_distance"] = (df["trip_distance"] == 0).astype(int)
df["flag_very_short_trip"] = (df["trip_distance"] < 0.5).astype(int)
df["flag_very_long_trip"] = (df["trip_distance"] > 30).astype(int)

df["flag_long_duration"] = (df["trip_duration_min"] > 120).astype(int)  # > 2 horas
df["flag_short_duration"] = (df["trip_duration_min"] < 2).astype(int)   # < 2 min

# Caso sospechoso típico: muy corto pero costoso
df["flag_short_but_expensive"] = ((df["trip_distance"] < 1) & (df["total_amount"] > 50)).astype(int)

# Pagos: 1 suele ser tarjeta, 2 cash (depende del diccionario TLC, pero ayuda como proxy)
df["flag_cash_payment"] = (df["payment_type"] == 2).astype(int)

df[
    ["trip_distance","trip_duration_min","total_amount",
     "flag_zero_distance","flag_very_short_trip","flag_very_long_trip",
     "flag_long_duration","flag_short_duration","flag_short_but_expensive","flag_cash_payment"]
].head()


Unnamed: 0,trip_distance,trip_duration_min,total_amount,flag_zero_distance,flag_very_short_trip,flag_very_long_trip,flag_long_duration,flag_short_duration,flag_short_but_expensive,flag_cash_payment
0,1.6,8.35,18.0,0,0,0,0,0,0,0
1,0.5,2.55,12.12,0,0,0,0,0,0,0
2,0.6,1.95,12.1,0,0,0,0,1,0,0
3,0.52,5.566667,9.7,0,0,0,0,0,0,1
4,0.66,3.533333,8.3,0,0,0,0,0,0,1
