In [1]:
import pandas as pd
from pathlib import Path
from src.etl_functions import ingest_data, clean_data

In [2]:
# Working directory the notebook is running from.
notebook_dir = Path.cwd()

# The project root is taken to be the parent of that directory
# (i.e. the notebook is expected to live in the "notebooks" folder).
project_root = notebook_dir.parent

# Raw input file, addressed relative to the project root.
file_path = project_root.joinpath("data", "raw", "dirty_cafe_sales.csv")

# Load the raw CSV through the project's ingestion helper.
df_cafe_sales = ingest_data(file_path)

2026-02-04 13:15:24,991 - INFO - Ingestión exitosa. Dimensiones: (10000, 8)


In [3]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df_cafe_sales.info()

<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    10000 non-null  str    
 1   Item              9031 non-null   str    
 2   Quantity          9521 non-null   float64
 3   Price Per Unit    9467 non-null   float64
 4   Total Spent       9498 non-null   float64
 5   Payment Method    6822 non-null   str    
 6   Location          6039 non-null   str    
 7   Transaction Date  9540 non-null   str    
dtypes: float64(3), str(5)
memory usage: 625.1 KB
None


In [4]:
# Preview the first two rows of the raw frame (rendered as the cell's output).
df_cafe_sales.head(n=2)

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16


**Diagnóstico (Profiling) básico**

In [5]:
# Are there duplicated rows? Compare the row count with the duplicate count.
total_rows = len(df_cafe_sales)
duplicate_rows = df_cafe_sales.duplicated().sum()

print("Total rows: ", total_rows)
print("Duplicate Rows: ", duplicate_rows)


Total rows:  10000
Duplicate Rows:  0


In [6]:
# Null profile per column: absolute count and percentage of missing values.
# The boolean mask is computed once and reused for both metrics.
null_mask = df_cafe_sales.isnull()
null_data = null_mask.sum()
rate_null_data = (null_mask.mean() * 100).round(2)

# axis=1 places the two Series side by side as columns, labelled by `keys`.
df_nulls = pd.concat([null_data, rate_null_data], axis=1, keys=['total', 'rate'])

# sep="" stops print() from inserting a space right after the newline, which
# shifted the header row one character out of alignment with the data rows.
print("Nulls Summary:\n", df_nulls, sep="")


Nulls Summary:
                   total   rate
Transaction ID        0   0.00
Item                969   9.69
Quantity            479   4.79
Price Per Unit      533   5.33
Total Spent         502   5.02
Payment Method     3178  31.78
Location           3961  39.61
Transaction Date    460   4.60


In [7]:
# Show the current dtypes of the Quantity and Transaction Date columns
# before the cleaning step runs.
quantity_dtype = df_cafe_sales["Quantity"].dtype
transaction_date_dtype = df_cafe_sales["Transaction Date"].dtype

print(
    "Dtypes Check:\n"
    f"Transaction Quantity is currently: {quantity_dtype}\n"
    f"Transaction Date is currently: {transaction_date_dtype}"
)

Dtypes Check:
Transaction Quantity is currently: float64
Transaction Date is currently: str


**Type Casting & Normalization**

In [8]:
# Run the project's cleaning helper on the raw frame; the result is stored
# under its own name so the raw frame variable is not rebound.
df_cafe_sales_cleaned = clean_data(df_cafe_sales)

# DataFrame.info() prints its own report and returns None, so it is called
# directly: print(...info()) appended a stray "None" line to the output.
df_cafe_sales_cleaned.info()

2026-02-04 13:15:25,221 - INFO - Limpieza completada. Nulos en fechas: 460


<class 'pandas.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    10000 non-null  str           
 1   item              9031 non-null   str           
 2   quantity          9521 non-null   Int64         
 3   price_per_unit    9467 non-null   float64       
 4   total_spent       9498 non-null   float64       
 5   payment_method    6822 non-null   str           
 6   location          6039 non-null   str           
 7   transaction_date  9540 non-null   datetime64[us]
dtypes: Int64(1), datetime64[us](1), float64(2), str(4)
memory usage: 634.9 KB
None


In [10]:

# Leave the frame as the cell's last expression so Jupyter renders it as a
# single table; print() falls back to plain text that wraps the columns
# across two separate chunks.
df_cafe_sales_cleaned.head(5)

  transaction_id    item  quantity  price_per_unit  total_spent  \
0    TXN_1961373  Coffee         2             2.0          4.0   
1    TXN_4977031    Cake         4             3.0         12.0   
2    TXN_4271903  Cookie         4             1.0          NaN   
3    TXN_7034554   Salad         2             5.0         10.0   
4    TXN_3160411  Coffee         2             2.0          4.0   

   payment_method  location transaction_date  
0     Credit Card  Takeaway       2023-09-08  
1            Cash  In-Store       2023-05-16  
2     Credit Card  In-Store       2023-07-19  
3             NaN       NaN       2023-04-27  
4  Digital Wallet  In-Store       2023-06-11  
