In [None]:
## Contexto
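Este notebook explora los datasets base habilitados (`users`, `products`, `events`, `transaction_items`,
`propension_compra` y `uplift_marketing`) con el objetivo de validar su calidad, consistencia y potencial de uso
para el modelado. Se revisan dimensiones, tipos de datos, valores nulos e integridad referencial de las transacciones.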


In [2]:
import pandas as pd
import numpy as np

# Rendering limits for DataFrames: no cap on columns, at most 200 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [4]:
# Raw CSV inputs, listed in the same order as the names they are bound to below.
_raw_paths = (
    "../data/raw/users.csv",
    "../data/raw/products.csv",
    "../data/raw/events.csv",
    "../data/raw/transaction_items.csv",
    "../data/raw/propension_compra.csv",
    "../data/raw/uplift_marketing.csv",
)

# Each file is read with pandas' default CSV settings; one frame per path.
users, products, events, transactions, propension, uplift = map(pd.read_csv, _raw_paths)


In [5]:
# Registry of every loaded frame, keyed by the short label used in the printed headers.
datasets = dict(
    users=users,
    products=products,
    events=events,
    transactions=transactions,
    propension=propension,
    uplift=uplift,
)

# First look at each dataset: its (rows, columns) shape followed by the first five rows.
for name in datasets:
    df = datasets[name]
    print(f"\n{name.upper()}")
    print("Shape:", df.shape)
    display(df.head())



USERS
Shape: (50000, 4)


Unnamed: 0,user_id,registration_date,country,segment
0,1,2023-11-02,US,returning
1,2,2022-10-14,ES,vip
2,3,2023-09-21,US,returning
3,4,2022-05-31,US,vip
4,5,2023-09-29,AR,returning



PRODUCTS
Shape: (5000, 4)


Unnamed: 0,product_id,category,price,vendor_id
0,1,electro,223.48,415
1,2,hogar,94.72,315
2,3,electro,14.6,208
3,4,moda,155.22,256
4,5,moda,49.12,308



EVENTS
Shape: (500000, 6)


Unnamed: 0,event_id,user_id,event_type,product_id,timestamp,revenue
0,1,12606,product_view,1840,2024-12-16 16:24:00,0.0
1,2,30164,product_view,2589,2024-04-28 06:46:00,0.0
2,3,15219,product_view,954,2024-06-25 07:41:00,0.0
3,4,44930,product_view,2381,2024-01-30 02:19:00,0.0
4,5,25657,product_view,686,2024-03-23 21:21:00,0.0



TRANSACTIONS
Shape: (120000, 4)


Unnamed: 0,transaction_id,user_id,product_id,quantity
0,36197,21258,800,4
1,30229,7873,2998,4
2,33857,1090,3669,2
3,37328,23646,2090,1
4,27485,31497,3900,3



PROPENSION
Shape: (100000, 10)


Unnamed: 0,user_id,edad,sesiones_30d,categoria_frecuente,gasto_historico,tiempo_promedio_sesion,descuento_usado,productos_vistos,abandono_carrito,compra
0,1,56,4,hogar,123.59,656.65,0,18,1,0
1,2,69,7,moda,84.69,642.55,2,25,1,0
2,3,46,7,electronica,595.72,240.46,1,15,0,0
3,4,32,4,hogar,203.28,286.44,0,20,1,0
4,5,60,4,hogar,271.37,307.05,4,15,1,0



UPLIFT
Shape: (50000, 7)


Unnamed: 0,user_id,grupo,prob_compra_previa,descuento_ofrecido,engagement_score,historial_compras,compra
0,1,control,0.105,5,0.335,4,0
1,2,control,0.371,15,0.714,9,0
2,3,tratamiento,0.226,10,0.963,1,0
3,4,tratamiento,0.068,5,0.346,1,0
4,5,tratamiento,0.073,10,0.486,3,1


In [6]:
# For each dataset, show a two-column table: column name and the dtype pandas assigned to it.
for name, df in datasets.items():
    print(f"\n{name.upper()} COLUMNS")
    # df.dtypes is a Series indexed by column name; turn the index into a "column" field.
    schema = df.dtypes.rename_axis("column").reset_index(name="dtype")
    display(schema)



USERS COLUMNS


Unnamed: 0,column,dtype
0,user_id,int64
1,registration_date,object
2,country,object
3,segment,object



PRODUCTS COLUMNS


Unnamed: 0,column,dtype
0,product_id,int64
1,category,object
2,price,float64
3,vendor_id,int64



EVENTS COLUMNS


Unnamed: 0,column,dtype
0,event_id,int64
1,user_id,int64
2,event_type,object
3,product_id,int64
4,timestamp,object
5,revenue,float64



TRANSACTIONS COLUMNS


Unnamed: 0,column,dtype
0,transaction_id,int64
1,user_id,int64
2,product_id,int64
3,quantity,int64



PROPENSION COLUMNS


Unnamed: 0,column,dtype
0,user_id,int64
1,edad,int64
2,sesiones_30d,int64
3,categoria_frecuente,object
4,gasto_historico,float64
5,tiempo_promedio_sesion,float64
6,descuento_usado,int64
7,productos_vistos,int64
8,abandono_carrito,int64
9,compra,int64



UPLIFT COLUMNS


Unnamed: 0,column,dtype
0,user_id,int64
1,grupo,object
2,prob_compra_previa,float64
3,descuento_ofrecido,int64
4,engagement_score,float64
5,historial_compras,int64
6,compra,int64


In [7]:
# For each dataset, show the percentage of null entries per column, highest first.
for name, df in datasets.items():
    print(f"\n{name.upper()} - Missing values (%)")
    # The mean of the boolean null mask is the null fraction; scale it to a percentage.
    null_pct = df.isna().mean().mul(100)
    display(null_pct.sort_values(ascending=False))



USERS - Missing values (%)


user_id              0.0
registration_date    0.0
country              0.0
segment              0.0
dtype: float64


PRODUCTS - Missing values (%)


product_id    0.0
category      0.0
price         0.0
vendor_id     0.0
dtype: float64


EVENTS - Missing values (%)


event_id      0.0
user_id       0.0
event_type    0.0
product_id    0.0
timestamp     0.0
revenue       0.0
dtype: float64


TRANSACTIONS - Missing values (%)


transaction_id    0.0
user_id           0.0
product_id        0.0
quantity          0.0
dtype: float64


PROPENSION - Missing values (%)


user_id                   0.0
edad                      0.0
sesiones_30d              0.0
categoria_frecuente       0.0
gasto_historico           0.0
tiempo_promedio_sesion    0.0
descuento_usado           0.0
productos_vistos          0.0
abandono_carrito          0.0
compra                    0.0
dtype: float64


UPLIFT - Missing values (%)


user_id               0.0
grupo                 0.0
prob_compra_previa    0.0
descuento_ofrecido    0.0
engagement_score      0.0
historial_compras     0.0
compra                0.0
dtype: float64

In [8]:
# Referential check: fraction of transaction rows whose user_id / product_id
# also appears in the users / products tables (1.0 means every row matches).
known_users = transactions["user_id"].isin(users["user_id"])
known_products = transactions["product_id"].isin(products["product_id"])

print("Transactions users in users table:", known_users.mean())
print("Transactions products in products table:", known_products.mean())


Transactions users in users table: 1.0
Transactions products in products table: 1.0


In [None]:
## Observaciones iniciales

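- Ninguno de los seis datasets presenta valores nulos (0 % en todas las columnas).
- Integridad referencial: el 100 % de los `user_id` y `product_id` de `transactions` existe en `users` y `products`, respectivamente.
- `registration_date` (users) y `timestamp` (events) se cargan como `object`; conviene convertirlas a `datetime` antes de cualquier análisis temporal.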
