In [1]:
# notebooks/test_final_dataset.ipynb

import polars as pl
import matplotlib.pyplot as plt

# Cargar dataset generado
df = pl.read_parquet("../output/final_dataset.parquet")

# Mostrar esquema y primeras filas
print("📊 Esquema del dataset final:")
print(df.schema)

print("\n🔍 Muestras:")
print(df.head(10))

# Validar que todos los días estén en la última semana
ultima_semana = df.select("week").unique()
print(f"\n✅ Semana(s) en el dataset: {ultima_semana}")

# Estadísticas básicas por columnas agregadas
print("\n📈 Estadísticas de columnas agregadas:")
print(df.select([
    pl.col("prints_prev3w").sum().alias("Total prints anteriores"),
    pl.col("taps_prev3w").sum().alias("Total taps anteriores"),
    pl.col("pays_prev3w").sum().alias("Total pagos anteriores"),
    pl.col("total_amount_prev3w").sum().alias("Total gastado"),
    pl.col("clicked").sum().alias("Total de clics en última semana")
]))

# Validar distribución de clicks
df_hist = df.select(["clicked"]).to_pandas()
df_hist["clicked"].value_counts().plot(kind="bar", title="Distribución de clics")
plt.show()


📊 Esquema del dataset final:
Schema({'day': Date, 'user_id': Int64, 'position': Int64, 'value_prop': String, 'timestamp': Date, 'week': Int8, 'clicked': Int32, 'prints_prev3w': Int64, 'taps_prev3w': Int64, 'pays_prev3w': Int64, 'total_amount_prev3w': Float64})

🔍 Muestras:
shape: (10, 11)
┌────────────┬─────────┬──────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ day        ┆ user_id ┆ position ┆ value_prop ┆ … ┆ prints_pr ┆ taps_prev ┆ pays_prev ┆ total_amo │
│ ---        ┆ ---     ┆ ---      ┆ ---        ┆   ┆ ev3w      ┆ 3w        ┆ 3w        ┆ unt_prev3 │
│ date       ┆ i64     ┆ i64      ┆ str        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ w         │
│            ┆         ┆          ┆            ┆   ┆ i64       ┆ i64       ┆ i64       ┆ ---       │
│            ┆         ┆          ┆            ┆   ┆           ┆           ┆           ┆ f64       │
╞════════════╪═════════╪══════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 2

ModuleNotFoundError: No module named 'pyarrow'

In [2]:
df

day,user_id,position,value_prop,timestamp,week,clicked,prints_prev3w,taps_prev3w,pays_prev3w,total_amount_prev3w
date,i64,i64,str,date,i8,i32,i64,i64,i64,f64
2020-11-30,90211,1,"""point""",2020-11-30,49,0,0,0,2,59.68
2020-11-30,38905,2,"""send_money""",2020-11-30,49,0,0,0,0,0.0
2020-11-30,52473,3,"""credits_consumer""",2020-11-30,49,1,0,0,2,126.51
2020-11-30,50712,0,"""point""",2020-11-30,49,0,0,0,0,0.0
2020-11-30,32071,3,"""point""",2020-11-30,49,0,0,0,0,0.0
…,…,…,…,…,…,…,…,…,…,…
2020-11-30,41810,2,"""prepaid""",2020-11-30,49,0,0,0,0,0.0
2020-11-30,83389,0,"""send_money""",2020-11-30,49,0,0,0,1,99.24
2020-11-30,75902,2,"""credits_consumer""",2020-11-30,49,0,0,0,2,85.82
2020-11-30,21802,0,"""cellphone_recharge""",2020-11-30,49,0,0,0,1,37.36
