In [61]:
# Load the source files (read from local disk)
import pandas as pd
from pathlib import Path

# Raw data directory
RAW = Path("data/raw")

# Processed data directory
PROCESSED = Path("data/processed")

# Read the CSV files
sales = pd.read_csv(RAW / "sales_train.csv")
test = pd.read_csv(RAW / "test.csv")
items = pd.read_csv(RAW / "items_en.csv")
shops = pd.read_csv(RAW / "shops_en.csv")
cats = pd.read_csv(RAW / "item_categories_en.csv")
sample = pd.read_csv(RAW / "sample_submission.csv")

# Report the shape of every table (labels kept exactly as before)
print("sales:", sales.shape)
print("test :", test.shape)
print("items:", items.shape)
print("shops:", shops.shape)
print("cats :", cats.shape)
print("sample :", sample.shape)

# Preview every table.
# A notebook cell only renders its LAST bare expression, so the earlier
# `.head()` calls used to be evaluated and thrown away. They now go through
# display(), which the IPython/Jupyter kernel makes available without import.
display(test.head())
display(items.head())
display(shops.head())
display(cats.head(20))
display(sample.head())
sales.head()

sales: (2935849, 6)
test : (214200, 3)
items: (22170, 3)
shops: (60, 2)
cats : (84, 2)
sample : (214200, 2)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [62]:
# Join the relevant lookup tables onto the sales records.
#
# These left joins are meant to add descriptive columns without changing the
# number of sales rows. validate="many_to_one" makes pandas raise a MergeError
# if a key is duplicated in the right-hand table, instead of silently
# multiplying sales rows.

df = (
    sales
    .merge(items, on="item_id", how="left", validate="many_to_one")
    .merge(shops, on="shop_id", how="left", validate="many_to_one")
    .merge(cats, on="item_category_id", how="left", validate="many_to_one")
)

print("df :", df.shape)
df.head()


df : (2935849, 10)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,02.01.2013,0,59,22154,999.0,1.0,Scene 2012 (BD),37,"Yaroslavl shopping center ""Altair""",Movies - Blu-Ray
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,"Moscow TRC ""Atrium""",Music - CD production firm


In [81]:
# Normalize column dtypes and derive the daily sales amount.
#
# Everything is done in one assign() call so the frame is rebuilt in a single
# step rather than mutated column by column. Entries are evaluated in order,
# so `sales` sees the already-converted price and quantity columns.
df = df.assign(
    # Numeric dtypes
    item_price=lambda d: d["item_price"].astype(float),
    item_cnt_day=lambda d: pd.to_numeric(d["item_cnt_day"]),
    # Source dates are day.month.year strings
    date=lambda d: pd.to_datetime(d["date"], format="%d.%m.%Y"),
    # Daily sales = quantity * price; cast to float once (the original
    # repeated the same cast in a second statement)
    sales=lambda d: (d["item_cnt_day"] * d["item_price"]).astype(float),
)

# Check the resulting frame
print("df :", df.shape)
df.head(5)


df : (2935849, 11)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name,sales
0,2013-01-02,0,59,22154,999.0,1.0,Scene 2012 (BD),37,"Yaroslavl shopping center ""Altair""",Movies - Blu-Ray,999.0
1,2013-01-03,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl,899.0
2,2013-01-05,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl,-899.0
3,2013-01-06,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,"Moscow TRC ""Atrium""",Music - Vinyl,1709.05
4,2013-01-15,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,"Moscow TRC ""Atrium""",Music - CD production firm,1099.0


In [118]:
# Control figures for later validation, aggregated per calendar year

# Calendar year of every row; used as the grouping key below
year = df["date"].dt.year

# NOTE(review): the key Series keeps the name "date", so the key column of the
# result is labelled `date` even though it holds the year.
total_general = df.groupby([year], as_index=False).agg(
    total_sales=("sales", "sum"),
    total_units=("item_cnt_day", "sum"),
    avg_price=("item_price", "mean"),
)

total_general.head()

Unnamed: 0,date,total_sales,total_units,avg_price
0,2013,1217525000.0,1562733.0,726.797801
1,2014,1346778000.0,1320889.0,988.344237
2,2015,834623100.0,764584.0,1062.324531


In [None]:
# Monthly aggregation: shop + item + category

# 1) Make sure `date` is datetime.
#    Use the same explicit day.month.year format as the earlier parsing step.
#    Without a format, strings like "02.01.2013" can be read month-first, and
#    errors="coerce" would turn unparseable rows into NaT, which the monthly
#    grouping then drops silently. Values that are already datetime are left
#    as they are.
df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")

# 2) Monthly aggregation (ME = month-end)
monthly = (
    df
    # Day-resolution helper column so distinct selling days can be counted
    # with the built-in "nunique" reducer instead of a per-group Python lambda.
    # It is only an aggregation input and does not appear in the result.
    .assign(_sale_day=lambda d: d["date"].dt.normalize())
    .groupby(
        [pd.Grouper(key="date", freq="ME"), "shop_id",
         "item_id", "item_name", "item_category_id"],
        as_index=False
    )
    .agg(
        monthly_sales=("sales", "sum"),
        monthly_units=("item_cnt_day", "sum"),
        avg_price=("item_price", "mean"),
        min_price=("item_price", "min"),
        max_price=("item_price", "max"),
        # Number of source rows in the group
        num_transactions=("item_cnt_day", "size"),
        # Number of distinct calendar days with at least one row
        active_days=("_sale_day", "nunique"),
    )
)

# 3) Add year and month (taken from the month-end date of each group)
monthly["year"] = monthly["date"].dt.year
monthly["month"] = monthly["date"].dt.month

# 4) Display-only copy with formatted strings; `monthly` keeps numeric values
monthly_fmt = monthly.copy()
monthly_fmt["monthly_sales"] = monthly_fmt["monthly_sales"].apply(lambda x: f"${x:,.2f}")
monthly_fmt["avg_price"] = monthly_fmt["avg_price"].apply(lambda x: f"${x:,.2f}")
monthly_fmt["monthly_units"] = monthly_fmt["monthly_units"].apply(lambda x: f"{int(x):,}")

monthly_fmt.head()

Unnamed: 0,date,shop_id,item_id,item_name,item_category_id,monthly_sales,monthly_units,avg_price,min_price,max_price,year,month
0,2013-01-31,0,32,1+1,40,"$1,326.00",6,$221.00,221.0,221.0,2013,1
1,2013-01-31,0,33,1+1 (BD),37,"$1,041.00",3,$347.00,347.0,347.0,2013,1
2,2013-01-31,0,35,10 YEARS LATER,40,$247.00,1,$247.00,247.0,247.0,2013,1
3,2013-01-31,0,43,EUR 100 million,40,$221.00,1,$221.00,221.0,221.0,2013,1
4,2013-01-31,0,51,100 best classical works (mp3-CD) (Digipack),57,$257.00,2,$128.50,127.0,130.0,2013,1


In [125]:
# Compare row counts before and after the monthly aggregation
print("Filas originales:", len(df))
print("Filas mensuales:", len(monthly))

# Full (rows, columns) shape of the aggregated frame
print("monthly :", monthly.shape)


Filas originales: 2935849
Filas mensuales: 1609124
monthly : (1609124, 10)


In [69]:
# Save the base dataset in CSV format.
# Create the output directory first: to_csv does not create missing parent
# directories, so the write would fail if data/processed does not exist yet.
PROCESSED.mkdir(parents=True, exist_ok=True)

df.to_csv(PROCESSED / "df_base.csv", index=False)