In [35]:
# Descarga de archivos
import pandas as pd
from pathlib import Path

RAW = Path("data/raw")

sales = pd.read_csv(RAW / "sales_train.csv")
test  = pd.read_csv(RAW / "test.csv")
items = pd.read_csv(RAW / "items_en.csv")
shops = pd.read_csv(RAW / "shops_en.csv")
cats  = pd.read_csv(RAW / "item_categories_en.csv")
sample  = pd.read_csv(RAW / "sample_submission.csv")

print("sales:", sales.shape)
print("test :", test.shape)
print("items:", items.shape)
print("shops:", shops.shape)
print("cats :", cats.shape)
print("sample :", sample.shape)


test.head()
items.head()
shops.head()
cats.head(20)
sample.head()
sales.head()

sales: (2935849, 6)
test : (214200, 3)
items: (22170, 3)
shops: (60, 2)
cats : (84, 2)
sample : (214200, 2)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [47]:
# unimos las tablas relevantes:

df = (
    sales
    .merge(items, on="item_id", how="left")
    .merge(shops, on="shop_id", how="left")
    .merge(cats, on="item_category_id", how="left")
)

print("df :", df.shape)
df.head()


df : (2935849, 10)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,02.01.2013,0,59,22154,999.0,1.0,Scene 2012 (BD),37,"Yaroslavl shopping center ""Altair""",Movies - Blu-Ray
1,03.01.2013,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
2,05.01.2013,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
3,06.01.2013,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
4,15.01.2013,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,"Moscow TRC ""Atrium""",Music - CD production firm


In [52]:
# validamos tipos de datos
df["item_price"] = df["item_price"].astype(float)
df["item_cnt_day"] = pd.to_numeric(df["item_cnt_day"])

# Ajustamos tipo de datos de fecha
df["date"] = pd.to_datetime(df["date"], format="%d.%m.%Y")
df.head(5)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name
0,2013-01-02,0,59,22154,999.0,1.0,Scene 2012 (BD),37,"Yaroslavl shopping center ""Altair""",Movies - Blu-Ray
1,2013-01-03,0,25,2552,899.0,1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
2,2013-01-05,0,25,2552,899.0,-1.0,DEEP PURPLE The House Of Blue Light LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
3,2013-01-06,0,25,2554,1709.05,1.0,DEEP PURPLE Who Do You Think We Are LP,58,"Moscow TRC ""Atrium""",Music - Vinyl
4,2013-01-15,0,25,2555,1099.0,1.0,DEEP PURPLE 30 Very Best Of 2CD (Фирм.),56,"Moscow TRC ""Atrium""",Music - CD production firm


In [55]:
# calculamos ventas diarias (Sales) cantidas * precio

df["sales"] = df["item_cnt_day"] * df["item_price"]

#Validados tipo de datos numericos
df["sales"] = df["sales"].astype(float)
df.head(5)
print("df :", df.shape)

df : (2935849, 11)


df : (2935849, 11)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,item_name,item_category_id,shop_name,item_category_name,sales
2935844,2015-10-10,33,25,7409,299.0,1.0,V/A Nu Jazz Selection (digipack),55,"Moscow TRC ""Atrium""",Music - CD of local production,299.0
2935845,2015-10-09,33,25,7460,299.0,1.0,V/A The Golden Jazz Collection 1 2CD,55,"Moscow TRC ""Atrium""",Music - CD of local production,299.0
2935846,2015-10-14,33,25,7459,349.0,1.0,V/A The Best Of The 3 Tenors,55,"Moscow TRC ""Atrium""",Music - CD of local production,349.0
2935847,2015-10-22,33,25,7440,299.0,1.0,V/A Relax Collection Planet MP3 (mp3-CD) (jewel),57,"Moscow TRC ""Atrium""",Music - MP3,299.0
2935848,2015-10-03,33,25,7460,299.0,1.0,V/A The Golden Jazz Collection 1 2CD,55,"Moscow TRC ""Atrium""",Music - CD of local production,299.0


In [57]:
# Guardamos el dataset en formato parquet

df.to_csv(PROCESSED / "df_base.csv", index=False)