Instruccions per processar l'arxiu train.csv per entrenar l'algoritme

In [None]:
import pandas as pd
# Base name (without extension) of the CSV file to load from the data directory.
file = "train"

# Read the file using ';' as the field separator.
csv_path = f"./../data/{file}.csv"
df = pd.read_csv(csv_path, sep=";")

# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Keep only the columns that will be used for training.
training_columns = [
    "ID",
    "image_embedding",
    "category",
    "aggregated_family",
    "length_type",
    "family",
    "color_rgb",
    "archetype",
    "moment",
    "life_cycle_length",
    "fabric",
    "phase_in",
    "num_week_iso",
    "num_stores",
    "num_sizes",
    "price",
    "year",
    "weekly_demand",
]
df = df[training_columns]

# Preview the first 100 rows of the reduced frame.
df.head(100)

In [None]:
# Derive a new "week_iso" column from phase_in: for every row it holds the
# week of the year that the row refers to.

# Parse phase_in as day/month/year dates.
df["phase_in"] = pd.to_datetime(df["phase_in"], format="%d/%m/%Y")

# ISO week of the phase_in date.
launch_week = df["phase_in"].dt.isocalendar().week

# Each row's value is the week the item went on sale plus the number of weeks
# it has been on sale, wrapped with a modulo of 52.
# NOTE(review): the modulo yields values in 0..51, so a result of 52 is stored
# as 0 -- confirm this matches how the test data is preprocessed.
df["week_iso"] = (launch_week + df["num_week_iso"] - 1) % 52

df.head(100)

In [None]:
# Turn the color_rgb column into three new columns: R, G and B.

# Split the comma-separated string into one column per channel.
rgb_parts = df["color_rgb"].str.split(",", expand=True)

# The pieces are strings at this point, so cast them to integers before
# storing them as the R, G and B columns.
df[["R", "G", "B"]] = rgb_parts.astype(int)

In [None]:
# Remove the columns that are no longer needed: color_rgb and phase_in have
# already been expanded into R/G/B and week_iso respectively.
obsolete_columns = ["color_rgb", "life_cycle_length", "phase_in"]
df = df.drop(columns=obsolete_columns)

df.head()

In [None]:
# Apply one-hot encoding to the categorical columns.

def oneHot(df, column_index):
    """Return a copy of ``df`` extended with one-hot columns for one column.

    Args:
        df: DataFrame containing the column to encode.
        column_index: Position of the column to encode within ``df.columns``.

    Returns:
        A new DataFrame holding every original column (the encoded column is
        kept) followed by one 0/1 integer column per distinct value, each
        named ``<column name>_<value>``.
    """
    target = df.columns[column_index]
    indicators = pd.get_dummies(df[target], prefix=target)
    # get_dummies may return booleans; store the indicators as integers.
    indicators = indicators.astype(int)
    return pd.concat([df, indicators], axis=1)

# Categorical columns to expand, listed by name.  The previously hard-coded
# positions 1-7 pointed at "image_embedding" through "moment" at this stage of
# the notebook, so the raw embedding strings were one-hot encoded while
# "fabric" was dropped later without ever being encoded.
categorical_columns = [
    "category",
    "aggregated_family",
    "length_type",
    "family",
    "archetype",
    "moment",
    "fabric",
]

# Resolve the names to positions once; oneHot only appends columns, so the
# positions of the original columns stay valid across iterations.
column_index_list = [df.columns.get_loc(name) for name in categorical_columns]

# Apply one-hot encoding to each categorical column.
for i in column_index_list:
    df = oneHot(df, i)

df.head(20)

In [None]:
# Show the columns of the DataFrame at this point. Keep this list in mind when
# processing the test data: if fewer columns are created there, the missing
# ones have to be added with value 0.
list(df.columns)

In [None]:
# Remove the original categorical columns.
categorical_originals = [
    "aggregated_family",
    "family",
    "archetype",
    "moment",
    "fabric",
    "length_type",
    "category",
]
df = df.drop(columns=categorical_originals)

df.head()


In [None]:
# Process the image_embedding column: one new column (f0, f1, ...) is created
# for each dimension of the embedding.

# Parse every comma-separated string into a list of floats.
parsed_embeddings = df["image_embedding"].str.split(",").apply(
    lambda values: [float(value) for value in values]
)

# The number of dimensions is taken from the first row; .iloc is positional,
# so this does not depend on the index containing the label 0.
n_dims = len(parsed_embeddings.iloc[0])

# Reuse df's index so that the concat below pairs each embedding with its own
# row even when df does not carry a default 0..n-1 index.
embedding_df = pd.DataFrame(
    parsed_embeddings.tolist(),
    index=df.index,
    columns=[f"f{i}" for i in range(n_dims)],
)

# Replace the raw string column with the expanded per-dimension columns.
df = pd.concat([df.drop(columns=["image_embedding"]), embedding_df], axis=1)

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Save the processed DataFrame to a new CSV file, without writing the index.
output_path = "./../data/train-parsed.csv"
df.to_csv(output_path, index=False)