In [1]:
import numpy as np 
import pandas as pd 

import scipy.sparse as sparse
from datetime import timedelta
from IPython.display import display

import os

In [2]:
# LOAD TRANSACTIONS
# Only the three columns needed downstream are read (column selection via
# `usecols`; no explicit dtypes are set here).
cols = ['t_dat','customer_id','article_id']
df = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv', usecols=cols)
# Parse the date column into real datetimes so it can be compared to Timestamps
df['t_dat'] = pd.to_datetime(df['t_dat'])

# TIME WINDOWS
# Fashion moves fast, so only recent trends matter: everything is anchored on
# the last transaction date found in the file.
max_date = df['t_dat'].max()
# Lower bound (exclusive) of the 5-week "recent" window
start_date = max_date - pd.Timedelta(weeks=5)
# Boundary between train and validation: the final 7 days are held out
split_date = max_date - pd.Timedelta(days=7)

# RECENT TRANSACTIONS
# Rows strictly after start_date
is_recent = df['t_dat'].gt(start_date)
df_recent = df.loc[is_recent]

# TRAIN / VALIDATION SPLIT
# Train set: recent rows up to and including split_date
in_train_window = df_recent['t_dat'].le(split_date)
train_set = df_recent.loc[in_train_window]

# Validation set: recent rows strictly after split_date (the last 7 days)
in_val_window = df_recent['t_dat'].gt(split_date)
val_set = df_recent.loc[in_val_window]

In [3]:
# 1. Preview each DataFrame: its dimensions followed by its first rows.
#    The banner strings are kept exactly as they are printed (the leading "\n"
#    on the 2nd and 3rd banners inserts a blank line between sections).
previews = [
    ("--- APERÇU DF_RECENT ---", df_recent),
    ("\n--- APERÇU TRAIN_SET ---", train_set),
    ("\n--- APERÇU VAL_SET ---", val_set),
]
for banner, frame in previews:
    print(banner)
    print(f"Dimensions (lignes, colonnes): {frame.shape}")
    display(frame.head())

# 2. Count the distinct customers present in df_recent
nb_unique_customers = df_recent['customer_id'].nunique()

separator = "-" * 30
print(separator)
print(f"Nombre de clients uniques dans df_recent : {nb_unique_customers}")
print(separator)

--- APERÇU DF_RECENT ---
Dimensions (lignes, colonnes): (1300034, 3)


Unnamed: 0,t_dat,customer_id,article_id
30488290,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488291,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488292,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,926921001
30488293,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001
30488294,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001



--- APERÇU TRAIN_SET ---
Dimensions (lignes, colonnes): (1059723, 3)


Unnamed: 0,t_dat,customer_id,article_id
30488290,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488291,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488292,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,926921001
30488293,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001
30488294,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001



--- APERÇU VAL_SET ---
Dimensions (lignes, colonnes): (240311, 3)


Unnamed: 0,t_dat,customer_id,article_id
31548013,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,786022008
31548014,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,913272003
31548015,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,889669006
31548016,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,237347060
31548017,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,562245001


------------------------------
Nombre de clients uniques dans df_recent : 273166
------------------------------


In [4]:
# LOAD ARTICLES
# Keep only the article attributes used for recommendation
articles_cols_to_keep = ['article_id', 'product_type_name','product_group_name','colour_group_name','index_group_name']
df_articles = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv', usecols=articles_cols_to_keep)

# Attach article attributes to every training transaction.
# how='left' keeps all rows of train_set even when an article has no metadata.
# validate='many_to_one' makes pandas raise a MergeError if article_id is
# duplicated in df_articles, instead of silently multiplying transaction rows.
train_enriched = train_set.merge(
    df_articles,
    on='article_id',
    how='left',
    validate='many_to_one',
)

print(train_enriched.head())

       t_dat                                        customer_id  article_id  \
0 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
1 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
2 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   926921001   
3 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   
4 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   

  product_type_name  product_group_name colour_group_name index_group_name  
0             Skirt  Garment Lower body             Black          Divided  
1             Skirt  Garment Lower body             Black          Divided  
2          Trousers  Garment Lower body         Dark Grey          Divided  
3            Jacket  Garment Upper body             Black          Divided  
4            Jacket  Garment Upper body             Black          Divided  


In [5]:
# LOAD CUSTOMERS
# Keep only the customer attributes used for recommendation
cust_cols_to_keep = ['customer_id', 'age']
df_customers = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv', usecols=cust_cols_to_keep)

# Attach customer attributes to the enriched training transactions.
# This cell reassigns train_enriched from itself, so on a second execution the
# frame would already contain 'age' and the merge would emit age_x / age_y
# (making train_enriched['age'] below fail). Dropping any previously merged
# customer attribute first makes the cell safe to re-run.
customer_attrs = [c for c in cust_cols_to_keep if c != 'customer_id']
train_enriched = (
    train_enriched
    .drop(columns=customer_attrs, errors='ignore')
    # validate='many_to_one' raises a MergeError if customer_id is duplicated
    # in df_customers, instead of silently multiplying transaction rows.
    .merge(df_customers, on='customer_id', how='left', validate='many_to_one')
)

# Missing ages are imputed with the mean age over all rows of df_customers
# (i.e. every customer in the file, not only those present in the train set).
mean_age = df_customers['age'].mean()
train_enriched['age'] = train_enriched['age'].fillna(mean_age)

print(train_enriched.head())
# Per-column count of remaining missing values
print(train_enriched.isna().sum())


       t_dat                                        customer_id  article_id  \
0 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
1 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
2 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   926921001   
3 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   
4 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   

  product_type_name  product_group_name colour_group_name index_group_name  \
0             Skirt  Garment Lower body             Black          Divided   
1             Skirt  Garment Lower body             Black          Divided   
2          Trousers  Garment Lower body         Dark Grey          Divided   
3            Jacket  Garment Upper body             Black          Divided   
4            Jacket  Garment Upper body             Black          Divided   

    age  
0  34.0  
1  34.0  
2  34.0  
3  34.0  
4  34.