In [3]:
import numpy as np 
import pandas as pd 

import scipy.sparse as sparse
from datetime import timedelta
from IPython.display import display

import os

In [4]:
# OPTIMISED LOADING OF THE TRANSACTIONS
# Only the columns listed in `cols` are read from the CSV.
cols = ['t_dat','customer_id','article_id']
df = pd.read_csv(
    r'C:\Users\hasal\Downloads\Advanced ML\h-and-m-personalized-fashion-recommendations\transactions_train.csv',
    usecols=cols,
)
# Turn the date column into a real datetime column.
df['t_dat'] = pd.to_datetime(df['t_dat'])

# TIME FILTER
# Keep only the transactions of the last 5 weeks: fashion changes quickly,
# so only recent trends are of interest.
# Last date present in the file.
max_date = df['t_dat'].max()
# Window start (5 weeks before that last date); the bound itself is excluded.
start_date = max_date - pd.Timedelta(weeks=5)
# Keep only the rows strictly after the window start.
df_recent = df.loc[df['t_dat'].gt(start_date)]

# VALIDATION SET CREATION
# The last 7 days are cut off and kept aside for testing.
split_date = max_date - pd.Timedelta(days=7)

# Train set: every recent row up to and including the split date.
train_set = df_recent.loc[df_recent['t_dat'].le(split_date)]

# Validation set: every recent row strictly after the split date (the last 7 days).
val_set = df_recent.loc[df_recent['t_dat'].gt(split_date)]

In [5]:
# 1. Preview each DataFrame: its dimensions followed by its first 5 rows.
#    The banner strings carry their own leading newline so the printed
#    output keeps a blank line between consecutive previews.
previews = [
    ("--- APERÇU DF_RECENT ---", df_recent),
    ("\n--- APERÇU TRAIN_SET ---", train_set),
    ("\n--- APERÇU VAL_SET ---", val_set),
]
for banner, frame in previews:
    print(banner)
    print(f"Dimensions (lignes, colonnes): {frame.shape}")
    display(frame.head())

# 2. Count the distinct customers and articles present in train_set.
nb_unique_customers = train_set['customer_id'].nunique()
nb_unique_articles = train_set['article_id'].nunique()

separator = "-" * 30
print(separator)
print(f"Nombre de clients uniques dans train_set : {nb_unique_customers}")
print(f"Nombre d'articles uniques dans train_set : {nb_unique_articles}")
print(separator)

--- APERÇU DF_RECENT ---
Dimensions (lignes, colonnes): (1300034, 3)


Unnamed: 0,t_dat,customer_id,article_id
30488290,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488291,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488292,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,926921001
30488293,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001
30488294,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001



--- APERÇU TRAIN_SET ---
Dimensions (lignes, colonnes): (1059723, 3)


Unnamed: 0,t_dat,customer_id,article_id
30488290,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488291,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,784053005
30488292,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,926921001
30488293,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001
30488294,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,868038001



--- APERÇU VAL_SET ---
Dimensions (lignes, colonnes): (240311, 3)


Unnamed: 0,t_dat,customer_id,article_id
31548013,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,786022008
31548014,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,913272003
31548015,2020-09-16,000fb6e772c5d0023892065e659963da90b1866035558e...,889669006
31548016,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,237347060
31548017,2020-09-16,0010e8eb18f131e724d6997909af0808adbba057529edb...,562245001


------------------------------
Nombre de clients uniques dans train_set : 235277
Nombre d'articles uniques dans train_set : 29009
------------------------------


In [6]:
# OPTIMISED LOADING OF THE ARTICLES
# Article columns considered useful for the recommendation task.
articles_cols_to_keep = [
    'article_id',
    'product_type_name',
    'product_group_name',
    'colour_group_name',
    'index_group_name',
]
df_articles = pd.read_csv(
    r'C:\Users\hasal\Downloads\Advanced ML\h-and-m-personalized-fashion-recommendations\articles.csv',
    usecols=articles_cols_to_keep,
)

# Left join on article_id: every training transaction is kept and gets the
# article attributes attached.
train_enriched = pd.merge(train_set, df_articles, how='left', on='article_id')

print(train_enriched.head())

       t_dat                                        customer_id  article_id  \
0 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
1 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
2 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   926921001   
3 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   
4 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   

  product_type_name  product_group_name colour_group_name index_group_name  
0             Skirt  Garment Lower body             Black          Divided  
1             Skirt  Garment Lower body             Black          Divided  
2          Trousers  Garment Lower body         Dark Grey          Divided  
3            Jacket  Garment Upper body             Black          Divided  
4            Jacket  Garment Upper body             Black          Divided  


In [7]:
# OPTIMISED LOADING OF THE CUSTOMERS
# Customer columns considered useful for the recommendation task.
cust_cols_to_keep = ['customer_id', 'age']
df_customers = pd.read_csv('C:\\Users\\hasal\\Downloads\\Advanced ML\\h-and-m-personalized-fashion-recommendations\\customers.csv', usecols=cust_cols_to_keep)

# Left join on customer_id: every transaction row is kept and gets the
# customer's age attached.
# This cell reassigns `train_enriched` from itself, so running it a second
# time used to merge a frame that already had an 'age' column: pandas then
# produced 'age_x' / 'age_y' and the fillna below failed with a KeyError.
# Dropping any pre-existing 'age' column first makes the cell safe to re-run
# while leaving the first-run result unchanged.
train_enriched = (
    train_enriched
    .drop(columns=['age'], errors='ignore')
    .merge(df_customers, on='customer_id', how='left')
)

# NaN handling: missing ages are replaced by the mean age computed over the
# whole customers file (not only over the customers present in the train set).
mean_age = df_customers['age'].mean()
train_enriched['age'] = train_enriched['age'].fillna(mean_age)

print(train_enriched.head())
# Remaining missing values per column, as a sanity check after the merges.
print(train_enriched.isna().sum())


       t_dat                                        customer_id  article_id  \
0 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
1 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   784053005   
2 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   926921001   
3 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   
4 2020-08-19  00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...   868038001   

  product_type_name  product_group_name colour_group_name index_group_name  \
0             Skirt  Garment Lower body             Black          Divided   
1             Skirt  Garment Lower body             Black          Divided   
2          Trousers  Garment Lower body         Dark Grey          Divided   
3            Jacket  Garment Upper body             Black          Divided   
4            Jacket  Garment Upper body             Black          Divided   

    age  
0  34.0  
1  34.0  
2  34.0  
3  34.0  
4  34.

## Algorithme de recommandation 

In [10]:
# One-time installation of the dependency, left commented out:
#!pip install implicit 
import implicit # author's note: only works with Python 3.11 or lower

Collecting implicit
  Using cached implicit-0.7.2-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting tqdm>=4.27 (from implicit)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting threadpoolctl (from implicit)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached implicit-0.7.2-cp311-cp311-win_amd64.whl (750 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: tqdm, threadpoolctl, implicit

   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------

  from .autonotebook import tqdm as notebook_tqdm


### Préparation des données pour ALS

In [11]:
# MAP RAW IDS TO CONTIGUOUS INTEGER INDICES
# Distinct customers / articles of the training set, in order of first appearance.
user_unique = train_set['customer_id'].unique()
item_unique = train_set['article_id'].unique()

# Forward lookups (raw id -> position) and reverse lookups (position -> raw id).
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
item_to_idx = dict(zip(item_unique, range(len(item_unique))))
idx_to_user = dict(enumerate(user_unique))
idx_to_item = dict(enumerate(item_unique))

# Attach the index columns and a recency weight; `assign` returns a new frame,
# so the object previously bound to `train_set` is left untouched.
# Recent purchases get more weight: weight = 1 + 10 / (days_since + 1).
# NOTE(review): days_since is measured from `max_date`, the last date of the
# full file (validation week included), so every training row has
# days_since >= 7 -- confirm this offset is intended.
train_set = train_set.assign(
    user_idx=train_set['customer_id'].map(user_to_idx),
    item_idx=train_set['article_id'].map(item_to_idx),
    days_since=lambda frame: (max_date - frame['t_dat']).dt.days,
    weight=lambda frame: 1.0 + (10.0 / (frame['days_since'] + 1)),
)

In [12]:
# Build the weighted sparse user x article matrix (rows = users, columns = articles).
# Per the scipy documentation, repeated (row, column) pairs are summed when the
# CSR matrix is built, so a customer buying the same article several times
# accumulates the corresponding weights.
n_users, n_items = len(user_unique), len(item_unique)
weights = train_set['weight'].to_numpy()
row_idx = train_set['user_idx'].to_numpy()
col_idx = train_set['item_idx'].to_numpy()

interaction_matrix = sparse.csr_matrix(
    (weights, (row_idx, col_idx)),
    shape=(n_users, n_items),
)

# Sparsity: percentage of cells of the matrix that hold no stored value.
n_rows, n_cols = interaction_matrix.shape
print(f"Matrice Creuse: {100 * (1 - (interaction_matrix.nnz / (n_rows * n_cols))):.5f}%")

Matrice Creuse: 99.98623%


In [13]:
# Hyper-parameters of the ALS model, grouped so they can be read and tuned
# in one place.
als_params = {
    "factors": 200,
    "regularization": 0.01,
    "iterations": 5,
    "calculate_training_loss": True,
    "random_state": 42,  # fixed seed for reproducible runs
}
model = implicit.als.AlternatingLeastSquares(**als_params)

# Train on the weighted user x article matrix built earlier in the notebook.
model.fit(interaction_matrix)

  check_blas_config()
100%|██████████| 5/5 [03:28<00:00, 41.60s/it, loss=0.000187]
