In [6]:
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

In [7]:

def load_data(data_path='/kaggle/input/h-and-m-personalized-fashion-recommendations'):
    """Loads transactions, customers, and articles data."""
    print("Loading data...")
    try:
        # Ensure article_id is read as string to preserve leading zeros
        transactions = pd.read_csv(f'{data_path}/transactions_train.csv', dtype={'article_id': str})
        customers = pd.read_csv(f'{data_path}/customers.csv')
        articles = pd.read_csv(f'{data_path}/articles.csv', dtype={'article_id': str})
    except FileNotFoundError:
        print("CSV files not found, trying to load Parquet files...")
        transactions = pd.read_parquet(f'{data_path}/transactions.pqt')
        customers = pd.read_parquet(f'{data_path}/customers.pqt')
        articles = pd.read_parquet(f'{data_path}/articles.pqt')

    transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])
    return transactions, customers, articles


In [10]:

def generate_candidates(transactions):
    """Generates candidate articles for each user based on the notebook's logic."""
    
    print("Splitting data into time windows...")
    # Time windows from the notebook
    train1 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,8)) & (transactions['t_dat'] < datetime.datetime(2020,9,16))]
    train2 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,1)) & (transactions['t_dat'] < datetime.datetime(2020,9,8))]
    train3 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,8,23)) & (transactions['t_dat'] < datetime.datetime(2020,9,1))]
    train4 = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,8,15)) & (transactions['t_dat'] < datetime.datetime(2020,8,23))]
    
    val = transactions.loc[transactions["t_dat"] >= datetime.datetime(2020,9,16)]
    val_users = val['customer_id'].unique()

    print("Grouping purchases by user for different time windows...")
    positive_items_per_user1 = train1.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user2 = train2.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user3 = train3.groupby(['customer_id'])['article_id'].apply(list)
    positive_items_per_user4 = train4.groupby(['customer_id'])['article_id'].apply(list)

    # --- Next item prediction logic ---
    print("Calculating next-item predictions...")
    train_all = pd.concat([train1, train2, train3, train4], axis=0)
    train_all = train_all.sort_values(['customer_id', 't_dat'])
    user_group = train_all.groupby(['customer_id'])['article_id'].apply(list)
    
    next_items = {}
    for user in tqdm(user_group.keys(), desc="Processing next items"):
        items = user_group[user]
        for i,item in enumerate(items[:-1]):
            if item not in next_items:
                next_items[item] = []
            if item != items[i+1]:
                next_items[item].append(items[i+1])

    pred_next = {}
    for item in next_items:
        if len(next_items[item]) >= 5:
            most_common = Counter(next_items[item]).most_common()
            ratio = most_common[0][1]/len(next_items[item])
            if ratio >= 0.1:
                pred_next[item] = most_common[0][0]

    # --- Popular items logic ---
    print("Calculating popular items...")
    train_pop = transactions.loc[(transactions["t_dat"] >= datetime.datetime(2020,9,1)) & (transactions['t_dat'] < datetime.datetime(2020,9,16))]
    train_pop['pop_factor'] = train_pop['t_dat'].apply(lambda x: 1/(datetime.datetime(2020,9,16) - x).days)
    popular_items_group = train_pop.groupby(['article_id'])['pop_factor'].sum()
    _, popular_items = zip(*sorted(zip(popular_items_group, popular_items_group.keys()))[::-1])

    # --- Generate candidate list for each user ---
    print("Generating final candidate lists for validation users...")
    candidate_data = []
    for user in tqdm(val_users, desc="Generating candidates"):
        user_output = []
        if user in positive_items_per_user1:
            user_output += list(Counter(positive_items_per_user1[user]).keys())[:12]
        if user in positive_items_per_user2:
            user_output += list(Counter(positive_items_per_user2[user]).keys())[:12]
        if user in positive_items_per_user3:
            user_output += list(Counter(positive_items_per_user3[user]).keys())[:12]
        if user in positive_items_per_user4:
            user_output += list(Counter(positive_items_per_user4[user]).keys())[:12]
        
        user_output = list(dict.fromkeys(user_output))

        next_preds = [pred_next[item] for item in user_output if item in pred_next and pred_next[item] not in user_output]
        user_output += next_preds
        
        user_output += list(popular_items[:24 - len(user_output)])
        
        for article in list(dict.fromkeys(user_output))[:24]:
            candidate_data.append({'customer_id': user, 'article_id': article})
            
    candidates_df = pd.DataFrame(candidate_data)
    return candidates_df, val




In [11]:
def create_supervised_dataset(candidates_df, validation_df):
    """Merges candidates with actual purchases to create a labeled dataset."""
    print("Creating supervised dataset with labels...")
    
    actual_purchases = validation_df[['customer_id', 'article_id']].copy()
    actual_purchases['purchased'] = 1
    
    data = pd.merge(
        candidates_df,
        actual_purchases,
        on=['customer_id', 'article_id'],
        how='left'
    )
    
    data['purchased'].fillna(0, inplace=True)
    data['purchased'] = data['purchased'].astype(int)
    
    return data


In [4]:

def add_features(df, customers, articles, transactions):
    """Adds customer, article, and popularity features."""
    print("Adding features...")
    
    df = pd.merge(df, customers, on='customer_id', how='left')
    df = pd.merge(df, articles, on='article_id', how='left')
    
    transactions['week'] = (transactions['t_dat'] - transactions['t_dat'].min()).dt.days // 7
    
    sales = transactions.groupby('week')['article_id'].value_counts() \
        .groupby('week').rank(method='dense', ascending=False) \
        .groupby('week').head(12).rename('bestseller_rank').astype('int8')
    sales = sales.reset_index()
    
    val_week = transactions['week'].max() + 1
    sales.week += 1
    
    bestsellers_for_val = sales[sales.week == val_week]
    
    df = pd.merge(df, bestsellers_for_val[['article_id', 'bestseller_rank']], on='article_id', how='left')
    df['bestseller_rank'].fillna(999, inplace=True)

    # Clean up dtypes and fill NaNs
    for col in df.columns:
        if df[col].dtype == 'object' and col not in ['customer_id', 'article_id']:
             df[col].fillna('Unknown', inplace=True)
        elif pd.api.types.is_numeric_dtype(df[col]):
            df[col].fillna(-1, inplace=True)

    return df


In [8]:


print("Starting script to create supervised learning dataset...")

# Load all necessary data
transactions, customers, articles = load_data()


Starting script to create supervised learning dataset...
Loading data...


In [12]:

# Generate candidates based on notebook logic
candidates, validation_set = generate_candidates(transactions)

# Create the base supervised dataset (customer, article, purchased_label)
supervised_df = create_supervised_dataset(candidates, validation_set)

# Add features to the supervised dataset
final_df = add_features(supervised_df, customers, articles, transactions)

# Save the result
output_path = 'supervised_dataset.parquet'
print(f"Saving final dataset to {output_path}...")
final_df.to_parquet(output_path, index=False)

print("Script finished successfully!")
print("\nFinal DataFrame head:")
print(final_df.head())
print("\nColumns:", final_df.columns.tolist())
print("\nShape:", final_df.shape)

Splitting data into time windows...
Grouping purchases by user for different time windows...
Calculating next-item predictions...


Processing next items: 100%|██████████| 254618/254618 [00:01<00:00, 131138.13it/s]


Calculating popular items...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Generating final candidate lists for validation users...


Generating candidates: 100%|██████████| 68984/68984 [00:04<00:00, 15173.44it/s]


Creating supervised dataset with labels...
Adding features...
Saving final dataset to supervised_dataset.parquet...
Script finished successfully!

Final DataFrame head:
                                         customer_id  article_id  purchased  \
0  000fb6e772c5d0023892065e659963da90b1866035558e...  0884319006          0   
1  000fb6e772c5d0023892065e659963da90b1866035558e...  0762846026          0   
2  000fb6e772c5d0023892065e659963da90b1866035558e...  0919273004          1   
3  000fb6e772c5d0023892065e659963da90b1866035558e...  0927922002          0   
4  000fb6e772c5d0023892065e659963da90b1866035558e...  0611415034          0   

    FN  Active club_member_status fashion_news_frequency   age  \
0  1.0     1.0             ACTIVE              Regularly  42.0   
1  1.0     1.0             ACTIVE              Regularly  42.0   
2  1.0     1.0             ACTIVE              Regularly  42.0   
3  1.0     1.0             ACTIVE              Regularly  42.0   
4  1.0     1.0            

In [23]:
final_df['purchased'].value_counts()

0    1638315
1      15173
Name: purchased, dtype: int64

In [28]:
%%time
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm.sklearn import LGBMRanker
from sklearn.metrics import ndcg_score

X = final_df.drop(columns=['purchased'])
y = final_df['purchased']

for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_group = X_train.groupby('customer_id').size().tolist()
test_group = X_test.groupby('customer_id').size().tolist()

ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=100
)

ranker.fit(
    X_train, y_train,
    group=train_group,
    eval_set=[(X_test, y_test)],
    eval_group=[test_group],
    eval_at=[5, 10]
)

y_pred = ranker.predict(X_test)
X_test['score'] = y_pred
X_test['customer_id'] = final_df.loc[X_test.index, 'customer_id']
ranked = X_test.sort_values(by=['customer_id', 'score'], ascending=[True, False])

ndcg = ndcg_score([y_test.values], [y_pred])
print("NDCG Score:", ndcg)
print(ranked.head(20))

[1]	valid_0's ndcg@5: 0.986183	valid_0's ndcg@10: 0.987114
[2]	valid_0's ndcg@5: 0.986574	valid_0's ndcg@10: 0.987483
[3]	valid_0's ndcg@5: 0.986751	valid_0's ndcg@10: 0.987625
[4]	valid_0's ndcg@5: 0.986709	valid_0's ndcg@10: 0.987576
[5]	valid_0's ndcg@5: 0.986791	valid_0's ndcg@10: 0.987665
[6]	valid_0's ndcg@5: 0.986778	valid_0's ndcg@10: 0.98762
[7]	valid_0's ndcg@5: 0.986841	valid_0's ndcg@10: 0.987709
[8]	valid_0's ndcg@5: 0.986908	valid_0's ndcg@10: 0.987778
[9]	valid_0's ndcg@5: 0.986893	valid_0's ndcg@10: 0.987754
[10]	valid_0's ndcg@5: 0.986932	valid_0's ndcg@10: 0.98777
[11]	valid_0's ndcg@5: 0.986882	valid_0's ndcg@10: 0.987721
[12]	valid_0's ndcg@5: 0.98688	valid_0's ndcg@10: 0.987718
[13]	valid_0's ndcg@5: 0.986903	valid_0's ndcg@10: 0.987736
[14]	valid_0's ndcg@5: 0.986892	valid_0's ndcg@10: 0.987725
[15]	valid_0's ndcg@5: 0.986922	valid_0's ndcg@10: 0.987759
[16]	valid_0's ndcg@5: 0.986946	valid_0's ndcg@10: 0.987779
[17]	valid_0's ndcg@5: 0.986952	valid_0's ndcg@10: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


NDCG Score: 0.6623802830015895
                                               customer_id  article_id   FN  \
1056306  00039306476aaf41a07fed942884f16b30abfa83a2a8be...       18882 -1.0   
1056293  00039306476aaf41a07fed942884f16b30abfa83a2a8be...        4976 -1.0   
1449935  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...       17991  1.0   
1449936  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...       17268  1.0   
1449937  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...       17435  1.0   
1449945  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...       18609  1.0   
1449943  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...         437  1.0   
1449947  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...        3414  1.0   
1449948  0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...        4976  1.0   
1251359  000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...        4263 -1.0   
1251361  000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...       18625 -1.0   
1251368  000493dd9fc4

In [29]:
%%time
from sklearn.metrics import average_precision_score
import numpy as np

X_test = X_test.copy()
X_test.loc[:, 'score'] = y_pred
X_test.loc[:, 'true_label'] = y_test.values
X_test.loc[:, 'customer_id'] = final_df.loc[X_test.index, 'customer_id'].values

grouped = X_test.groupby('customer_id')
average_precisions = []
K = 12

for _, group in grouped:
    top_k = group.sort_values('score', ascending=False).head(K)
    y_true = top_k['true_label'].values
    y_scores = top_k['score'].values
    if y_true.sum() == 0:
        average_precisions.append(0)
    else:
        ap = average_precision_score(y_true, y_scores)
        average_precisions.append(ap)

map_score = np.mean(average_precisions)
print("MAP@12 Score:", map_score)

MAP@12 Score: 0.02537382515179059
CPU times: user 33.8 s, sys: 180 ms, total: 34 s
Wall time: 33.8 s


In [30]:
%%time
from sklearn.metrics import average_precision_score
import numpy as np

X_test = X_test.copy()
X_test.loc[:, 'true_label'] = y_test.values
X_test.loc[:, 'customer_id'] = final_df.loc[X_test.index, 'customer_id'].values

grouped = X_test.groupby('customer_id')
average_precisions = []
K = 12

for _, group in grouped:
    top_k = group.head(K)
    y_true = top_k['true_label'].values
    y_scores = np.arange(len(y_true), 0, -1)  # 임의 점수: 뒤로 갈수록 낮게
    if y_true.sum() == 0:
        average_precisions.append(0)
    else:
        ap = average_precision_score(y_true, y_scores)
        average_precisions.append(ap)

map_score = np.mean(average_precisions)
print("MAP@12 Score (no model):", map_score)

MAP@12 Score (no model): 0.019156676870812826
CPU times: user 9.46 s, sys: 38.9 ms, total: 9.5 s
Wall time: 9.5 s
