In [23]:
import pandas as pd
import numpy as np
import cudf
import os

<h1> 트랜잭션 데이터 로드 함수

In [24]:
def load_and_preprocess_transactions(path='./transactions_train.csv', parquet_path='transactions.pqt'):
    """Load the transaction log as a cuDF DataFrame, using a Parquet cache when available.

    If ``parquet_path`` exists it is read and returned as-is. Otherwise the raw
    CSV at ``path`` is parsed, reduced to ``t_dat``, ``customer_id`` and
    ``article_id``, converted to compact dtypes, written to ``parquet_path``
    and returned.

    Args:
        path: Location of the raw transactions CSV.
        parquet_path: Location of the Parquet cache to read from / write to.

    Returns:
        cudf.DataFrame with columns ``t_dat`` (datetime), ``customer_id``
        (int64, parsed from the last 16 hex characters of the original id)
        and ``article_id`` (int32). When the cache is hit, whatever the
        cached file contains is returned unchanged.
    """
    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    transaction_columns = [
        't_dat',
        'customer_id',
        'article_id'
    ]

    print(f"Processing raw CSV file: {path}")
    # Parse only the columns that are kept instead of materialising the whole
    # file in host memory. `usecols` does not control column order, so the
    # explicit re-selection keeps the original ordering.
    df = pd.read_csv(path, usecols=transaction_columns)[transaction_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    # Shrink the long hexadecimal customer id to a 64-bit integer key.
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['article_id'] = gdf['article_id'].astype('int32')
    gdf['t_dat'] = cudf.to_datetime(gdf['t_dat'])

    gdf.to_parquet(parquet_path, index=False)
    print(f"Saved processed data to: {parquet_path}")

    return gdf

<h1> 아이템 데이터 로드 함수

In [3]:
def load_and_preprocess_articles(path='./articles.csv', parquet_path='articles.pqt'):
    """Load article metadata as a cuDF DataFrame, using a Parquet cache when available.

    If ``parquet_path`` exists it is read and returned as-is. Otherwise the CSV
    at ``path`` is parsed, reduced to the id plus nine descriptive columns,
    typed, written to ``parquet_path`` and returned.

    Args:
        path: Location of the raw articles CSV.
        parquet_path: Location of the Parquet cache to read from / write to.

    Returns:
        cudf.DataFrame whose ``article_id`` column is int32 and whose
        descriptive columns are strings. When the cache is hit, whatever the
        cached file contains is returned unchanged.

    Note:
        A cache file written before this fix may still hold ``article_id`` as
        a string; delete it to regenerate with the integer key.
    """
    id_column = 'article_id'
    text_columns = [
        'product_type_name',
        'product_group_name',
        'graphical_appearance_name',
        'colour_group_name',
        'perceived_colour_value_name',
        'perceived_colour_master_name',
        'department_name',
        'index_name',
        'garment_group_name'
    ]

    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    selected_columns = [id_column] + text_columns
    # Parse only the needed columns; re-select to pin the column order.
    df = pd.read_csv(path, usecols=selected_columns)[selected_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    gdf[id_column] = gdf[id_column].astype('int32')
    # Only the descriptive columns become strings. The previous loop also
    # iterated over 'article_id', which silently reverted the int32 cast above.
    for col in text_columns:
        gdf[col] = gdf[col].astype('str')

    gdf.to_parquet(parquet_path, index=False)
    return gdf

<h1> 유저정보 데이터 로드 함수

In [4]:
def load_and_preprocess_customers(path='./customers.csv', parquet_path='customers.pqt'):
    """Load customer attributes as a cuDF DataFrame, using a Parquet cache when available.

    If ``parquet_path`` exists it is read and returned as-is. Otherwise the CSV
    at ``path`` is parsed, reduced to five columns, cleaned, written to
    ``parquet_path`` and returned.

    Args:
        path: Location of the raw customers CSV.
        parquet_path: Location of the Parquet cache to read from / write to.

    Returns:
        cudf.DataFrame with ``customer_id`` (int64 parsed from the last 16 hex
        characters), ``club_member_status`` and ``fashion_news_frequency``
        (strings), ``age`` (limited to the range 13..100) and ``Active`` (bool,
        missing treated as False).
    """
    # Fast path: reuse the previously processed file.
    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    kept_columns = [
        'customer_id',
        'club_member_status',
        'fashion_news_frequency',
        'age',
        'Active'
    ]
    host_frame = pd.read_csv(path)[kept_columns]
    customers_gdf = cudf.DataFrame.from_pandas(host_frame)

    # Shrink the long hexadecimal customer id to a 64-bit integer key.
    hex_tail = customers_gdf['customer_id'].str[-16:]
    customers_gdf['customer_id'] = hex_tail.str.hex_to_int().astype('int64')

    for text_col in ('club_member_status', 'fashion_news_frequency'):
        customers_gdf[text_col] = customers_gdf[text_col].astype('str')

    # Values failing `>= 13` become 13, then values failing `<= 100` become 100.
    # NOTE(review): how missing ages are treated depends on cuDF's `where`
    # semantics for null conditions — confirm this is the intended imputation.
    age = customers_gdf['age']
    age = age.where(age >= 13, 13)
    age = age.where(age <= 100, 100)
    customers_gdf['age'] = age

    customers_gdf['Active'] = customers_gdf['Active'].fillna(0).astype(bool)

    customers_gdf.to_parquet(parquet_path, index=False)
    return customers_gdf

In [25]:
articles=load_and_preprocess_articles()

Loading cached Parquet file: articles.pqt


In [26]:
customers=load_and_preprocess_customers()

Loading cached Parquet file: customers.pqt


In [27]:
transactions=load_and_preprocess_transactions()

Loading cached Parquet file: transactions.pqt


<h1> 최근 1주일 구매 데이터 필터링 함수

In [28]:
def filter_last_week_purchases(train, max_days=7):
    """Keep each customer's purchases made close to their own latest purchase.

    For every customer the most recent ``t_dat`` is determined, and only rows
    whose distance to that date is at most ``max_days`` whole days are kept.
    The boundary is inclusive, so the default keeps distances 0 through 7.

    Args:
        train: DataFrame with at least ``customer_id`` and ``t_dat`` (datetime).
            It is not modified.
        max_days: Largest allowed gap, in days, between a purchase and the
            customer's latest purchase. Defaults to 7 (the previous
            hard-coded value).

    Returns:
        A new DataFrame with the surviving rows and two extra columns:
        ``max_dat`` (the customer's latest purchase date) and ``diff_dat``
        (days between the row and ``max_dat``).
    """
    last_purchase = (
        train.groupby('customer_id')['t_dat']
        .max()
        .reset_index()
        .rename(columns={'t_dat': 'max_dat'})
    )
    # The merge yields a fresh frame, so adding a column below leaves the
    # caller's DataFrame untouched.
    recent = train.merge(last_purchase, on='customer_id', how='left')
    recent['diff_dat'] = (recent['max_dat'] - recent['t_dat']).dt.days
    return recent[recent['diff_dat'] <= max_days]

<h1> 자주 구매한 아이템 추천 함수

In [29]:

def recommend_frequent_items(train):
    """Return one row per (customer, article), ordered by purchase count.

    Each (customer_id, article_id) pair is given ``ct``, the number of rows in
    which it appears. Rows are then sorted by ``ct`` and ``t_dat`` (both
    descending) across the whole frame, and duplicate pairs are dropped so
    that only the first occurrence in that ordering remains.

    Args:
        train: DataFrame with ``customer_id``, ``article_id`` and ``t_dat``.

    Returns:
        DataFrame with columns ``customer_id`` and ``article_id`` in the
        sorted order described above.
    """
    pair_keys = ['customer_id', 'article_id']

    purchase_counts = (
        train.groupby(pair_keys)['t_dat']
        .agg('count')
        .reset_index()
        .rename(columns={'t_dat': 'ct'})
    )

    ranked = (
        train.merge(purchase_counts, on=pair_keys, how='left')
        .sort_values(['ct', 't_dat'], ascending=False)
        .drop_duplicates(pair_keys)
    )
    return ranked[pair_keys]

<h1> 함께 구매된 아이템 추천 함수

In [10]:
def recommend_paired_items(train, pair_dict_path):
    """Extend each customer's candidates with items paired to what they bought.

    Args:
        train: cuDF DataFrame with ``customer_id`` and ``article_id``. It is
            left unmodified (the earlier version added an ``article_id2``
            column to the caller's frame as a side effect).
        pair_dict_path: Path to a ``.npy`` file holding a pickled dict that
            maps an article id to a list of paired article ids.

    Returns:
        cuDF DataFrame with columns ``customer_id`` and ``article_id``: the
        original pairs from ``train`` followed by the paired-item candidates,
        the latter de-duplicated per customer. Duplicates between the two
        parts are not removed here.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects and can execute
    # code while loading. Only point this at files produced by this project.
    pairs = np.load(pair_dict_path, allow_pickle=True).item()

    own_items = train[['customer_id', 'article_id']]

    # Work on a private copy so the mapping column never leaks into `train`.
    paired = own_items.copy()
    paired['article_id2'] = paired['article_id'].map(pairs)
    paired = paired[['customer_id', 'article_id2']].dropna()
    paired = paired[paired['article_id2'].list.len() > 0]
    # One row per (customer, paired article).
    paired = paired.explode('article_id2')
    paired = paired.drop_duplicates(['customer_id', 'article_id2'])
    paired = paired.rename(columns={'article_id2': 'article_id'})

    return cudf.concat([own_items, paired], ignore_index=True)

<h1> 고객별 추천 문자열 생성 함수

In [11]:
def generate_prediction_strings(train_df):
    """Build one space-separated prediction string per customer.

    Every article id is rendered as ``' 0' + str(article_id)`` and the pieces
    are concatenated per customer in row order, so each prediction starts with
    a leading space.

    Args:
        train_df: cuDF DataFrame with ``customer_id`` and ``article_id``.
            It is left unmodified; the earlier version overwrote its
            ``article_id`` column with the prefixed strings, so calling the
            function twice on the same frame prefixed the ids twice.

    Returns:
        cudf.DataFrame with columns ``customer_id`` and ``prediction``.
    """
    formatted = train_df[['customer_id', 'article_id']].copy()
    formatted['article_id'] = ' 0' + formatted['article_id'].astype(str)
    # Summing strings within a group concatenates them.
    preds = formatted.groupby('customer_id')['article_id'].sum().reset_index()
    preds.columns = ['customer_id', 'prediction']
    return cudf.DataFrame(preds)

<h1> 인기 아이템 추출 함수

In [12]:
def get_top12_items(train_path, cutoff_date='2020-09-16'):
    """Return the 12 most purchased articles on or after ``cutoff_date`` as one string.

    Args:
        train_path: Path to a Parquet file with ``t_dat`` and ``article_id``.
        cutoff_date: Earliest purchase date (inclusive) to count.

    Returns:
        A string in which each article id is preceded by ``' 0'``, ordered
        from most to least frequent. The leading ``' 0'`` is always present,
        even when no rows pass the cutoff.
    """
    purchases = cudf.read_parquet(train_path)
    purchases.t_dat = cudf.to_datetime(purchases.t_dat)

    is_recent = purchases.t_dat >= cudf.to_datetime(cutoff_date)
    recent = purchases[is_recent]

    # value_counts() orders by frequency, so the index lists the most popular first.
    ranked_ids = recent.article_id.value_counts().to_pandas().index.astype(str)
    return ' 0' + ' 0'.join(ranked_ids[:12])

<h1> 아이템 페어 사전 생성 함수

In [13]:
def generate_item_pairs_cudf(df, min_item_rank=1000, max_item_rank=5000, top_n=3, save_path='pairs_cudf.npy'):
    """Build (or load) a dict mapping an article to its most co-purchased articles.

    Articles are ranked by how often they occur in ``df``; those at rank
    positions ``min_item_rank`` (inclusive) to ``max_item_rank`` (exclusive)
    become keys. For each key, the customers who have it are collected and
    the other articles in those customers' rows are counted; the ``top_n``
    most frequent ones are stored (fewer if fewer exist).

    Args:
        df: cuDF DataFrame with ``customer_id`` and ``article_id``.
        min_item_rank: First popularity rank to include.
        max_item_rank: Popularity rank at which to stop (exclusive).
        top_n: Maximum number of paired articles per key.
        save_path: ``.npy`` file used as a cache.

    Returns:
        dict mapping article id to a list of paired article ids.

    Note:
        If ``save_path`` already exists it is returned unconditionally, so
        ``df`` and the rank/top_n arguments are ignored in that case.
        The cache is read with ``allow_pickle=True``; only load trusted files.
    """
    if os.path.exists(save_path):
        print(f"Loading cached pairs from: {save_path}")
        return np.load(save_path, allow_pickle=True).item()

    popularity = df['article_id'].value_counts()
    anchor_items = popularity.index[min_item_rank:max_item_rank].to_pandas().values

    pairs = {}
    for anchor in anchor_items:
        buyers = df[df['article_id'] == anchor]['customer_id'].unique()
        from_buyers = df['customer_id'].isin(buyers)
        other_article = df['article_id'] != anchor
        co_purchased = df[from_buyers & other_article]['article_id']

        # Slicing past the end is safe, so one slice covers both the
        # "enough candidates" and "too few candidates" cases.
        ranked_partners = co_purchased.value_counts().index
        pairs[anchor] = ranked_partners[:top_n].to_pandas().tolist()

    np.save(save_path, pairs)
    return pairs


<h1> 데이터 라벨링 함수

In [14]:

def create_full_labels(transactions_df: cudf.DataFrame, candidate_df: cudf.DataFrame, label_week_number: int) -> cudf.DataFrame:
    """Attach a binary ``match`` label to every candidate (customer, article) pair.

    A candidate is labelled 1 when the same pair appears in ``transactions_df``
    with ``week_number == label_week_number``; otherwise it is labelled 0.

    Args:
        transactions_df: Transactions with ``customer_id``, ``article_id`` and
            ``week_number`` columns.
        candidate_df: Candidate pairs with ``customer_id`` and ``article_id``.
        label_week_number: The ``week_number`` value whose purchases count as
            positives.

    Returns:
        ``candidate_df`` left-joined with the positives, plus an integer
        ``match`` column.
    """
    join_keys = ["customer_id", "article_id"]

    in_label_week = transactions_df['week_number'] == label_week_number
    purchased_pairs = transactions_df[in_label_week][join_keys].drop_duplicates()
    purchased_pairs["match"] = 1

    # Candidates without a purchase in the label week come back as null -> 0.
    labeled = candidate_df.merge(purchased_pairs, how="left", on=join_keys)
    labeled["match"] = labeled["match"].fillna(0).astype(int)
    return labeled


<h1> 최근 1주일 구매 데이터 필터링

In [30]:
transactions= filter_last_week_purchases(transactions)

In [31]:
# Customers must have MORE than this many rows in `transactions` to be kept.
# (The earlier comment said "30 or more", which did not match the `> 100` test.)
MIN_TRANSACTIONS_PER_CUSTOMER = 100

# 1. Count how many transaction rows each customer has
counts = transactions['customer_id'].value_counts()

# 2. Keep only customers with more than MIN_TRANSACTIONS_PER_CUSTOMER rows
valid_ids = counts[counts > MIN_TRANSACTIONS_PER_CUSTOMER].index

# 3. Extract just those customers' rows from the original data
filtered = transactions[transactions['customer_id'].isin(valid_ids)]


In [32]:
filtered=filtered[['customer_id', 'article_id']]

In [33]:
filtered['customer_id'].unique()

0     2399959835489063121
1     1177544493873523124
2      657699922079694914
3      875937969512579611
4     8936835422870337874
5    -1106694820500549775
6    -7250849952617471376
7    -4132058122892733136
8     5367623498777542563
9    -7480916235333800764
10   -5253228977262187067
11   -4601407992705575197
12    -394771176629714848
13    4801599675390043558
14    2174037403290139027
15    8821185501661401907
16    3979856084354100577
17   -6418561962546478385
18    3078333695715817805
19   -8729471104332666510
20    4741411091101275745
21   -1541792417288283097
22    6045499144411732060
23    1363801644058974650
24   -6637316028259518039
25   -6313430495078977632
26   -1742215151385577582
27   -1347257129369757303
28    2861268073324966212
29    4615355821391819566
30   -5787374227437117595
31   -6762542213259176601
Name: customer_id, dtype: int64

In [34]:
merged = cudf.merge(filtered, articles, on='article_id', how='left')
print(merged)


              customer_id  article_id product_type_name  product_group_name  \
0    -6762542213259176601   902507001            Hoodie  Garment Upper body   
1    -6762542213259176601   902507001            Hoodie  Garment Upper body   
2    -6762542213259176601   902507001            Hoodie  Garment Upper body   
3    -6762542213259176601   902507001            Hoodie  Garment Upper body   
4    -6762542213259176601   896758002            Hoodie  Garment Upper body   
...                   ...         ...               ...                 ...   
4071 -1742215151385577582   798412002  Underwear bottom           Underwear   
4072 -1742215151385577582   868751002  Underwear bottom           Underwear   
4073 -1742215151385577582   803986006  Underwear bottom           Underwear   
4074 -1742215151385577582   803986006  Underwear bottom           Underwear   
4075 -1742215151385577582   803986006  Underwear bottom           Underwear   

     graphical_appearance_name colour_group_name pe

In [35]:
merged = cudf.merge(merged, customers, on='customer_id', how='left')
print(merged)


              customer_id  article_id product_type_name  product_group_name  \
0    -5787374227437117595   823365002         Dungarees   Garment Full body   
1    -5787374227437117595   831777002          Vest top  Garment Upper body   
2    -5787374227437117595   837741002            Shorts  Garment Lower body   
3    -5787374227437117595   852851001          Trousers  Garment Lower body   
4    -5787374227437117595   861023001             Shirt  Garment Upper body   
...                   ...         ...               ...                 ...   
4071 -5253228977262187067   765216001        Sunglasses         Accessories   
4072 -5253228977262187067   765216001        Sunglasses         Accessories   
4073 -5253228977262187067   765216001        Sunglasses         Accessories   
4074 -5253228977262187067   765216001        Sunglasses         Accessories   
4075 -5253228977262187067   765216001        Sunglasses         Accessories   

     graphical_appearance_name colour_group_name pe

In [36]:
# Every observed purchase row is a positive example.
merged['label'] = 1

NEGATIVES_PER_PURCHASE = 3              # sample up to 3x as many negatives as purchase rows
rng = np.random.default_rng(42)         # fixed seed so the sampled negatives are reproducible

# Use dedicated names for the id arrays: reusing `customers` / `articles` here
# would replace the DataFrames loaded earlier in the notebook.
article_ids = merged['article_id'].unique().to_pandas().values

# Copy the id pairs to the host once instead of filtering the GPU frame per customer.
purchases = merged[['customer_id', 'article_id']].to_pandas()

neg_samples = []
for c, bought_series in purchases.groupby('customer_id', sort=False)['article_id']:
    bought = bought_series.to_numpy()
    not_bought = np.setdiff1d(article_ids, bought)
    if len(not_bought) > 0:
        # `bought` keeps repeated purchases, so the sample size follows the
        # number of purchase rows, capped by the number of unseen articles.
        sampled = rng.choice(
            not_bought,
            size=min(len(bought) * NEGATIVES_PER_PURCHASE, len(not_bought)),
            replace=False
        )
        neg_samples.extend([c, a, 0] for a in sampled)

# Negative rows carry only ids and the label; feature columns are null after concat.
neg_df = cudf.DataFrame(neg_samples, columns=['customer_id','article_id','label'])
dataset = cudf.concat([merged, neg_df], ignore_index=True)

print(dataset)



               customer_id  article_id product_type_name  product_group_name  \
0     -5787374227437117595   823365002         Dungarees   Garment Full body   
1     -5787374227437117595   831777002          Vest top  Garment Upper body   
2     -5787374227437117595   837741002            Shorts  Garment Lower body   
3     -5787374227437117595   852851001          Trousers  Garment Lower body   
4     -5787374227437117595   861023001             Shirt  Garment Upper body   
...                    ...         ...               ...                 ...   
16299  2174037403290139027   560270001              <NA>                <NA>   
16300  2174037403290139027   853468003              <NA>                <NA>   
16301  2174037403290139027   564930001              <NA>                <NA>   
16302  2174037403290139027   714191001              <NA>                <NA>   
16303  2174037403290139027   799356002              <NA>                <NA>   

      graphical_appearance_name colour_

In [21]:
# NOTE(review): this cell repeats the negative sampling of the previous cell
# with a 1:1 ratio and overwrites `dataset`; its execution count is out of
# sequence, so confirm which of the two cells is meant to be kept.
merged['label'] = 1

rng = np.random.default_rng(42)         # fixed seed so the sampled negatives are reproducible

# Use dedicated names for the id arrays: reusing `customers` / `articles` here
# would replace the DataFrames loaded earlier in the notebook.
article_ids = merged['article_id'].unique().to_pandas().values

# Copy the id pairs to the host once instead of filtering the GPU frame per customer.
purchases = merged[['customer_id', 'article_id']].to_pandas()

neg_samples = []
for c, bought_series in purchases.groupby('customer_id', sort=False)['article_id']:
    bought = bought_series.to_numpy()
    not_bought = np.setdiff1d(article_ids, bought)
    if len(not_bought) > 0:
        # One negative per purchase row, capped by the number of unseen articles.
        sampled = rng.choice(not_bought, size=min(len(bought), len(not_bought)), replace=False)
        neg_samples.extend([c, a, 0] for a in sampled)

# Negative rows carry only ids and the label; feature columns are null after concat.
neg_df = cudf.DataFrame(neg_samples, columns=['customer_id','article_id','label'])
dataset = cudf.concat([merged, neg_df], ignore_index=True)
print(dataset)


                customer_id  article_id product_type_name  product_group_name  \
0       -155063171615458432   629420007           T-shirt  Garment Upper body   
1       -155063171615458432   629420007           T-shirt  Garment Upper body   
2       -155063171615458432   629420007           T-shirt  Garment Upper body   
3       -155063171615458432   697097005          Vest top  Garment Upper body   
4       -155063171615458432   656763007            Shorts  Garment Lower body   
...                     ...         ...               ...                 ...   
138523 -1717040823211133881   716958001              <NA>                <NA>   
138524 -1717040823211133881   660590001              <NA>                <NA>   
138525 -1717040823211133881   863620007              <NA>                <NA>   
138526 -1717040823211133881   791487004              <NA>                <NA>   
138527 -1717040823211133881   637073001              <NA>                <NA>   

       graphical_appearance

<h1> 사용자가 자주 구매한 아이템 뽑기

In [None]:
import cudf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb

# Frequency features: number of rows per customer and per article in `dataset`.
# NOTE(review): `dataset` already contains the sampled negatives, so these
# counts are computed over labelled rows and may leak label information —
# confirm this is intended before trusting the metrics below.
dataset['customer_count'] = dataset.groupby('customer_id')['article_id'].transform('count')
dataset['item_count'] = dataset.groupby('article_id')['customer_id'].transform('count')

X = dataset[['customer_count','item_count']].to_pandas()
y = dataset['label'].to_pandas()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
# A validation Dataset must be built with `reference=` so that it reuses the
# training set's feature binning.
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'is_unbalance': True
}
lgb_model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=50)

# lgb.train returns probabilities for the binary objective; threshold at 0.5.
y_pred = lgb_model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)

# The earlier version looped over a {"LightGBM": None} dict with an `else`
# branch that would have called `.fit` on None; only LightGBM is evaluated.
results = {
    "LightGBM": {
        "Accuracy": accuracy_score(y_test, y_pred_binary),
        "Precision": precision_score(y_test, y_pred_binary),
        "Recall": recall_score(y_test, y_pred_binary),
        "F1-score": f1_score(y_test, y_pred_binary),
    }
}

for model_name, metrics in results.items():
    print(model_name, metrics)

LightGBM {'Accuracy': 0.7034651947255444, 'Precision': 0.42857142857142855, 'Recall': 0.6037267080745342, 'F1-score': 0.5012893243940175}


In [17]:
transactions_combined = recommend_frequent_items(transactions)

In [18]:
transactions_combined

Unnamed: 0,customer_id,article_id
34768,-7250849952617471376,570002001
193664,-906958334866810496,852521001
148642,-4549251887148392985,727880001
580816,3949177095849657300,715255013
41616,-7899046446501212654,728162001
...,...,...
27783,-7839243934531519584,594177003
27784,-7839243934531519584,624327002
27785,-7839243934531519584,684307001
27790,3407394559576156506,624537001


<h1> 아이템 페어 사전 생성

In [19]:
pairs = generate_item_pairs_cudf(transactions_combined, min_item_rank=1000, max_item_rank=3000, top_n=3, save_path='pairs_cudf.npy')

Loading cached pairs from: pairs_cudf.npy


<h1> 아이템 페어 적용

In [20]:
transactions_combined = recommend_paired_items(transactions_combined, './pairs_cudf.npy')

<h1> 중복 데이터 제거

In [21]:
transactions_combined = transactions_combined.drop_duplicates(['customer_id', 'article_id'])

<h1> 최근 n 주간의 구매 트렌드 분석을 위한 변수 추가

In [22]:
transactions['week_number'] = ((transactions['t_dat'].max() - transactions['t_dat']).dt.days) // 7

In [23]:
transactions[transactions['week_number']==1]

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,week_number
31245333,2020-09-08,6210973690979042899,905365001,2020-09-09,1,1
31245334,2020-09-08,6210973690979042899,905365002,2020-09-09,1,1
31245335,2020-09-08,6210973690979042899,762846027,2020-09-09,1,1
31245336,2020-09-08,6210973690979042899,896169005,2020-09-09,1,1
31246676,2020-09-08,-280909329237915575,868134003,2020-09-09,1,1
...,...,...,...,...,...,...
31535491,2020-09-14,-5230895468037998334,926502001,2020-09-16,2,1
31535492,2020-09-14,-5230895468037998334,926502001,2020-09-16,2,1
31536973,2020-09-14,-5913474707398878237,920012003,2020-09-16,2,1
31536974,2020-09-14,-5913474707398878237,897738001,2020-09-16,2,1


In [24]:
transactions_combined

Unnamed: 0,customer_id,article_id
0,-7250849952617471376,570002001
1,-906958334866810496,852521001
2,-4549251887148392985,727880001
3,3949177095849657300,715255013
4,-7899046446501212654,728162001
...,...,...
921867,-2024381817713445244,778064028
921868,-2024381817713445244,821163008
921869,-5886955024125046945,568597006
921870,-5886955024125046945,507909001


In [25]:
test

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat
1,2018-09-20,7243449685764312450,629396001,2018-09-20,0
2,2018-09-20,7243449685764312450,556350001,2018-09-20,0
3,2018-09-20,7243449685764312450,585398005,2018-09-20,0
4,2018-09-20,7243449685764312450,661136002,2018-09-20,0
5,2018-09-20,7243449685764312450,623282005,2018-09-20,0
...,...,...,...,...,...
31788319,2020-09-22,8420097870129598355,674606048,2020-09-22,0
31788320,2020-09-22,-4081631389065695504,881111001,2020-09-22,0
31788321,2020-09-22,-4081631389065695504,881111001,2020-09-22,0
31788322,2020-09-22,-9090566858217444904,810169002,2020-09-22,0


<h1> 라벨링

In [26]:
target_label_week = 1

In [27]:

full_labeled_data = create_full_labels(transactions, transactions_combined, target_label_week)

In [28]:
target_label_week = 2

In [29]:
# NOTE(review): `test` is not assigned in any cell visible in this notebook, so
# this line fails on a fresh kernel (Restart & Run All). Define the candidate
# frame explicitly before labelling it.
test_data=create_full_labels(transactions, test, target_label_week)

In [30]:
articles=load_and_preprocess_articles()

Loading cached Parquet file: articles.pqt


In [31]:
customers=load_and_preprocess_customers()

Loading cached Parquet file: customers.pqt


In [32]:
full_labeled_data = full_labeled_data.merge(articles, on='article_id', how='left')

In [33]:
full_labeled_data = full_labeled_data.merge(customers, on='customer_id', how='left')

In [34]:
full_labeled_data.columns

Index(['customer_id', 'article_id', 'match', 'product_type_name',
       'product_group_name', 'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name', 'perceived_colour_master_name',
       'department_name', 'index_name', 'garment_group_name',
       'club_member_status', 'fashion_news_frequency', 'age', 'Active'],
      dtype='object')

In [35]:
feature_cols = [col for col in full_labeled_data.columns if col not in ['customer_id', 'article_id', 'match']]

X = full_labeled_data[feature_cols].to_pandas()
y = full_labeled_data['match'].to_pandas()


In [36]:
test_data

Unnamed: 0,t_dat,customer_id,article_id,max_dat,diff_dat,match
0,2018-09-22,-1277898007968919118,272591001,2018-09-22,0,0
1,2018-09-22,-1277898007968919118,539723005,2018-09-22,0,0
2,2018-09-22,-1277898007968919118,413707001,2018-09-22,0,0
3,2018-09-22,-1277898007968919118,519583008,2018-09-22,0,0
4,2018-09-22,-1277898007968919118,678073001,2018-09-22,0,0
...,...,...,...,...,...,...
4586888,2020-09-22,-8794222907624475748,839494001,2020-09-22,0,0
4586889,2020-09-22,-5944093268288223675,895610003,2020-09-22,0,0
4586890,2020-09-22,2266793672368155934,935092001,2020-09-22,0,0
4586891,2020-09-22,-4733711972209332303,715828037,2020-09-22,0,0


In [37]:
test_data = test_data.merge(customers, on='customer_id', how='left')

In [38]:
test_data = test_data.merge(articles, on='article_id', how='left')

In [39]:

X_test = test_data[feature_cols].to_pandas()
y_test = test_data['match'].to_pandas()


In [40]:
model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    max_depth=-1,
    random_state=42
)

In [41]:
cat_cols = X.select_dtypes(include='object').columns.tolist()
X[cat_cols] = X[cat_cols].astype('category')

model.fit(X, y, categorical_feature=cat_cols)


[LightGBM] [Info] Number of positive: 43884, number of negative: 842195
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 569
[LightGBM] [Info] Number of data points in the train set: 886079, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049526 -> initscore=-2.954462
[LightGBM] [Info] Start training from score -2.954462


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [42]:
cat_cols = X_test.select_dtypes(include='object').columns.tolist()
X_test[cat_cols] = X_test[cat_cols].astype('category')

In [43]:
y_pred = model.predict(X_test) 

In [44]:
y_test.value_counts()

match
0    4583088
1       3805
Name: count, dtype: int64

In [45]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], shape=(4586893,))

In [46]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.0006775067750677507
Recall: 0.00026281208935611036
F1 Score: 0.0003787161522438932


In [None]:
# NOTE(review): `balanced_labeled_data` is not assigned in any cell visible in
# this notebook and this cell has no execution count, so it looks like a
# leftover experiment. It would also refit `model` in place using the `y`
# built from `full_labeled_data` — confirm the rows line up before running.
cat_cols = balanced_labeled_data.select_dtypes(include='object').columns.tolist()
balanced_labeled_data[cat_cols] = balanced_labeled_data[cat_cols].astype('category')

model.fit(balanced_labeled_data, y, categorical_feature=cat_cols)


In [48]:
def apk(actual, predicted, k=12):
    """Average precision at k for a single query.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items, best first.
        k: Only the first ``k`` predictions are considered.

    Returns:
        Sum of precision values at each rank where a new relevant item is
        found, divided by ``min(len(actual), k)``. Returns 0.0 when ``actual``
        is empty. A relevant item repeated in ``predicted`` is credited once.
    """
    if not actual:
        return 0.0

    top_k = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Count a hit only the first time a relevant item appears.
        if item in actual and item not in top_k[:rank - 1]:
            num_hits += 1.0
            score += num_hits / rank
    return score / min(len(actual), k)


def mapk(actual_list, predicted_list, k=12):
    """Mean average precision at k over several queries.

    Args:
        actual_list: One collection of relevant items per query.
        predicted_list: One ranked prediction list per query, aligned with
            ``actual_list``.
        k: Cut-off passed to :func:`apk`.

    Returns:
        Mean of the per-query ``apk`` values, or 0.0 when ``actual_list`` is
        empty (this used to raise ZeroDivisionError).
    """
    if len(actual_list) == 0:
        return 0.0
    return sum(apk(a, p, k) for a, p in zip(actual_list, predicted_list)) / len(actual_list)

In [49]:
# Score all candidate rows with a single predict_proba call, then group on the
# host. The earlier per-customer loop filtered the GPU frame and invoked the
# model once per customer, which was slow enough to be interrupted.
# NOTE(review): `X` / `full_labeled_data` are the rows passed to model.fit
# above, so these figures describe the training data, not held-out data.
eval_rows = full_labeled_data[['customer_id', 'article_id', 'match']].to_pandas()
eval_rows['score'] = model.predict_proba(X.loc[eval_rows.index])[:, 1]

ap_scores = []
for cid, sample_data in eval_rows.groupby('customer_id', sort=False):
    sample_actual = sample_data.loc[sample_data['match'] == 1, 'article_id'].tolist()

    # Customers without any positive in the label week are skipped.
    if sample_actual:
        sample_topk = sample_data.sort_values('score', ascending=False).head(12)
        sample_pred = sample_topk['article_id'].tolist()
        sample_ap = apk(sample_actual, sample_pred, k=12)
        ap_scores.append(sample_ap)

        print("Customer ID:", cid)
        print("Predicted:", sample_pred)
        print("Actual:", sample_actual)
        print("MAP@12:", sample_ap)

if ap_scores:
    print("Mean MAP@12 over", len(ap_scores), "customers:", sum(ap_scores) / len(ap_scores))
        

Customer ID: -4910175418393404262
Predicted: [926164001, 887681001, 885951003, 834906012, 885951004, 855080008, 855080009, 829991003, 506098007, 855080001, 736923003, 800245002]
Actual: [885951002, 926164001, 885951004, 860797002, 885951001, 736923011, 885951003, 506098007, 855080008, 800245003]
MAP@12: 0.3488888888888889
Customer ID: -6630129866339193069
Predicted: [865430006, 881833002, 898233001, 856115001, 904230003, 855778002, 894752003, 893290001, 894752001, 884535001, 877666001, 882296001]
Actual: [884535001, 882296001, 865430006, 916497001, 849144002, 898676001, 882728001, 873604005, 882354001, 903960001, 855778002, 881833002, 785402001, 849114001, 825781005, 856310003, 898233001, 894752003, 901607003, 856115001, 894941003, 904230003, 877666001, 893290001, 894752001]
MAP@12: 1.0
Customer ID: 3392205661623200679
Predicted: [751471037, 898713001, 751471039, 751471001, 779781012]
Actual: [751471039, 898713001, 751471037, 751471001, 779781012]
MAP@12: 1.0
Customer ID: -169710223123

KeyboardInterrupt: 