In [15]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import cudf
import os

<h1> 트랜잭션 데이터 로드 함수

In [16]:
def load_and_preprocess_transactions(path='./transactions_train.csv', parquet_path='transactions.pqt'):
    """Load the transaction log as a cuDF frame, caching the result as Parquet.

    If ``parquet_path`` exists it is read and returned unchanged and the CSV is
    not touched. Otherwise the CSV is parsed, reduced to ``t_dat``,
    ``customer_id`` and ``article_id``, converted, written to ``parquet_path``
    and returned.

    Args:
        path: Location of the raw transactions CSV.
        parquet_path: Parquet cache location (read if present, written otherwise).

    Returns:
        cudf.DataFrame. When built from the CSV it holds ``t_dat`` (datetime),
        ``customer_id`` (int64) and ``article_id`` (int32); when loaded from
        the cache it holds whatever the cache file contains.
    """
    transaction_columns = [
        't_dat',
        'customer_id',
        'article_id'
    ]
    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    print(f"Processing raw CSV file: {path}")
    # Parse only the columns that are kept; any other column in the CSV is
    # never materialised. Re-indexing afterwards pins the column order to the
    # list above, because usecols keeps the file's own order.
    df = pd.read_csv(path, usecols=transaction_columns)[transaction_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    # The customer id is shortened to the integer value of its last 16 hex characters.
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['article_id'] = gdf['article_id'].astype('int32')
    gdf['t_dat'] = cudf.to_datetime(gdf['t_dat'])

    gdf.to_parquet(parquet_path, index=False)
    print(f"Saved processed data to: {parquet_path}")

    return gdf

<h1> 아이템 데이터 로드 함수

In [49]:
def load_and_preprocess_articles(path='./articles.csv', parquet_path='articles.pqt'):
    """Load article metadata as a cuDF frame, caching the result as Parquet.

    If ``parquet_path`` exists it is read and returned unchanged. Otherwise the
    CSV is parsed, reduced to the article id plus the descriptive columns
    listed below, converted, written to ``parquet_path`` and returned.

    Args:
        path: Location of the raw articles CSV.
        parquet_path: Parquet cache location (read if present, written otherwise).

    Returns:
        cudf.DataFrame with ``article_id`` as int32 and every descriptive
        column as string when built from the CSV; whatever the cache file
        contains when loaded from Parquet.
    """
    id_column = 'article_id'
    categorical_columns = [
        'product_type_name',
        'product_group_name',
        'graphical_appearance_name',
        'colour_group_name',
        'perceived_colour_value_name',
        'perceived_colour_master_name',
        'department_name',
        'index_name',
        'garment_group_name'
    ]
    selected_columns = [id_column] + categorical_columns

    # NOTE: a cache written before this fix still carries the old dtypes;
    # delete the file to rebuild it.
    if os.path.exists(parquet_path):
        return cudf.read_parquet(parquet_path)

    # Parse only the needed columns, then pin their order to the list above.
    df = pd.read_csv(path, usecols=selected_columns)[selected_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    # The id stays numeric so it has the same dtype as the int32 article_id of
    # the transaction frame it is joined to. It is deliberately not part of
    # the string-cast loop below, which previously turned it back into text.
    gdf[id_column] = gdf[id_column].astype('int32')
    for col in categorical_columns:
        gdf[col] = gdf[col].astype('str')

    gdf.to_parquet(parquet_path, index=False)
    return gdf

<h1> 유저정보 데이터 로드 함수

In [50]:
def load_and_preprocess_customers(path='./customers.csv', parquet_path='customers.pqt'):
    """Load customer attributes as a cuDF frame, caching the result as Parquet.

    If ``parquet_path`` exists it is read and returned unchanged. Otherwise the
    CSV is parsed, reduced to the columns listed below, converted, written to
    ``parquet_path`` and returned.

    Args:
        path: Location of the raw customers CSV.
        parquet_path: Parquet cache location (read if present, written otherwise).

    Returns:
        cudf.DataFrame with ``customer_id`` (int64), ``club_member_status`` and
        ``fashion_news_frequency`` (string), ``age`` bounded to [13, 100] and
        ``Active`` (bool) when built from the CSV; whatever the cache file
        contains when loaded from Parquet.
    """
    selected_customer_columns = [
        'customer_id',
        'club_member_status',
        'fashion_news_frequency',
        'age',
        'Active'
    ]

    if os.path.exists(parquet_path):
        return cudf.read_parquet(parquet_path)

    # Parse only the needed columns, then pin their order to the list above.
    df = pd.read_csv(path, usecols=selected_customer_columns)[selected_customer_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    # The customer id is shortened to the integer value of its last 16 hex characters.
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['club_member_status'] = gdf['club_member_status'].astype('str')
    gdf['fashion_news_frequency'] = gdf['fashion_news_frequency'].astype('str')
    # Bound age to [13, 100]: values failing each test are replaced by the bound.
    # NOTE(review): a missing age presumably fails `>= 13` and is therefore set
    # to 13 rather than staying missing - confirm that this is intended.
    gdf['age'] = gdf['age'].where(gdf['age'] >= 13, 13)
    gdf['age'] = gdf['age'].where(gdf['age'] <= 100, 100)
    # Missing 'Active' is treated as inactive.
    gdf['Active'] = gdf['Active'].fillna(0).astype(bool)

    gdf.to_parquet(parquet_path, index=False)
    return gdf

<h1> 최근 1주일 구매 데이터 필터링 함수

In [19]:

def filter_last_week_purchases(train):
    """Keep each customer's purchases from the final 7 days of their own history.

    The reference date is per customer (that customer's latest ``t_dat``), not
    a global cutoff.

    Args:
        train: Frame with at least ``customer_id`` and a datetime ``t_dat``.

    Returns:
        The matching rows of ``train`` with two extra columns: ``max_dat``
        (the customer's latest purchase date) and ``diff_dat`` (whole days
        between the row's date and ``max_dat``, at most 6).
    """
    last_purchase = (
        train.groupby('customer_id')['t_dat']
        .max()
        .reset_index()
        .rename(columns={'t_dat': 'max_dat'})
    )
    with_last = train.merge(last_purchase, on='customer_id', how='left')
    with_last['diff_dat'] = (with_last['max_dat'] - with_last['t_dat']).dt.days
    within_last_week = with_last['diff_dat'] <= 6
    return with_last[within_last_week]

<h1> 자주 구매한 아이템 추천 함수

In [20]:

def recommend_frequent_items(train):
    """Rank each customer's purchased articles by purchase count, then recency.

    Args:
        train: Frame with ``customer_id``, ``article_id`` and ``t_dat``.

    Returns:
        Frame with columns ``customer_id`` and ``article_id``, one row per
        distinct pair, ordered by descending purchase count and then by
        descending ``t_dat``.
    """
    pair_keys = ['customer_id', 'article_id']

    # How many times each customer bought each article.
    purchase_counts = (
        train.groupby(pair_keys)['t_dat']
        .agg('count')
        .reset_index()
        .rename(columns={'t_dat': 'ct'})
    )

    ranked = (
        train.merge(purchase_counts, on=pair_keys, how='left')
        .sort_values(['ct', 't_dat'], ascending=False)
        # After the sort, the first row of every pair is its most recent purchase.
        .drop_duplicates(pair_keys)
    )
    return ranked[pair_keys]

<h1> 함께 구매된 아이템 추천 함수

In [42]:
def recommend_paired_items(train, pair_dict_path):
    """Append co-purchased articles to each customer's candidate list.

    Every ``article_id`` in ``train`` is looked up in the pair dictionary
    stored at ``pair_dict_path``; each article found there is added as an
    extra candidate for the same customer.

    Args:
        train: cuDF frame with ``customer_id`` and ``article_id``. It is not
            modified.
        pair_dict_path: ``.npy`` file holding a pickled dict that maps an
            article id to a list of article ids.

    Returns:
        cuDF frame with columns ``customer_id`` and ``article_id``: the
        original pairs followed by the paired candidates. Paired candidates
        are de-duplicated among themselves but not against the original pairs.
    """
    # NOTE(security): allow_pickle=True unpickles the file, which can execute
    # arbitrary code. Only load dictionaries produced by a trusted source.
    pairs = np.load(pair_dict_path, allow_pickle=True).item()

    base = train[['customer_id', 'article_id']]

    # Work on a copy so the caller's frame does not gain a scratch column.
    paired = base.copy()
    paired['article_id2'] = paired['article_id'].map(pairs)
    paired = paired[['customer_id', 'article_id2']].dropna()
    # Drop articles whose dictionary entry is an empty list.
    paired = paired[paired['article_id2'].list.len() > 0]
    paired = paired.explode('article_id2')
    paired = paired.drop_duplicates(['customer_id', 'article_id2'])
    paired = paired.rename(columns={'article_id2': 'article_id'})

    return cudf.concat([base, paired], ignore_index=True)

<h1> 고객별 추천 문자열 생성 함수

In [22]:
def generate_prediction_strings(train_df):
    """Build one prediction string per customer.

    Each article id is rendered as ``' 0<id>'`` and the pieces belonging to a
    customer are concatenated.

    Args:
        train_df: Frame with ``customer_id`` and ``article_id``. It is not
            modified, so calling the function twice gives the same result.

    Returns:
        cudf.DataFrame with columns ``customer_id`` and ``prediction``.
    """
    # Format on a copy: writing the prefixed strings back into the caller's
    # frame would change its article_id column and double-prefix on a re-run.
    formatted = train_df[['customer_id', 'article_id']].copy()
    formatted['article_id'] = ' 0' + formatted['article_id'].astype(str)

    preds = formatted.groupby('customer_id')['article_id'].sum().reset_index()
    preds.columns = ['customer_id', 'prediction']
    return cudf.DataFrame(preds)

<h1> 인기 아이템 추출 함수

In [23]:
def get_top12_items(train_path, cutoff_date='2020-09-16'):
    """Return the 12 most purchased articles since ``cutoff_date`` as a prediction string.

    Args:
        train_path: Parquet file with at least ``t_dat`` and ``article_id``.
        cutoff_date: Only purchases on or after this date are counted.

    Returns:
        A string of ``' 0<article_id>'`` pieces, most purchased first. Fewer
        than 12 pieces if fewer distinct articles qualify, and an empty string
        if no purchase qualifies.
    """
    train = cudf.read_parquet(train_path)
    train['t_dat'] = cudf.to_datetime(train['t_dat'])
    recent = train[train['t_dat'] >= cudf.to_datetime(cutoff_date)]

    # Slice before leaving the GPU so only the 12 needed ids are copied to host.
    top_ids = recent['article_id'].value_counts().index[:12].to_pandas().astype(str)

    # Prefix each id individually: with no qualifying purchases this yields ''
    # instead of a dangling ' 0'.
    return ''.join(' 0' + article for article in top_ids)

<h1> 아이템 페어 사전 생성 함수

In [24]:
def generate_item_pairs_cudf(df, min_item_rank=1000, max_item_rank=5000, top_n=3, save_path='pairs_cudf.npy'):
    """Build and save an article -> co-purchased articles dictionary.

    Articles are ranked by how often they occur in ``df``. For each article in
    the slice ``[min_item_rank:max_item_rank]`` of that ranking, the other
    articles occurring for the same customers are counted, and up to ``top_n``
    of the most frequent ones are kept.

    Args:
        df: cuDF frame with ``customer_id`` and ``article_id``.
        min_item_rank: Start of the ranking slice (0-based, inclusive).
        max_item_rank: End of the ranking slice (exclusive).
        top_n: Maximum number of co-purchased articles kept per article.
        save_path: Where the dictionary is written with ``np.save``.

    Returns:
        dict mapping each selected article id to a list of article ids.
    """
    popularity = df['article_id'].value_counts()
    anchor_items = popularity.index[min_item_rank:max_item_rank].to_pandas().values

    pairs = {}
    for anchor in anchor_items:
        # Customers that have the anchor article.
        buyers = df[df['article_id'] == anchor]['customer_id'].unique()

        # Everything else those customers have, most frequent first.
        bought_by_buyers = df['customer_id'].isin(buyers)
        other_article = df['article_id'] != anchor
        ranked_ids = df[bought_by_buyers & other_article]['article_id'].value_counts().index

        # The slice simply returns everything when fewer than top_n ids exist.
        pairs[anchor] = ranked_ids[:top_n].to_pandas().tolist()

    np.save(save_path, pairs)
    return pairs


<h1> 데이터 라벨링 함수

In [25]:

def create_full_labels(transactions_df: cudf.DataFrame, candidate_df: cudf.DataFrame, label_week_number: int) -> cudf.DataFrame:
    """Attach a binary ``match`` label to every candidate pair.

    A candidate (customer_id, article_id) gets ``match == 1`` when that pair
    occurs in ``transactions_df`` with ``week_number == label_week_number``,
    and ``0`` otherwise.

    Args:
        transactions_df: Needs ``customer_id``, ``article_id``, ``week_number``.
        candidate_df: Needs ``customer_id`` and ``article_id``.
        label_week_number: Week whose purchases count as positives.

    Returns:
        ``candidate_df`` left-joined with the positives, plus an integer
        ``match`` column.

    Raises:
        TypeError: If either frame is not a ``cudf.DataFrame``.
        ValueError: If a required column is missing.
    """
    # Checks run in argument order so the first problem found is the one reported.
    type_checks = (
        (transactions_df, "Input transactions_df must be a cudf.DataFrame."),
        (candidate_df, "Input candidate_df must be a cudf.DataFrame."),
    )
    for frame, message in type_checks:
        if not isinstance(frame, cudf.DataFrame):
            raise TypeError(message)

    pair_keys = ["customer_id", "article_id"]
    if any(col not in transactions_df.columns for col in pair_keys + ["week_number"]):
        raise ValueError("transactions_df must contain 'customer_id', 'article_id', and 'week_number' columns.")
    if any(col not in candidate_df.columns for col in pair_keys):
        raise ValueError("candidate_df must contain 'customer_id' and 'article_id' columns.")

    # Distinct pairs purchased during the label week.
    in_label_week = transactions_df['week_number'] == label_week_number
    purchased_pairs = transactions_df[in_label_week][pair_keys].drop_duplicates()
    purchased_pairs["match"] = 1

    # Candidates without a purchase come out of the left join with a null label.
    labelled = candidate_df.merge(purchased_pairs, how="left", on=pair_keys)
    labelled["match"] = labelled["match"].fillna(0).astype(int)
    return labelled


<h1> 트랜잭션 데이터 로드

In [26]:
# Load the transaction frame with the default paths: from the Parquet cache if
# it exists, otherwise from the raw CSV (which also writes the cache).
transactions = load_and_preprocess_transactions()

Processing raw CSV file: ./transactions_train.csv
Saved processed data to: transactions.pqt


<h1> 최근 1주일 구매 데이터 필터링

In [27]:
# Keep, per customer, only the purchases made within 6 days of that customer's
# latest purchase.
# NOTE(review): this rebinds `transactions`, so the unfiltered frame is no
# longer available afterwards, and re-running this cell feeds it a frame that
# already carries the `max_dat` / `diff_dat` columns.
transactions = filter_last_week_purchases(transactions)

<h1> 사용자가 자주 구매한 아이템 뽑기

In [51]:
# One row per distinct (customer_id, article_id) pair, ordered by how often the
# customer bought the article and then by recency.
transactions_combined = recommend_frequent_items(transactions)

<h1> 아이템 페어 사전 생성

In [39]:
# Build the co-purchase dictionary from the de-duplicated pairs and save it to
# 'pairs_cudf.npy'. Only articles at positions 1000-2999 (0-based) of the
# frequency ranking get an entry, each with at most 3 partner articles.
pairs = generate_item_pairs_cudf(transactions_combined, min_item_rank=1000, max_item_rank=3000, top_n=3, save_path='pairs_cudf.npy')

<h1> 아이템 페어 적용

In [52]:
# Extend the candidates with the partner articles stored in the saved
# dictionary; the result is the original pairs followed by the paired ones.
transactions_combined= recommend_paired_items(transactions_combined, './pairs_cudf.npy')

<h1> 중복 데이터 제거

In [53]:
# A paired article may already be one of the customer's own purchases, so
# collapse the candidates to distinct (customer_id, article_id) pairs.
transactions_combined = transactions_combined.drop_duplicates(['customer_id', 'article_id'])

<h1> 최근 n 주간의 구매 트렌드 분석을 위한 변수 추가

In [54]:
# Weeks are counted backwards from the most recent purchase date in the frame:
# 0 covers the final 7 days, 1 the 7 days before that, and so on.
days_before_latest = (transactions['t_dat'].max() - transactions['t_dat']).dt.days
transactions['week_number'] = days_before_latest // 7

<h1> 라벨링

In [55]:
# Candidates whose pair was purchased in this week become the positives.
# NOTE(review): the candidates were generated from the same `transactions`
# frame that supplies the labels - confirm whether candidates should instead be
# built only from purchases made before the label week.
target_label_week = 2
full_labeled_data = create_full_labels(
    transactions_df=transactions,
    candidate_df=transactions_combined,
    label_week_number=target_label_week,
)

In [56]:
# Article metadata with the default paths (Parquet cache if present, else CSV).
articles=load_and_preprocess_articles()

In [57]:
# Customer attributes with the default paths (Parquet cache if present, else CSV).
customers=load_and_preprocess_customers()

In [58]:
# Add the article columns to every candidate row; the left join keeps
# candidates that have no matching article row.
# NOTE(review): re-running this cell merges the same columns a second time.
full_labeled_data = full_labeled_data.merge(articles, on='article_id', how='left')

In [59]:
# Add the customer columns to every candidate row; the left join keeps
# candidates that have no matching customer row.
# NOTE(review): re-running this cell merges the same columns a second time.
full_labeled_data = full_labeled_data.merge(customers, on='customer_id', how='left')

In [64]:
# Every column except the join keys and the label is used as a model input.
non_feature_cols = ('customer_id', 'article_id', 'match')
feature_cols = [col for col in full_labeled_data.columns if col not in non_feature_cols]

# pandas copies of the feature matrix and the label vector.
X = full_labeled_data[feature_cols].to_pandas()
y = full_labeled_data['match'].to_pandas()


In [65]:
# Binary gradient-boosted trees; the seed is fixed so runs are repeatable.
lgbm_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,  # -1 leaves tree depth unlimited
    'random_state': 42,
}
model = lgb.LGBMClassifier(**lgbm_params)

In [66]:
# Treat every string-valued column as categorical. Selecting 'category' as well
# keeps this cell idempotent: on a re-run the columns converted below are still
# found, instead of `cat_cols` coming back empty.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
X[cat_cols] = X[cat_cols].astype('category')

model.fit(X, y, categorical_feature=cat_cols)


[LightGBM] [Info] Number of positive: 181502, number of negative: 6463453
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.296198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 582
[LightGBM] [Info] Number of data points in the train set: 6644955, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.027314 -> initscore=-3.572652
[LightGBM] [Info] Start training from score -3.572652


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Score every candidate and keep each customer's 12 highest-scoring rows.
# NOTE(review): `X` is the matrix the model was fitted on, so these are
# in-sample scores.
full_labeled_data['score'] = model.predict_proba(X)[:, 1]
topk = (
    full_labeled_data
    .sort_values(['customer_id', 'score'], ascending=False)
    .groupby('customer_id')
    .head(12)
)

# Format the ids on a copy so `topk['article_id']` keeps its numeric dtype and
# stays comparable with the ids in `full_labeled_data`.
submission_rows = topk[['customer_id', 'article_id']].copy()
submission_rows['article_id'] = ' 0' + submission_rows['article_id'].astype(str)

submission = submission_rows.groupby('customer_id').apply(
    lambda df: ''.join(df['article_id'].to_pandas())
).reset_index().rename(columns={0: 'prediction'})

def apk(actual, predicted, k=12):
    """Average precision at ``k`` for a single customer.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked list of predicted items, best first.
        k: Number of leading predictions that are evaluated.

    Returns:
        Sum of precision-at-hit over the first ``k`` predictions, divided by
        ``min(len(actual), k)``. Returns 0.0 when there is nothing to score
        (``actual`` is empty or ``k`` is not positive).
    """
    denominator = min(len(actual), k)
    if denominator <= 0:
        # Nothing can be hit; without this guard the final division fails.
        return 0.0

    predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / denominator


def mapk(actual_list, predicted_list, k=12):
    """Mean of ``apk`` over paired lists of actual and predicted items.

    Args:
        actual_list: One collection of relevant items per customer.
        predicted_list: One ranked prediction list per customer, same order.
        k: Number of leading predictions evaluated per customer.

    Returns:
        The mean average precision, or 0.0 when ``actual_list`` is empty.
    """
    if len(actual_list) == 0:
        return 0.0
    return sum(apk(a, p, k) for a, p in zip(actual_list, predicted_list)) / len(actual_list)



In [None]:
# Ranked predictions per customer as integer article ids. Going through
# str -> strip -> int works whether `topk['article_id']` holds raw ids or
# ' 0'-prefixed strings, so both sides of the comparison share one type
# (a string never equals an integer id, which would force MAP@12 to 0).
# Row order within a customer is preserved, so the ranking is kept.
topk_pd = topk[['customer_id', 'article_id']].to_pandas()
topk_pd['article_id'] = topk_pd['article_id'].astype(str).str.strip().astype('int64')
predicted = topk_pd.groupby('customer_id', sort=False)['article_id'].agg(list).to_dict()

# Ground truth: the candidate pairs labelled as purchased.
positives_pd = full_labeled_data[full_labeled_data['match'] == 1][['customer_id', 'article_id']].to_pandas()
positives_pd['article_id'] = positives_pd['article_id'].astype('int64')
actual = positives_pd.groupby('customer_id', sort=False)['article_id'].agg(list).to_dict()

# Only customers that have both predictions and at least one positive are scored.
common_customers = set(predicted.keys()) & set(actual.keys())
pred_list = [predicted[c] for c in common_customers]
act_list = [actual[c] for c in common_customers]

print(submission.head())
print("MAP@12:", mapk(act_list, pred_list, k=12))