In [1]:
import pandas as pd
import numpy as np
import cudf
import os

<h1> 트랜잭션 데이터 로드 함수

In [2]:
def load_and_preprocess_transactions(path='./transactions_train.csv', parquet_path='transactions.pqt'):
    transaction_columns = [
        't_dat',
        'customer_id',
        'article_id'
    ]
    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)
    
    
    print(f"Processing raw CSV file: {path}")
    df = pd.read_csv(path)
    df = df[transaction_columns]
    
    gdf = cudf.DataFrame.from_pandas(df)
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['article_id'] = gdf['article_id'].astype('int32')
    gdf['t_dat'] = cudf.to_datetime(gdf['t_dat'])
    
    gdf.to_parquet(parquet_path, index=False)
    print(f"Saved processed data to: {parquet_path}")
    
    return gdf

<h1> 아이템 데이터 로드 함수

In [3]:
def load_and_preprocess_articles(path='./articles.csv', parquet_path='articles.pqt'):
    categorical_columns = [
        'article_id',
        'product_type_name',
        'product_group_name',
        'graphical_appearance_name',
        'colour_group_name',
        'perceived_colour_value_name',
        'perceived_colour_master_name',
        'department_name',
        'index_name',
        'garment_group_name'
    ]

    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    df = pd.read_csv(path)
    df = df[categorical_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    gdf['article_id'] = gdf['article_id'].astype('int32')
    for col in categorical_columns:
        gdf[col] = gdf[col].astype('str')

    gdf.to_parquet(parquet_path, index=False)
    return gdf

<h1> 유저정보 데이터 로드 함수

In [4]:
def load_and_preprocess_customers(path='./customers.csv', parquet_path='customers.pqt'):
    selected_customer_columns = [
        'customer_id',
        'club_member_status',
        'fashion_news_frequency',
        'age',
        'Active'
    ]

    if os.path.exists(parquet_path):
        print(f"Loading cached Parquet file: {parquet_path}")
        return cudf.read_parquet(parquet_path)

    df = pd.read_csv(path)
    df = df[selected_customer_columns]

    gdf = cudf.DataFrame.from_pandas(df)
    gdf['customer_id'] = gdf['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    gdf['club_member_status'] = gdf['club_member_status'].astype('str')
    gdf['fashion_news_frequency'] = gdf['fashion_news_frequency'].astype('str')
    gdf['age'] = gdf['age'].where(gdf['age'] >= 13, 13)
    gdf['age'] = gdf['age'].where(gdf['age'] <= 100, 100)
    gdf['Active'] = gdf['Active'].fillna(0).astype(bool)

    gdf.to_parquet(parquet_path, index=False)
    return gdf

In [5]:
articles=load_and_preprocess_articles()

Loading cached Parquet file: articles.pqt


In [6]:
customers=load_and_preprocess_customers()

Loading cached Parquet file: customers.pqt


In [7]:
transactions=load_and_preprocess_transactions()

Loading cached Parquet file: transactions.pqt


<h1> 최근 1주일 구매 데이터 필터링 함수

In [8]:
def filter_last_week_purchases(train):
    tmp = train.groupby('customer_id').t_dat.max().reset_index()
    tmp.columns = ['customer_id', 'max_dat']
    train = train.merge(tmp, on='customer_id', how='left')
    train['diff_dat'] = (train.max_dat - train.t_dat).dt.days
    return train[train['diff_dat'] <= 7]

<h1> 자주 구매한 아이템 추천 함수

In [9]:

def recommend_frequent_items(train):
    tmp = train.groupby(['customer_id', 'article_id'])['t_dat'].agg('count').reset_index()
    tmp.columns = ['customer_id', 'article_id', 'ct']
    train = train.merge(tmp, on=['customer_id', 'article_id'], how='left')
    train = train.sort_values(['ct', 't_dat'], ascending=False)
    train = train.drop_duplicates(['customer_id', 'article_id'])
    return train[['customer_id', 'article_id']]

<h1> 함께 구매된 아이템 추천 함수

In [10]:
def recommend_paired_items(train, pair_dict_path):
    pairs = np.load(pair_dict_path, allow_pickle=True).item()
    train['article_id2'] = train['article_id'].map(pairs)
    train2 = train[['customer_id', 'article_id2']].dropna()
    train2 = train2[train2['article_id2'].list.len() > 0]
    train2 = train2.explode('article_id2')
    train2 = train2.drop_duplicates(['customer_id', 'article_id2'])
    train2 = train2.rename(columns={'article_id2': 'article_id'})
    return cudf.concat([train[['customer_id', 'article_id']], train2], ignore_index=True)

<h1> 고객별 추천 문자열 생성 함수

In [11]:
def generate_prediction_strings(train_df):
    train_df['article_id'] = ' 0' + train_df['article_id'].astype(str)
    preds = train_df.groupby('customer_id')['article_id'].sum().reset_index()
    preds.columns = ['customer_id', 'prediction']
    return cudf.DataFrame(preds)

<h1> 인기 아이템 추출 함수

In [12]:
def get_top12_items(train_path, cutoff_date='2020-09-16'):
    train = cudf.read_parquet(train_path)
    train.t_dat = cudf.to_datetime(train.t_dat)
    train = train[train.t_dat >= cudf.to_datetime(cutoff_date)]
    top12 = ' 0' + ' 0'.join(train.article_id.value_counts().to_pandas().index.astype(str)[:12])
    return top12

<h1> 아이템 페어 사전 생성 함수

In [13]:
def generate_item_pairs_cudf(df, min_item_rank=1000, max_item_rank=5000, top_n=3, save_path='pairs_cudf.npy'):
    if os.path.exists(save_path):
        print(f"Loading cached pairs from: {save_path}")
        return np.load(save_path, allow_pickle=True).item()
    vc = df['article_id'].value_counts()
    target_items = vc.index[min_item_rank:max_item_rank].to_pandas().values
    
    pairs = {}

    for i in target_items:
        users = df[df['article_id'] == i]['customer_id'].unique()
        co_items = df[(df['customer_id'].isin(users)) & (df['article_id'] != i)]['article_id']
        co_counts = co_items.value_counts()

        if len(co_counts) >= top_n:
            pairs[i] = co_counts.index[:top_n].to_pandas().tolist()
        else:
            pairs[i] = co_counts.index.to_pandas().tolist()

    np.save(save_path, pairs)
    return pairs


<h1> 데이터 라벨링 함수

In [14]:

def create_full_labels(transactions_df: cudf.DataFrame, candidate_df: cudf.DataFrame, label_week_number: int) -> cudf.DataFrame:
    
    positive_labels_df = transactions_df[transactions_df['week_number'] == label_week_number].copy()
    positive_labels_df = positive_labels_df[["customer_id", "article_id"]].drop_duplicates()
    positive_labels_df["match"] = 1

    labeled_candidates_df = candidate_df.merge(positive_labels_df, how="left", on=["customer_id", "article_id"])
    labeled_candidates_df["match"] = labeled_candidates_df["match"].fillna(0).astype(int)

    return labeled_candidates_df


<h1> 최근 1주일 구매 데이터 필터링

In [15]:
transactions= filter_last_week_purchases(transactions)

In [16]:
# 1. 고객별 등장 횟수 계산
counts = transactions['customer_id'].value_counts()

# 2. 30회 이상 등장한 고객만 필터링
valid_ids = counts[counts > 100].index

# 3. 원본 데이터에서 해당 고객만 추출
filtered = transactions[transactions['customer_id'].isin(valid_ids)]


In [17]:
filtered=filtered[['customer_id', 'article_id']]

In [18]:
filtered['customer_id'].unique()

0     2399959835489063121
1     1177544493873523124
2      657699922079694914
3      875937969512579611
4     8936835422870337874
5    -1106694820500549775
6    -7250849952617471376
7    -4132058122892733136
8     5367623498777542563
9    -7480916235333800764
10   -5253228977262187067
11   -4601407992705575197
12    -394771176629714848
13    4801599675390043558
14    2174037403290139027
15    8821185501661401907
16    3979856084354100577
17   -6418561962546478385
18    3078333695715817805
19   -8729471104332666510
20    4741411091101275745
21   -1541792417288283097
22    6045499144411732060
23    1363801644058974650
24   -6637316028259518039
25   -6313430495078977632
26   -1742215151385577582
27   -1347257129369757303
28    2861268073324966212
29    4615355821391819566
30   -5787374227437117595
31   -6762542213259176601
Name: customer_id, dtype: int64

In [19]:
merged = cudf.merge(filtered, articles, on='article_id', how='left')
print(merged)


              customer_id  article_id product_type_name  product_group_name  \
0     8821185501661401907   788261004          Vest top  Garment Upper body   
1     8821185501661401907   788261004          Vest top  Garment Upper body   
2     8821185501661401907   788261004          Vest top  Garment Upper body   
3     8821185501661401907   788261004          Vest top  Garment Upper body   
4     8821185501661401907   788261004          Vest top  Garment Upper body   
...                   ...         ...               ...                 ...   
4071  6045499144411732060   817150002            Blouse  Garment Upper body   
4072  6045499144411732060   817150002            Blouse  Garment Upper body   
4073  6045499144411732060   865086002               Bra           Underwear   
4074  6045499144411732060   865086002               Bra           Underwear   
4075  6045499144411732060   865086002               Bra           Underwear   

     graphical_appearance_name colour_group_name pe

In [20]:
merged = cudf.merge(merged, customers, on='customer_id', how='left')
print(merged)


              customer_id  article_id product_type_name  product_group_name  \
0    -5787374227437117595   817067002            Blouse  Garment Upper body   
1    -5787374227437117595   707717006            Blouse  Garment Upper body   
2    -5787374227437117595   786197005             Skirt  Garment Lower body   
3    -5787374227437117595   872233003             Dress   Garment Full body   
4    -5787374227437117595   884319005            Blouse  Garment Upper body   
...                   ...         ...               ...                 ...   
4071  2174037403290139027   755616004             Dress   Garment Full body   
4072  2174037403290139027   755616004             Dress   Garment Full body   
4073  2174037403290139027   755616004             Dress   Garment Full body   
4074  2174037403290139027   755616004             Dress   Garment Full body   
4075  2174037403290139027   759871015          Vest top  Garment Upper body   

     graphical_appearance_name colour_group_name pe

In [21]:
merged['label'] = 1

customers = merged['customer_id'].unique().to_pandas().values
articles = merged['article_id'].unique().to_pandas().values

neg_samples = []
for c in customers:
    bought = merged[merged['customer_id'] == c]['article_id'].to_pandas().values
    not_bought = np.setdiff1d(articles, bought)
    if len(not_bought) > 0:
        # 랜덤 선택 범위를 넓혀서 구매한 개수의 3배까지 샘플링
        sampled = np.random.choice(
            not_bought,
            size=min(len(bought) * 3, len(not_bought)),
            replace=False
        )
        for a in sampled:
            neg_samples.append([c, a, 0])

neg_df = cudf.DataFrame(neg_samples, columns=['customer_id','article_id','label'])
dataset = cudf.concat([merged, neg_df], ignore_index=True)

print(dataset)



               customer_id  article_id product_type_name  product_group_name  \
0     -5787374227437117595   817067002            Blouse  Garment Upper body   
1     -5787374227437117595   707717006            Blouse  Garment Upper body   
2     -5787374227437117595   786197005             Skirt  Garment Lower body   
3     -5787374227437117595   872233003             Dress   Garment Full body   
4     -5787374227437117595   884319005            Blouse  Garment Upper body   
...                    ...         ...               ...                 ...   
16299  2174037403290139027   762096004              <NA>                <NA>   
16300  2174037403290139027   573414005              <NA>                <NA>   
16301  2174037403290139027   688537004              <NA>                <NA>   
16302  2174037403290139027   815168001              <NA>                <NA>   
16303  2174037403290139027   706016002              <NA>                <NA>   

      graphical_appearance_name colour_

In [22]:
merged['label'] = 1

customers = merged['customer_id'].unique().to_pandas().values
articles = merged['article_id'].unique().to_pandas().values

neg_samples = []
for c in customers:
    bought = merged[merged['customer_id'] == c]['article_id'].to_pandas().values
    not_bought = np.setdiff1d(articles, bought)
    if len(not_bought) > 0:
        sampled = np.random.choice(not_bought, size=min(len(bought), len(not_bought)), replace=False)
        for a in sampled:
            neg_samples.append([c, a, 0])

neg_df = cudf.DataFrame(neg_samples, columns=['customer_id','article_id','label'])
dataset = cudf.concat([merged, neg_df], ignore_index=True)
print(dataset)


              customer_id  article_id product_type_name  product_group_name  \
0    -5787374227437117595   817067002            Blouse  Garment Upper body   
1    -5787374227437117595   707717006            Blouse  Garment Upper body   
2    -5787374227437117595   786197005             Skirt  Garment Lower body   
3    -5787374227437117595   872233003             Dress   Garment Full body   
4    -5787374227437117595   884319005            Blouse  Garment Upper body   
...                   ...         ...               ...                 ...   
8147  2174037403290139027   750921001              <NA>                <NA>   
8148  2174037403290139027   852535001              <NA>                <NA>   
8149  2174037403290139027   732671003              <NA>                <NA>   
8150  2174037403290139027   536139054              <NA>                <NA>   
8151  2174037403290139027   581298006              <NA>                <NA>   

     graphical_appearance_name colour_group_name pe

<h1> 사용자가 자주 구매한 아이템 뽑기

In [34]:
from category_encoders import TargetEncoder


In [36]:
dataset['customer_count'] = dataset.groupby('customer_id')['article_id'].transform('count')
dataset['item_count'] = dataset.groupby('article_id')['customer_id'].transform('count')

exclude_cols = ['customer_id', 'article_id', 'label']
feature_cols = [col for col in dataset.columns if col not in exclude_cols]

X = dataset[feature_cols].to_pandas()
y = dataset['label'].to_pandas()

In [38]:
encoder = TargetEncoder()
X_encoded = encoder.fit_transform(X, y)


In [40]:
X.describe()

Unnamed: 0,age,customer_count,item_count
count,4076.0,8152.0,8152.0
mean,37.84053,269.857704,12.631992
std,13.009165,72.120126,13.322698
min,19.0,204.0,1.0
25%,27.0,218.0,7.0
50%,35.0,232.0,9.0
75%,44.0,312.0,13.0
max,74.0,458.0,83.0


In [44]:
X_encoded.fillna(X_encoded.median(), inplace=True)


In [45]:
X_encoded

Unnamed: 0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,garment_group_name,club_member_status,fashion_news_frequency,age,Active,customer_count,item_count
0,1.000000,1.0,1.000000,1.000000,1.0,1.000000,1.000000,1.0,1.00000,1.0,1.0,47.0,1.0,206,5
1,1.000000,1.0,1.000000,0.901092,1.0,0.993307,1.000000,1.0,1.00000,1.0,1.0,47.0,1.0,206,4
2,0.999444,1.0,0.999695,0.999995,1.0,1.000000,0.901092,1.0,0.99008,1.0,1.0,47.0,1.0,206,9
3,1.000000,1.0,1.000000,0.999998,1.0,1.000000,1.000000,1.0,1.00000,1.0,1.0,47.0,1.0,206,3
4,1.000000,1.0,1.000000,0.999998,1.0,1.000000,1.000000,1.0,1.00000,1.0,1.0,47.0,1.0,206,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8147,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,35.0,0.0,262,10
8148,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,35.0,0.0,262,5
8149,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,35.0,0.0,262,14
8150,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,35.0,0.0,262,14


In [48]:
X=X_encoded

In [49]:
import cudf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, log_loss, matthews_corrcoef, confusion_matrix,
    balanced_accuracy_score, brier_score_loss
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb


In [51]:
X_train

Unnamed: 0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,garment_group_name,club_member_status,fashion_news_frequency,age,Active,customer_count,item_count
7216,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,250,8
7929,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,312,22
4950,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,250,5
1634,0.999989,1.0,1.000000,1.000000,1.0,1.0,0.962071,1.000000,0.997517,1.0,1.0,40.0,1.0,262,9
5907,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,214,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,388,13
5390,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,224,11
860,1.000000,1.0,0.799344,0.996654,1.0,1.0,0.995024,0.999995,1.000000,1.0,1.0,48.0,1.0,208,12
7603,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,35.0,0.0,212,6


In [52]:
X_test

Unnamed: 0,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,garment_group_name,club_member_status,fashion_news_frequency,age,Active,customer_count,item_count
5814,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,35.0,0.0,308,5
483,0.799344,0.962071,1.0,0.999995,1.0,1.0,0.954439,1.0,1.000000,1.0,1.0,74.0,1.0,388,18
7606,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,35.0,0.0,212,11
3939,1.000000,1.000000,1.0,0.999995,1.0,1.0,1.000000,1.0,1.000000,1.0,1.0,40.0,1.0,262,8
3821,0.591213,1.000000,1.0,1.000000,1.0,1.0,0.737510,1.0,1.000000,1.0,1.0,24.0,1.0,250,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5058,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,35.0,0.0,262,14
2867,0.644525,0.999996,1.0,1.000000,1.0,1.0,0.799344,1.0,0.999996,1.0,1.0,23.0,1.0,206,13
293,1.000000,1.000000,1.0,1.000000,1.0,1.0,0.999989,1.0,1.000000,1.0,1.0,24.0,1.0,250,83
5958,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,35.0,0.0,220,5


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100),
    
    "KNeighbors": KNeighborsClassifier(),
    "LightGBM": None
}

results = {}

# 학습 및 평가
for name, model in models.items():
    if name == "LightGBM":
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'is_unbalance': True,
            'verbosity': -1
        }
        lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=50)
        y_pred = lgb_model.predict(X_test)
        y_pred_binary = (y_pred > 0.5).astype(int)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]
        y_pred_binary = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred_binary)
    prec = precision_score(y_test, y_pred_binary)
    rec = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)
    roc_auc = roc_auc_score(y_test, y_pred)
    logloss = log_loss(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred_binary)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_binary)
    brier = brier_score_loss(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred_binary)

    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-score": f1,
        "ROC AUC": roc_auc,
        "Log Loss": logloss,
        "MCC": mcc,
        "Balanced Accuracy": balanced_acc,
        "Brier Score": brier,
        "Confusion Matrix": conf_matrix.tolist()
    }

# 결과 출력
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value}")


LogisticRegression:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 0.00039807643618970624
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 1.8570862223182904e-07
  Confusion Matrix: [[790, 0], [0, 841]]
RandomForest:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 2.2204460492503136e-16
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 0.0
  Confusion Matrix: [[790, 0], [0, 841]]
GradientBoosting:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 2.1906938138531577e-05
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 4.79929726995088e-10
  Confusion Matrix: [[790, 0], [0, 841]]
KNeighbors:
  Accuracy: 0.9981606376456161
  Precision: 0.9964454976303317
  Recall: 1.0
  F1-score: 0.9982195845697329
  ROC AUC: 0.9993670886075949
  Log Loss: 0.024097701538017147
  MCC: 0.9963240072316665
  Balanced Accuracy: 0.9981012658227848
  Brier Score: 0.0013488657265481301

In [30]:
import cudf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb

dataset['customer_count'] = dataset.groupby('customer_id')['article_id'].transform('count')
dataset['item_count'] = dataset.groupby('article_id')['customer_id'].transform('count')

exclude_cols = ['customer_id', 'article_id', 'label']
feature_cols = [col for col in dataset.columns if col not in exclude_cols]

X = dataset[feature_cols].to_pandas()
y = dataset['label'].to_pandas()


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {

    "LightGBM": None
}

results = {}

for name, model in models.items():
    if name == "LightGBM":
        train_data = lgb.Dataset(X_train, label=y_train)
        test_data = lgb.Dataset(X_test, label=y_test)
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'is_unbalance': True
        }
        lgb_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=50)
        y_pred = lgb_model.predict(X_test)
        y_pred_binary = (y_pred > 0.5).astype(int)
    else:
        model.fit(X_train, y_train)
        y_pred_binary = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred_binary)
    prec = precision_score(y_test, y_pred_binary)
    rec = recall_score(y_test, y_pred_binary)
    f1 = f1_score(y_test, y_pred_binary)

    results[name] = {"Accuracy": acc, "Precision": prec, "Recall": rec, "F1-score": f1}

for model_name, metrics in results.items():
    print(model_name, metrics)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: product_type_name: object, product_group_name: object, graphical_appearance_name: object, colour_group_name: object, perceived_colour_value_name: object, perceived_colour_master_name: object, department_name: object, index_name: object, garment_group_name: object, club_member_status: object, fashion_news_frequency: object, Active: object

In [29]:
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value}")

LogisticRegression:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 0.0014182742376269881
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 5.249428668780503e-06
  Confusion Matrix: [[790, 0], [0, 841]]
RandomForest:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 4.95480823808053e-05
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 9.809932556713691e-07
  Confusion Matrix: [[790, 0], [0, 841]]
GradientBoosting:
  Accuracy: 1.0
  Precision: 1.0
  Recall: 1.0
  F1-score: 1.0
  ROC AUC: 1.0
  Log Loss: 2.1906938138531577e-05
  MCC: 1.0
  Balanced Accuracy: 1.0
  Brier Score: 4.79929726995088e-10
  Confusion Matrix: [[790, 0], [0, 841]]
KNeighbors:
  Accuracy: 0.9981606376456161
  Precision: 0.997624703087886
  Recall: 0.9988109393579072
  F1-score: 0.9982174688057041
  ROC AUC: 0.9993384909465826
  Log Loss: 0.02750330750999037
  MCC: 0.9963182841136548
  Balanced Accuracy: 0.9981396468941435
  Brier S