In [1]:
import pandas as pd
import numpy as np
import pickle
import ast
import torch
from itertools import product
import math

In [2]:
# Restore the three pickled evaluation inputs from the working directory.
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted source.
_restored = []
for _pkl_name in ("pos_items.pkl", "pos_len_list.pkl", "id.pkl"):
    with open(_pkl_name, "rb") as f:
        _restored.append(pickle.load(f))
pos_items, pos_len_list, id_list = _restored

In [3]:
# Number of entries in pos_items (one per zip() step in the hit-matrix cell below).
len(pos_items)

192403

# Compute CV
* topk_index: top_items

In [4]:
import numpy as np
def ndcg_(pos_index, pos_len):
    r"""Compute the user-averaged NDCG@k for every cutoff k = 1..K.

    NDCG_ (normalized discounted cumulative gain) is a measure of ranking quality.
    Normalizing each user's DCG by the best DCG that user could reach makes the
    scores comparable across users, so they can be averaged over the test set.

    .. _NDCG: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG

    .. math::
        \begin{gather}
            \mathrm {DCG@K}=\sum_{i=1}^{K} \frac{2^{rel_i}-1}{\log_{2}{(i+1)}}\\
            \mathrm {IDCG@K}=\sum_{i=1}^{\min(K, n_u)}\frac{1}{\log_{2}{(i+1)}}\\
            \mathrm {NDCG_u@K}=\frac{DCG_u@K}{IDCG_u@K}\\
            \mathrm {NDCG@K}=\frac{\sum \nolimits_{u \in U^{te}} NDCG_u@K}{|U^{te}|}
        \end{gather}

    :math:`K` stands for recommending :math:`K` items.
    :math:`rel_i` is the relevance of the item in position :math:`i` of the
    recommendation list: 1 if the item is a hit, otherwise 0 (so
    :math:`2^{rel_i}-1` is also 1 or 0).
    :math:`n_u` is the number of positive items of user :math:`u` and
    :math:`U^{te}` is the set of all users in the test set.

    Args:
        pos_index (np.ndarray): matrix of shape ``(n_users, K)``; a truthy entry
            at ``[u, i]`` means the item recommended to user ``u`` at rank
            ``i + 1`` is a hit.
        pos_len (array-like of int): number of positive items per user, one
            entry per row of ``pos_index``.

    Returns:
        np.ndarray: vector of length ``K``; entry ``k - 1`` is NDCG@k averaged
        over all rows of ``pos_index``.
    """
    pos_index = np.asarray(pos_index)
    n_cols = pos_index.shape[1]

    # Discount 1 / log2(rank + 1) for ranks 1..K, shared by DCG and IDCG.
    discounts = 1.0 / np.log2(np.arange(1, n_cols + 1, dtype=float) + 1)
    ideal_cumulative = np.cumsum(discounts)

    # A user cannot have more hits than positives, nor more than K.
    idcg_len = np.minimum(np.asarray(pos_len, dtype=np.intp).reshape(-1), n_cols)

    # Column of ``ideal_cumulative`` at which each user's IDCG stops growing.
    # Users with zero positives use the last column, which is what the previous
    # implementation produced through negative-index wraparound; their DCG is
    # 0 whenever pos_index is consistent with pos_len, so their NDCG is 0.
    plateau = np.where(idcg_len > 0, idcg_len - 1, n_cols - 1)

    # idcg[u, i] = ideal_cumulative[min(i, plateau[u])], gathered for all users
    # at once instead of patching each row in a Python loop.
    gather_cols = np.minimum(np.arange(n_cols)[None, :], plateau[:, None])
    idcg = ideal_cumulative[gather_cols]

    dcg = np.cumsum(np.where(pos_index, discounts, 0.0), axis=1)

    result = dcg / idcg
    return result.mean(axis=0)


"""Function name and function mapper.
Useful when we have to serialize evaluation metric names
and call the functions based on deserialized names
"""
metrics_dict = {
    'ndcg': ndcg_,
}

def _calculate_metrics(pos_len_list, topk_index, metric_names=None):
    """Evaluate every requested top-k metric and stack the results.

    Args:
        pos_len_list: per-user value forwarded as the second argument of each
            metric function (``ndcg_`` uses it as the number of positive items
            of each user).
        topk_index (np.ndarray): matrix forwarded as the first argument of each
            metric function. Despite the name, the caller in this notebook
            passes a boolean hit matrix, not item indices.
        metric_names (iterable of str, optional): names to evaluate, looked up
            case-insensitively in ``metrics_dict``. When omitted, the
            notebook-level ``metrics`` list is used, which must then already
            be defined when the function is called.

    Returns:
        np.ndarray: one row per metric, in the order of ``metric_names``.

    Raises:
        KeyError: if a name has no entry in ``metrics_dict``.
    """
    if metric_names is None:
        # Backward-compatible fallback to the global configured further down
        # in the notebook.
        metric_names = metrics
    result_list = [
        metrics_dict[name.lower()](topk_index, pos_len_list)
        for name in metric_names
    ]
    return np.stack(result_list, axis=0)

# 순서 Ensemble

In [6]:
# Recommendation lists to ensemble, one CSV per model run (two SLMRec and two
# BM3 runs, judging by the file names). The order here fixes df1..df4.
csv_paths = [
    "./../csv/SLMRec-inha-Aug-07-2023-04-19-09-['learning_rate', 'ssl_temp', 'ssl_alpha', 'reg', 'seed']-(0.0001, 1.0, 0.01, 0.01, 999)-100.csv",
    "./../csv/BM3-inha-idx0-top100-ndcg50-0.0-Aug-06-2023-23-29-56.csv",
    "./../csv/BM3-inha-Aug-04-2023-21-33-10-['n_layers', 'reg_weight', 'dropout', 'seed']-(1, 0.01, 0.3, 999)-100.csv",
    "./../csv/SLMRec-inha-Aug-07-2023-02-27-12-['learning_rate', 'ssl_temp', 'ssl_alpha', 'reg', 'seed']-(0.0001, 1.0, 0.01, 0.0001, 999)-100.csv",
]

new_dfs = [pd.read_csv(path) for path in csv_paths]
df1, df2, df3, df4 = new_dfs

# 추천 순서(rank)에 따라서 weight 부여 (item 빈도 기반 weight는 현재 비활성화)

In [7]:
for new_df in new_dfs:
    # Downcast the id columns to int32.
    new_df['user_id'] = new_df['user_id'].astype('int32')
    new_df['item_id'] = new_df['item_id'].astype('int32')

    # Exponential rank weight inside each user group: the user's last row gets
    # 2**0, the row before it 2**1, and so on up to the user's first row
    # (rows are presumably stored best-first per user; verify against the CSVs).
    # np.ldexp(1.0, n) is exactly 2.0**n and runs vectorized, replacing a
    # per-row math.pow call through Series.apply. int32 exponents keep the
    # ufunc signature portable across platforms.
    rows_below = new_df.groupby('user_id').cumcount(ascending=False)
    new_df['order_weight'] = np.ldexp(1.0, rows_below.to_numpy(dtype=np.int32))
    # Linear alternative kept for reference:
    # new_df['order_weight'] = new_df.groupby('user_id').cumcount(ascending=False) + 1

# Disabled experiment: an extra item-frequency weight (log1p of the counts of
# the 100 most frequent items in item_df, 0 for all other items) was added to
# order_weight as a 'final_weight' column. It is not part of this ensemble.

# Show the result.
for i, new_df in enumerate(new_dfs):
    print(f'new_df{i+1}:\n', new_df)

new_df1:
           user_id  item_id  order_weight
0               0    38629  6.338253e+29
1               0     2489  3.169127e+29
2               0    26494  1.584563e+29
3               0    44623  7.922816e+28
4               0    48094  3.961408e+28
...           ...      ...           ...
19240295   192402    24959  1.600000e+01
19240296   192402    16595  8.000000e+00
19240297   192402    29321  4.000000e+00
19240298   192402     8087  2.000000e+00
19240299   192402    45587  1.000000e+00

[19240300 rows x 3 columns]
new_df2:
           user_id  item_id  order_weight
0               0    52521  6.338253e+29
1               0     4393  3.169127e+29
2               0    33659  1.584563e+29
3               0     8198  7.922816e+28
4               0    56414  3.961408e+28
...           ...      ...           ...
19240295   192402    62490  1.600000e+01
19240296   192402    31422  8.000000e+00
19240297   192402    51081  4.000000e+00
19240298   192402    24452  2.000000e+00
19240299

# dataframe 합친 후 order_weight 평균

In [8]:
# Stack the per-model frames vertically into one long frame of
# (user_id, item_id, order_weight) rows.
df_concat = pd.concat(new_dfs)

In [9]:
# Display the concatenated frame.
df_concat

Unnamed: 0,user_id,item_id,order_weight
0,0,38629,6.338253e+29
1,0,2489,3.169127e+29
2,0,26494,1.584563e+29
3,0,44623,7.922816e+28
4,0,48094,3.961408e+28
...,...,...,...
19240295,192402,24959,1.600000e+01
19240296,192402,16595,8.000000e+00
19240297,192402,29321,4.000000e+00
19240298,192402,8087,2.000000e+00


In [10]:
import gc
# Drop every remaining reference to the per-model frames. new_dfs and
# df1..df4 all point at the same four objects, so leaving df3/df4 alive (as
# before) kept half of them in memory. df_concat holds the data needed below.
del new_dfs, df1, df2, df3, df4
gc.collect()

0

In [11]:
# Average the rank weight of every (user, item) pair over the rows in which
# the pair occurs.
# NOTE(review): the mean only covers models that recommended the pair, so an
# item ranked first by one model scores higher than an item ranked first and
# second by two models; confirm that mean (rather than sum) is intended.
df_grouped = df_concat.groupby(['user_id', 'item_id'])['order_weight'].mean().reset_index()

del df_concat

# Keep the 50 highest-weighted items of each user, best first.
# Sorting once and taking the head of each group replaces
# groupby.apply(nlargest): it avoids one Python call per user and does not rely
# on apply() receiving the grouping column, which recent pandas deprecates.
# Ties in order_weight keep their df_grouped row order.
df_top_50 = (
    df_grouped
    .sort_values(['user_id', 'order_weight'], ascending=[True, False], kind='stable')
    .groupby('user_id', sort=False)
    .head(50)
    .reset_index(drop=True)
)

In [12]:
# id_list exposes .numpy(); work with the plain array from here on.
id_list_np = id_list.numpy()

# Lookup table giving the position ('index') of each user id inside id_list.
df_id_list = pd.DataFrame({'index': np.arange(len(id_list_np)), 'user_id': id_list_np})

# Attach that position to every top-50 row through a left join on user_id.
df_sorted = df_top_50.merge(df_id_list, how='left', on='user_id')

del df_top_50

# Order users as they appear in id_list and, within a user, by descending
# order_weight; the helper 'index' column is dropped afterwards.
df_sorted = (
    df_sorted
    .sort_values(by=['index', 'order_weight'], ascending=[True, False])
    .reset_index(drop=True)
    .drop(columns=['index'])
)

In [13]:
# Display the frame ordered by id_list position and descending order_weight.
df_sorted

Unnamed: 0,user_id,item_id,order_weight
0,114341,36933,4.753690e+29
1,114341,14265,3.169320e+29
2,114341,26442,1.287458e+29
3,114341,1175,1.213181e+29
4,114341,40057,7.922848e+28
...,...,...,...
9620145,43567,19862,1.180592e+21
9620146,43567,3533,5.903318e+20
9620147,43567,5345,5.902958e+20
9620148,43567,11423,5.902958e+20


In [14]:
# DataFrame을 numpy 행렬로 변환합니다.
user_ids = df_sorted['user_id'].unique()
item_matrix = df_sorted['item_id'].values.reshape(len(user_ids), 50)

print(item_matrix)  # 변환된 행렬을 출력합니다.

[[36933 14265 26442 ... 32830 11627 13591]
 [10994 36933  3323 ... 37667  6131  9719]
 [37753 45064 10994 ... 51479 55313 26072]
 ...
 [10511 25900 56586 ... 12192 33172 50301]
 [ 1344 57043 53162 ...  7379  3533 40179]
 [ 2163 23590 34381 ...  5345 11423 58033]]


In [15]:
# Number of items in the first user's row.
len(item_matrix[0])

50

# topk_index 만들기

In [16]:
# Metric names to evaluate; _calculate_metrics reads this global and looks the
# lower-cased names up in metrics_dict.
metrics = ['NDCG']
# Cutoffs k at which each metric is reported.
topk_list = [50]
# Per-user ranked item ids (alias of item_matrix, not a copy).
topk_index = item_matrix

In [17]:
# Ranked item ids of the first row.
topk_index[0]

array([36933, 14265, 26442,  1175, 40057, 14145, 10501, 61767, 24993,
        7426,  1005, 22110, 57843, 55181, 19715, 61855,  3141, 50223,
       12348, 22402,  3297, 24190, 55313, 58883, 59882, 52201, 37952,
        3323,  5362, 28054, 31090, 51409, 32873, 40667, 11643, 35996,
       57367, 33423, 43120, 25791,  4556, 41630, 35273, 26866, 40783,
       13736, 34106, 32830, 11627, 13591], dtype=int32)

In [18]:
# Hit matrix: entry [u, r] is True when the item at rank r of row u of
# topk_index is contained in the matching entry of pos_items. Rows are paired
# positionally by zip().
bool_rec_matrix = np.asarray([
    [item in positives for item in recommended]
    for positives, recommended in zip(pos_items, topk_index)
])

# Collect '<metric>@<k>' -> score rounded to 4 decimals.
metric_dict = {}
result_list = _calculate_metrics(pos_len_list, bool_rec_matrix)
for metric_name, scores_by_k in zip(metrics, result_list):
    metric_dict.update({
        '{}@{}'.format(metric_name, cutoff): round(scores_by_k[cutoff - 1], 4)
        for cutoff in topk_list
    })

ndcg_at_50 = metric_dict.get('NDCG@50')

In [19]:
# NOTE(review): a score of exactly 0.0 means the rounded mean is zero, i.e.
# (almost) no recommended item matched pos_items. Verify that pos_items and
# the CSV item ids use the same id space and the same user order.
ndcg_at_50

0.0

In [20]:
# All collected '<metric>@<k>' scores.
metric_dict

{'NDCG@50': 0.0}

# save topk_index ( k=50)

In [21]:
max_k = 50

# Wide table of the top-k item ids: an 'id' column taken from id_list followed
# by top_0 .. top_{max_k-1}, all cast to int.
top_columns = ['top_' + str(rank) for rank in range(max_k)]
x_df = pd.DataFrame(topk_index)
x_df.insert(0, 'id', id_list)
x_df.columns = ['id'] + top_columns
x_df = x_df.astype(int)
# x_df.to_csv(file_path, sep='\t', index=False)

In [22]:
# Spot check: the wide row of user id 114341.
x_df[x_df['id']==114341]

Unnamed: 0,id,top_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,...,top_40,top_41,top_42,top_43,top_44,top_45,top_46,top_47,top_48,top_49
0,114341,36933,14265,26442,1175,40057,14145,10501,61767,24993,...,4556,41630,35273,26866,40783,13736,34106,32830,11627,13591


In [23]:
# melt 함수를 사용하여 재구성
reshaped_df = x_df.melt(id_vars=['id'], value_vars=[f'top_{i}' for i in range(50)], value_name='top_items')

# 'variable' column을 숫자로 변환
reshaped_df['variable'] = reshaped_df['variable'].str.replace('top_', '').astype(int)

# 'id'와 'variable' 컬럼 기준으로 오름차순 정렬
sorted_df = reshaped_df.sort_values(by=['id', 'variable']).reset_index(drop=True).drop(columns=['variable'])

# print(sorted_df)

# submission 준비

In [24]:
# Submission frame: the two columns of sorted_df renamed to the
# user_id / item_id header.
submission = sorted_df[['id', 'top_items']].rename(
    columns={'id': 'user_id', 'top_items': 'item_id'}
)

In [25]:
# Display the submission frame.
submission

Unnamed: 0,user_id,item_id
0,0,38629
1,0,2489
2,0,12124
3,0,52521
4,0,26494
...,...,...
9620145,192402,50478
9620146,192402,39826
9620147,192402,51536
9620148,192402,22354


In [26]:
# Sanity check: list every (user_id, item_id) pair that occurs at least twice;
# an empty Series means no user received the same item more than once.
pair_counts = submission.groupby('user_id')['item_id'].value_counts()
counts = pair_counts.loc[pair_counts.ge(2)]
print(counts)

Series([], Name: count, dtype: int64)


In [27]:
# Write the submission to the working directory without the row index.
submission.to_csv('0807_Ensemble_BM3_SLMRec_ver2.csv', index=False)