In [17]:
import pandas as pd
import numpy as np

# 1. Data Load

## rating data

In [18]:
df_ratings = pd.read_csv('./data/AMAZON_FASHION.csv', names=['asin','reviewerID','rating','unixReviewTime'])
df_ratings

Unnamed: 0,asin,reviewerID,rating,unixReviewTime
0,7106116521,A1D4G1SNUZWQOT,5.0,1413763200
1,7106116521,A3DDWDH9PX2YX2,2.0,1411862400
2,7106116521,A2MWC41EW7XL15,4.0,1408924800
3,7106116521,A2UH2QQ275NV45,2.0,1408838400
4,7106116521,A89F3LQADZBS5,3.0,1406419200
...,...,...,...,...
883631,B01HJHTH5U,A1ZSB2Q144UTEY,5.0,1487635200
883632,B01HJHTH5U,A2CCDV0J5VB6F2,5.0,1480032000
883633,B01HJHTH5U,A3O90PACS7B61K,3.0,1478736000
883634,B01HJHF97K,A2HO94I89U3LNH,3.0,1478736000


In [19]:
print("상품 unique : ", df_ratings['asin'].nunique())
print("리뷰어 unique : ", df_ratings['reviewerID'].nunique())

상품 unique :  186189
리뷰어 unique :  749233


## metadata

In [20]:
df_meta = pd.read_json('./data/meta_AMAZON_FASHION.json', lines=True)
df_meta = df_meta[['asin','title','imageURLHighRes']]
# imageURLHighRes가 있는 row들만 필터링
df_meta = df_meta[~df_meta['imageURLHighRes'].isna()]
# df_meta

Unnamed: 0,asin,title,imageURLHighRes
0,0764443682,Slime Time Fall Fest [With CDROM and Collector...,[https://images-na.ssl-images-amazon.com/image...
1,1291691480,XCC Qi promise new spider snake preparing men'...,[https://images-na.ssl-images-amazon.com/image...
2,1940280001,Magical Things I Really Do Do Too!,[https://images-na.ssl-images-amazon.com/image...
3,1940735033,"Ashes to Ashes, Oranges to Oranges",[https://images-na.ssl-images-amazon.com/image...
4,1940967805,Aether & Empire #1 - 2016 First Printing Comic...,[https://images-na.ssl-images-amazon.com/image...
...,...,...,...
186632,B01HJGXL4O,JT Women's Elegant Off Shoulder Chiffon Maxi L...,[https://images-na.ssl-images-amazon.com/image...
186633,B01HJHF97K,Microcosm Retro Vintage Black Crochet Lace One...,[https://images-na.ssl-images-amazon.com/image...
186634,B01HJGJ9LS,Lookatool Classic Plain Vintage Army Military ...,[https://images-na.ssl-images-amazon.com/image...
186635,B01HJHTH5U,Edith Windsor Women's Deep V-neck Beaded Sequi...,[https://images-na.ssl-images-amazon.com/image...


In [5]:
df_meta[df_meta['asin'] == 'B00KW4LCCE']['title']

48898    Colorful Peacock Bird Wing Y Bib Collar Necklace
Name: title, dtype: object

In [None]:
df_ratings[df_ratings['reviewerID'] == 'A3G5KDMFNRUXHB']

## rating & meta data merge

### rating table 평점 5개 이상인 유저만 필터링

In [21]:
# 'reviewerID' 기준으로 5 이상인 값을 가지는 유저를 필터링
## 3개로 진행해서 다시 test
reviewer_counts = df_ratings['reviewerID'].value_counts()
reviewer_filter = reviewer_counts[reviewer_counts >= 5].index # 5 이상으로 필터링하기로 협의

# 필터링 조건을 만족하는 데이터 추출
filtered_df = df_ratings[df_ratings['reviewerID'].isin(reviewer_filter)]

In [22]:
print("상품 unique : ", filtered_df['asin'].nunique())
print("리뷰어 unique : ", filtered_df['reviewerID'].nunique())

상품 unique :  13197
리뷰어 unique :  3718


In [23]:
# rating이 5개 이상인 유저로 필터링된 데이터에서 이미지 url이 존재하는 아이템이 있는지 메타데이터에서 동일한 값 찾기
common_values = np.intersect1d(df_meta['asin'].unique(), filtered_df['asin'].unique())

# 동일한 값의 개수 확인
num_common_values = len(common_values)

# 결과 출력
print("두 배열에서 동일한 값:", common_values)
print("동일한 값의 개수:", num_common_values)

두 배열에서 동일한 값: ['7106116521' 'B00008JPRZ' 'B00012O2RY' ... 'B01HJEOBUO' 'B01HJG5NLI'
 'B01HJGJ9LS']
동일한 값의 개수: 10681


In [24]:
# 메타데이터에서 rating데이터와 공통으로 존재하는 아이템을 필터링
df_meta_filtered = df_meta[df_meta['asin'].isin(common_values)]

In [25]:
#merge inner
df_meta_rating = pd.merge(df_meta_filtered, filtered_df)
df_meta_rating = df_meta_rating[df_meta_rating['rating'] >= 4.0]

In [26]:
# Nan값 확인
df_meta_rating.isna().sum()

asin               0
title              0
imageURLHighRes    0
reviewerID         0
rating             0
unixReviewTime     0
dtype: int64

In [27]:
df_meta_rating['reviewerID'].value_counts()

reviewerID
A3JBQHQZEZPQK4    33
A1RRX286ZRI830    24
ALFRMOGTO1K4M     22
A3CZ0K3S0BXHKE    21
AENH50GW3OKDA     20
                  ..
A2E1QSIUA6H0XW     1
A1USW773RKOSKG     1
A1DU6H3WIN1G0T     1
A3T5H74TOGU049     1
A2Y0U0RAIFFART     1
Name: count, Length: 3556, dtype: int64

In [28]:
# df_meta_rating merge 한 것에서 다시 reviewerID가 5 이상인 것들로 필터링
df_meta_rating_counts = df_meta_rating['reviewerID'].value_counts()
df_meta_rating_filter = df_meta_rating_counts[df_meta_rating_counts >=4].index
filtered_df_meta_rating = df_meta_rating[df_meta_rating['reviewerID'].isin(df_meta_rating_filter)]

In [29]:
filtered_df_meta_rating.shape

(12690, 6)

In [30]:
filtered_df_meta_rating.to_csv('/home/siyun/ephemeral/lightgcn/data/rating_filtered_4.csv', index=False)

## 최종 데이터셋

In [31]:
print("상품 unique : ", filtered_df_meta_rating['asin'].nunique())
print("리뷰어 unique : ", filtered_df_meta_rating['reviewerID'].nunique())
print(f"sparsity :  {1-len(filtered_df_meta_rating) / (filtered_df_meta_rating['asin'].nunique()*filtered_df_meta_rating['reviewerID'].nunique()):.15f}")

상품 unique :  6484
리뷰어 unique :  2195
sparsity :  0.999108371193012


# 2. train test split

In [32]:
filtered_df_meta_rating.rating.unique()

array([5., 4.])

In [33]:
# 전처리
# 1. rating 값을 1로 전부 변환
filtered_df_meta_rating['rating'] = 1
# 모델 input으로 사용할 column
df_input = filtered_df_meta_rating[['asin','reviewerID','rating', 'unixReviewTime']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_meta_rating['rating'] = 1


In [62]:
df_input

Unnamed: 0,asin,reviewerID,rating,unixReviewTime
0,7106116521,AD0OENWU7N4L6,1,1437436800
1,B00008JPRZ,A281NL3AO4QIGE,1,1508630400
2,B00008JPRZ,A281NL3AO4QIGE,1,1508630400
5,B0001F331C,A2M1IZTVL7TMC0,1,1352937600
6,B0001F331C,A2M1IZTVL7TMC0,1,1352937600
...,...,...,...,...
20769,B01HHSZEQC,A2FV2M9MG76NB7,1,1483401600
20772,B01HI84VBA,AZD378NGJIDQH,1,1471824000
20775,B01HIRHI2K,A2UU7363R2ESCN,1,1506902400
20778,B01HJEOBUO,A21DXQFAP5TTHY,1,1471219200


In [63]:
df_input.to_csv('/home/siyun/ephemeral/BERT4Rec/data/rating4_bert.csv', index=False)

df_input.to_csv('/home/siyun/ephemeral/lightgcn/data/rating_4_and_1.csv', index=False)

In [36]:
df_input.asin.nunique()

6484

In [37]:
df_input.reviewerID.nunique()

2195

In [38]:
user_list = df_input['asin'].unique()
user_list.shape

(6484,)

In [39]:
# train test split
from sklearn.model_selection import train_test_split

# 각 사용자별로 데이터를 분할하기 위해 사용자 리스트 생성
user_list = df_input['reviewerID'].unique()

# 각 사용자별로 train, validation, test set으로 나누기
train_data = pd.DataFrame()
# valid_data = pd.DataFrame()
test_data = pd.DataFrame()

for user_id in user_list:
    user_data = df_input[df_input['reviewerID'] == user_id]
    # train_user, temp_user = train_test_split(user_data, test_size=0.4, random_state=42)
    # valid_user, test_user = train_test_split(temp_user, test_size=0.5, random_state=42)
    train_user, test_user = train_test_split(user_data, test_size=0.1, random_state=42)
    # print(train_user)

    train_data = pd.concat([train_data, train_user])
    # valid_data = pd.concat([valid_data, valid_user])
    test_data = pd.concat([test_data, test_user])

# 결과 출력
print("\nTrain set:")
print(train_data)
# print("\nValidation set:")
# print(valid_data)
print("\nTest set:")
print(test_data)


Train set:
             asin      reviewerID  rating  unixReviewTime
12060  B00V5BV778   AD0OENWU7N4L6       1      1436400000
1282   B0015ZD6HI   AD0OENWU7N4L6       1      1311638400
0      7106116521   AD0OENWU7N4L6       1      1437436800
4241   B009PMJZUA   AD0OENWU7N4L6       1      1365552000
2      B00008JPRZ  A281NL3AO4QIGE       1      1508630400
...           ...             ...     ...             ...
19487  B01EJ333P2   AE4AXSBD2GBRI       1      1467936000
20098  B01FV2C23E   AE4AXSBD2GBRI       1      1467936000
20198  B01G2K47JQ   AWSSM21BOTF5P       1      1486339200
20194  B01G2K2YBE   AWSSM21BOTF5P       1      1471219200
20197  B01G2K458Y   AWSSM21BOTF5P       1      1471219200

[10433 rows x 4 columns]

Test set:
             asin      reviewerID  rating  unixReviewTime
1281   B0015ZD6HI   AD0OENWU7N4L6       1      1311638400
1      B00008JPRZ  A281NL3AO4QIGE       1      1508630400
6      B0001F331C  A2M1IZTVL7TMC0       1      1352937600
1537   B001BFLRU0  A3FO

In [40]:
# train dataset csv로 저장
train_data.to_csv('./data/rating_trainset_amazon_fashion_4.csv',index=False)

In [41]:
# test dataset csv로 저장
test_data.to_csv('./data/rating_testset_amazon_fashion_4.csv',index=False)

# 3. EASE Model

In [65]:
import sys
import logging

import torch
import pandas as pd


class TorchEASE:
    def __init__(
        self, train, user_col="user_id", item_col="item_id", score_col=None, reg=300
    ):
        """

        :param train: Training DataFrame of user, item, score(optional) values
        :param user_col: Column name for users
        :param item_col: Column name for items
        :param score_col: Column name for scores. Implicit feedback otherwise
        :param reg: Regularization parameter
        """
        logging.basicConfig(
            format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
            level=logging.INFO,
            datefmt="%Y-%m-%d %H:%M:%S",
            stream=sys.stdout,
        )

        self.logger = logging.getLogger("notebook")
        self.logger.info("Building user + item lookup")
        # How much regularization do you need?
        self.reg = reg

        self.user_col = user_col
        self.item_col = item_col

        self.user_id_col = user_col + "_id"
        self.item_id_col = item_col + "_id"

        self.user_lookup = self.generate_labels(train, self.user_col)
        self.item_lookup = self.generate_labels(train, self.item_col)

        self.item_map = {}
        self.logger.info("Building item hashmap")
        for _item, _item_id in self.item_lookup.values:
            self.item_map[_item_id] = _item

        train = pd.merge(train, self.user_lookup, on=[self.user_col])
        train = pd.merge(train, self.item_lookup, on=[self.item_col])
        self.logger.info("User + item lookup complete")
        self.indices = torch.LongTensor(
            train[[self.user_id_col, self.item_id_col]].values
        )

        if not score_col:
            # Implicit values only
            self.values = torch.ones(self.indices.shape[0])
        else:
            self.values = torch.FloatTensor(train[score_col])
        # TODO: Is Sparse the best implementation?

        self.sparse = torch.sparse.FloatTensor(self.indices.t(), self.values)

        self.logger.info("Sparse data built")

    def generate_labels(self, df, col):
        dist_labels = df[[col]].drop_duplicates()
        dist_labels[col + "_id"] = dist_labels[col].astype("category").cat.codes

        return dist_labels

    def fit(self):
        self.logger.info("Building G Matrix")
        self.G = self.sparse.to_dense().t() @ self.sparse.to_dense()
        self.G += torch.eye(self.G.shape[0]) * self.reg

        self.P = self.G.inverse()

        self.logger.info("Building B matrix")
        B = self.P / (-1 * self.P.diag())
        # Set diagonals to 0. TODO: Use .fill_diag_
        B = B + torch.eye(B.shape[0])

        # Predictions for user `_u` will be self.sparse.to_dense()[_u]@self.B
        self.B = B

        return

    def fine_tune(self, new_data):
        """
        Fine-tune the model with new data.

        :param new_data: New training DataFrame of user, item, score(optional) values
        """
        self.logger.info("Fine-tuning model with new data")

        # Update indices and values with new data
        new_user_ids = self.generate_labels(new_data, self.user_col)
        new_item_ids = self.generate_labels(new_data, self.item_col)

        new_data = pd.merge(new_data, new_user_ids, on=[self.user_col])
        new_data = pd.merge(new_data, new_item_ids, on=[self.item_col])

        new_indices = torch.LongTensor(new_data[[self.user_id_col, self.item_id_col]].values)


        new_values = torch.ones(new_indices.shape[0])
      

        # Update sparse matrix
        self.indices = torch.cat((self.indices, new_indices), dim=0)
        self.values = torch.cat((self.values, new_values), dim=0)
        self.sparse = torch.sparse.FloatTensor(self.indices.t(), self.values)

        # Update user lookup
        new_user_lookup = self.generate_labels(new_data, self.user_col)
        self.user_lookup = pd.concat([self.user_lookup, new_user_lookup]).drop_duplicates()

        # Re-calculate G matrix
        self.logger.info("Updating G Matrix")
        new_data_tensor = torch.sparse.FloatTensor(self.indices.t(), self.values)
        G_update = torch.matmul(new_data_tensor.to_dense().t(), new_data_tensor.to_dense())
        G_update += torch.eye(G_update.shape[0]) * self.reg

        # Ensure G_update has the same size as self.G
        if G_update.shape != self.G.shape:
            self.logger.error("Shape mismatch between G_update and self.G")
            print("G_update.shape", G_update.shape)
            print("self.G.shape", self.G.shape)
            return

        self.G = G_update

        # Re-calculate B matrix
        self.logger.info("Updating B Matrix")
        P = self.G.inverse()
        B = P / (-1 * P.diag())
        B += torch.eye(B.shape[0])
        self.B = B

        self.logger.info("Fine-tuning complete.")

        return

    def predict_all(self, pred_df, k=5, remove_owned=True):
        """
        :param pred_df: DataFrame of users that need predictions
        :param k: Number of items to recommend to each user
        :param remove_owned: Do you want previously interacted items included?
        :return: DataFrame of users + their predictions in sorted order
        """
        pred_df = pred_df[[self.user_col]].drop_duplicates()
        n_orig = pred_df.shape[0]

        # Alert to number of dropped users in prediction set
        pred_df = pd.merge(pred_df, self.user_lookup, on=[self.user_col])
        n_curr = pred_df.shape[0]
        if n_orig - n_curr:
            self.logger.info(
                "Number of unknown users from prediction data = %i" % (n_orig - n_curr)
            )

        _output_preds = []
        # Select only user_ids in our user data
        _user_tensor = self.sparse.to_dense().index_select(
            dim=0, index=torch.LongTensor(pred_df[self.user_id_col])
        )

        # Make our (raw) predictions
        _preds_tensor = _user_tensor @ self.B
        self.logger.info("Predictions are made")
        if remove_owned:
            # Discount these items by a large factor (much faster than list comp.)
            self.logger.info("Removing owned items")
            _preds_tensor += -1.0 * _user_tensor

        self.logger.info("TopK selected per user")
        for _preds in _preds_tensor:
            # Very quick to use .topk() vs. argmax()
            _output_preds.append(
                [self.item_map[_id] for _id in _preds.topk(k).indices.tolist()]
            )

        pred_df["predicted_items"] = _output_preds
        self.logger.info("Predictions are returned to user")
        return pred_df

# 4. Metric

recall

In [43]:
def recall_at_k(ground_truth, predicted_items, k=10):
    """
    Calculate Recall@k
    Parameters:
        ground_truth (dict): 실제 사용자 평가 데이터. {userid: [itemid1, itemid2, ...]}
        predicted_items (dict): 모델이 예측한 아이템 목록. {userid: [predicted_itemid1, predicted_itemid2, ...]}
        k (int): 상위 k개의 예측 아이템을 사용하여 Recall 계산
    Returns:
        recall (float): Recall@k
    """
    total_recall = 0
    total_users = len(ground_truth)

    for user_id, true_items in ground_truth.items():
        predicted = predicted_items.get(user_id, [])[:k]
        true_positives = len(set(predicted) & set(true_items))
        total_recall += true_positives / len(true_items) if len(true_items) > 0 else 0

    recall = total_recall / total_users
    return recall

precision

In [44]:
def precision_at_k(ground_truth,predicted_items, k=10):
    """_summary_

    Args:
        ground_truth (_type_): _description_
        predicted_items (_type_): _description_
        k (int, optional): _description_. Defaults to 10.
    Returns :
        precision
    """
    total_precision = 0
    total_users = len(ground_truth)
    
    for user_id, true_items in ground_truth.items() :
        predicted = predicted_items.get(user_id, [])[:k]
        true_positives = len(set(predicted) & set(true_items))
        total_precision += true_positives / k if k > 0 else 0
        
    precision = total_precision / total_users
    return precision
        

f1

In [45]:
def f1_score(ground_truth,predicted_items, k=10) :
    recall = recall_at_k(ground_truth,predicted_items, k)
    precision = precision_at_k(ground_truth,predicted_items, k)
    
    if precision + recall > 0 :
        f1_score = 2 * (precision * recall) / (precision + recall)
    else :
        f1_score = 0
    return f1_score

ndcg

In [46]:
import numpy as np

def dcg_at_k(r,k) :
    r = np.asfarray(r)[:k]
    if r.size :
        return np.sum(r/np.log2(np.arange(2, r.size + 2)))
    return 0.0

def ndcg_at_k(ground_truth,predicted_items, k=10) :
    total_ndcg = 0
    total_users = len(ground_truth)
    
    for user, true_items in ground_truth.items() :
        predicted = predicted_items.get(user, [])[:k]
        ideal = dcg_at_k(sorted([1 if item in true_items else 0 for item in predicted],reverse=True),k)
        actual_dcg = dcg_at_k([1 if item in true_items else 0 for item in predicted], k)
        total_ndcg += (actual_dcg / ideal) if ideal > 0 else 0

    ndcg = total_ndcg / total_users
    return ndcg
        

diversity

In [47]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_diversity_ease(predicted_items, item_vectors):
    vectors = [item_vectors[item] for item in predicted_items if item in item_vectors]
    if len(vectors) < 2:
        return 0

    similarity_matrix = cosine_similarity(vectors)
    
    # 상위 삼각행렬의 값만 추출하여 평균 유사도 계산
    upper_triangle_indices = np.triu_indices_from(similarity_matrix, k=1)
    avg_similarity = np.mean(similarity_matrix[upper_triangle_indices])
    
    diversity = 1 - avg_similarity
    
    return diversity

# 5. train

In [76]:
model = TorchEASE(train_data, user_col="reviewerID", item_col="asin", reg=400)

2024-03-26 17:38:43 [INFO] notebook - Building user + item lookup
2024-03-26 17:38:43 [INFO] notebook - Building item hashmap
2024-03-26 17:38:43 [INFO] notebook - User + item lookup complete
2024-03-26 17:38:43 [INFO] notebook - Sparse data built


In [77]:
%%time
model.fit()

2024-03-26 17:38:46 [INFO] notebook - Building G Matrix
2024-03-26 17:38:48 [INFO] notebook - Building B matrix
CPU times: user 17.2 s, sys: 1.66 s, total: 18.9 s
Wall time: 2.57 s


# 6. predict

In [78]:
%%time
out = model.predict_all(test_data, k=10)

2024-03-26 17:38:50 [INFO] notebook - Predictions are made
2024-03-26 17:38:50 [INFO] notebook - Removing owned items
2024-03-26 17:38:50 [INFO] notebook - TopK selected per user
2024-03-26 17:38:50 [INFO] notebook - Predictions are returned to user
CPU times: user 4.3 s, sys: 298 ms, total: 4.59 s
Wall time: 609 ms


In [None]:
out

# 7. Evaluate

In [79]:
# 중복되는 reviewerID를 가진 행을 그룹화하고 asin 값을 리스트로 수집하여 딕셔너리에 저장
grouped = test_data.groupby('reviewerID')['asin'].apply(list).reset_index(name='true_item')
# 원래 데이터프레임과 병합하여 새로운 true_item 컬럼을 추가
test_data = pd.merge(test_data, grouped, on='reviewerID', how='left')
# 데이터프레임을 딕셔너리로 변환
result_dict = test_data.set_index('reviewerID').to_dict()['true_item']
print(result_dict)

MergeError: Passing 'suffixes' which cause duplicate columns {'true_item_x'} is not allowed.

In [80]:
result_pred = out.set_index('reviewerID').to_dict()['predicted_items']
# result_pred

In [81]:
# Recall@10 계산
predicted_items = result_pred
ground_truth = result_dict
recall = recall_at_k(ground_truth, predicted_items, k=10)
precision = precision_at_k(ground_truth, predicted_items, k=10)
f1 = f1_score(ground_truth, predicted_items, k=10)
ndcg = ndcg_at_k(ground_truth, predicted_items, k=10)

print(f"recall : {recall}, precision : {precision}, f1 : {f1}, ndcg : {ndcg}")


# ease의 경우는 item vector로 임베딩화 시킨 후 계싼해야함.
# b = model.B
# calculate_diversity_ease(predicted_items, item_vectors)

# print(f"diversity : {diversity}")

recall : 0.19047076689445708, precision : 0.019362186788155048, f1 : 0.03515110950467661, ndcg : 0.1740839062171303


In [50]:
result_pred['A3G5KDMFNRUXHB']

['B00LMUA8MC',
 'B00LMUA6C4',
 'B00KW4LCCE',
 'B00LMU8WSE',
 'B00E1L9QOU',
 'B00LMU7WUI',
 'B00KT1267K',
 'B00KA3ROE2',
 'B00LMU9NE6',
 'B00KA3WMJY']

In [49]:
test_data[['asin','reviewerID']].loc[test_data['reviewerID']=='A3G5KDMFNRUXHB']

Unnamed: 0,asin,reviewerID
1751,B00VL50JXQ,A3G5KDMFNRUXHB
1752,B00NL59JSA,A3G5KDMFNRUXHB
1753,B00GWSXSEE,A3G5KDMFNRUXHB
1754,B01DXTQ3H8,A3G5KDMFNRUXHB
1755,B01H5XMDMC,A3G5KDMFNRUXHB
1756,B018FVB6LW,A3G5KDMFNRUXHB
1757,B00KREP4NW,A3G5KDMFNRUXHB
1758,B01AZ31G2C,A3G5KDMFNRUXHB


In [20]:
tmp_data = test_data[['asin','reviewerID']].loc[test_data['reviewerID']=='A2CFCXKCAWPIXQ']

In [23]:
tmp_data['reviewerID'] = 'A2CFCXKCAWPIXQ_test'

In [26]:
tmp_data

Unnamed: 0,asin,reviewerID
18545,B01CS4MRKQ,A2CFCXKCAWPIXQ_test
18592,B01CV70HWU,A2CFCXKCAWPIXQ_test


In [154]:
model.fine_tune(tmp_data)

2024-03-16 16:10:21 [INFO] notebook - Fine-tuning model with new data
2024-03-16 16:10:21 [INFO] notebook - Updating G Matrix
G_update.shape torch.Size([7060, 7060])
self.G.shape torch.Size([7060, 7060])
2024-03-16 16:10:22 [INFO] notebook - Updating B Matrix
2024-03-16 16:10:25 [INFO] notebook - Fine-tuning complete.


In [156]:
%%time
tmp_new_result = model.predict_all(tmp_data, k=10)

2024-03-16 16:14:14 [INFO] notebook - Predictions are made
2024-03-16 16:14:14 [INFO] notebook - Removing owned items
2024-03-16 16:14:14 [INFO] notebook - TopK selected per user
2024-03-16 16:14:14 [INFO] notebook - Predictions are returned to user
CPU times: user 95 ms, sys: 55.9 ms, total: 151 ms
Wall time: 28.3 ms


In [71]:
%%time
tmp = model.predict_all(test_data[['asin','reviewerID']].loc[test_data['reviewerID']=='AD0OENWU7N4L6'], k=10)

2024-03-16 11:32:31 [INFO] notebook - Predictions are made
2024-03-16 11:32:31 [INFO] notebook - Removing owned items
2024-03-16 11:32:31 [INFO] notebook - TopK selected per user
2024-03-16 11:32:31 [INFO] notebook - Predictions are returned to user
CPU times: user 126 ms, sys: 61.9 ms, total: 188 ms
Wall time: 48.8 ms


In [159]:
predicted_items['A2CFCXKCAWPIXQ']

['B01DF878RQ',
 'B01EXAVD3K',
 'B01EMCB9QU',
 'B01E7XRXKU',
 'B01EU2YXQU',
 'B01H1R4WLM',
 'B01D37X10W',
 'B00KTMIZQK',
 'B019GGH8AI',
 'B01799KAY0']

In [162]:
tmp_new_result['predicted_items'][0]

['B00Q2M2C6A',
 'B00YBZ0JRY',
 'B00V5BV778',
 'B0015ZD6HI',
 'B00UDF11O6',
 'B000PMKBL6',
 'B003CTTFE8',
 'B003CTREGO',
 'B009PMJZUA',
 'B00MIMOVB2']

In [157]:
tmp_new_result

Unnamed: 0,reviewerID,reviewerID_id,predicted_items
0,123456997,0,"[B00Q2M2C6A, B00YBZ0JRY, B00V5BV778, B0015ZD6H..."


In [93]:
tmp_new_result

Unnamed: 0,reviewerID,reviewerID_id,predicted_items
0,123456999,0,"[B00V5BV778, B0015ZD6HI, B009PMJZUA, B00Q2M2C6..."


In [75]:
tmp

Unnamed: 0,reviewerID,reviewerID_id,predicted_items
0,AD0OENWU7N4L6,2134,"[B00QI51L0O, B00SFPUDKO, B00M6SF3YW, B00BQHNKG..."


# 8. model save

In [None]:
torch.save(model,f'./save/model.pt')

In [None]:
model = torch.load("./save/model.pt")
print(model)

<__main__.TorchEASE object at 0x7fd103b61c30>


In [None]:
inputs = test_data
outputs = model.predict_all(inputs,k=10)
print(outputs)

2024-03-16 11:07:05 [INFO] notebook - Predictions are made
2024-03-16 11:07:05 [INFO] notebook - Removing owned items
2024-03-16 11:07:05 [INFO] notebook - TopK selected per user
2024-03-16 11:07:05 [INFO] notebook - Predictions are returned to user
          reviewerID  reviewerID_id  \
0      AD0OENWU7N4L6           2134   
1     A281NL3AO4QIGE            823   
2     A3PSH91YKGP4IV           1808   
3     A2M1IZTVL7TMC0           1090   
4     A2XG05RVALZZSA           1287   
...              ...            ...   
2573  A2CFCXKCAWPIXQ            918   
2574  A1BQC32PHYS1GG            234   
2575  A3CMY6YN374VD1           1568   
2576  A2BLP5XDO3MJ5R            905   
2577   AE4AXSBD2GBRI           2148   

                                        predicted_items  
0     [B00QI51L0O, B00SFPUDKO, B00M6SF3YW, B00BQHNKG...  
1     [B001LNSY2Q, B00P1XXR6A, B00OAZOMMI, B01B1TX3D...  
2     [B00LMU7GHM, B00MIMOVB2, B00LMU8WSE, B00KW4LCC...  
3     [B000AO7PY0, B000BD7SGK, B0007PRMT0, B0009N

# 9. inference

In [None]:
##flow
# 1. input -> 학습된 모델, 클라이언트가 선택한 item 리스트
# 2. output -> top n개의 추천된 아이템

##될까? 하는 것들
# 1. 뭔가 지금 모델은 학습된 유저에 대해서만 predict이 가능한거 같은데.. -> 새로 들어온 데이터(클라이언트가 선택한 item 리스트)에 대해서 model.finetune을 구현해야할 듯..

In [None]:
def get_model_rec(model, input_ids, top_k):
    top_k = int(top_k)

    recommend_output = model.finetune(input_ids)

    recommend_output = recommend_output[:, -1, :]
    test_item_emb = model.item_embeddings.weight
    rating_pred = torch.matmul(recommend_output, test_item_emb.transpose(0, 1))

    rating_pred = rating_pred.cpu().data.numpy().copy()[0]

    rating_pred[input_ids[0]] = 0

    pred_ids = np.argsort(rating_pred)[::-1][:top_k]
    df = pd.read_csv("poster.csv", sep="\t")
    df.fillna("", inplace=True)
    return df[df["item"].isin(pred_ids)]
