## 라이브러리 임포트 및 데이터 불러오기

In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler

In [None]:
df_book = pd.read_csv("data/books.csv")
df_user = pd.read_csv("data/users.csv")
df_train = pd.read_csv("data/train_ratings.csv")
df_test = pd.read_csv("data/test_ratings.csv")

book_na = df_book.isna()
user_na = df_user.isna()

df_book = df_book.fillna(-1)
df_user = df_user.fillna(-1)

print(df_book[book_na].sum())
print(df_user[user_na].sum())

isbn                        0
book_title                  0
book_author                -1
year_of_publication       0.0
publisher                   0
img_url                     0
language               -67227
category               -68851
summary                -67227
img_path                    0
dtype: object
user_id         0.0
location          0
age        -27833.0
dtype: object


In [3]:
print(df_user.columns)
print(df_book.columns)

Index(['user_id', 'location', 'age'], dtype='object')
Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path'],
      dtype='object')


## 데이터 전처리
* 고유 번호 생성 및 매핑
* 통계량 추출



In [4]:
isbn2id = {}
id2isbn = []
for new_id, old_id in enumerate(df_book['isbn'].unique()):
    isbn2id[old_id] = new_id
    id2isbn.append(old_id)

df_book['isbn'] = df_book['isbn'].map(isbn2id)
df_train['isbn'] = df_train['isbn'].map(isbn2id)
df_test['isbn'] = df_test['isbn'].map(isbn2id)

user2id = {}
id2user = []
for new_id, old_id in enumerate(df_user['user_id'].unique()):
    user2id[old_id] = new_id
    id2isbn.append(old_id)

df_user['user_id'] = df_user['user_id'].map(user2id)
df_train['user_id'] = df_train['user_id'].map(user2id)
df_test['user_id'] = df_test['user_id'].map(user2id)

location2id = {}
id2location = []
for new_id, old_id in enumerate(df_user['location'].unique()):
    location2id[old_id] = new_id
    id2location.append(old_id)

df_user['location'] = df_user['location'].map(location2id)

age2id = {}
id2age = []
for new_id, old_id in enumerate(df_user['age'].unique()):
    age2id[old_id] = new_id
    id2age.append(old_id)

df_user['age'] = df_user['age'].map(age2id)

author2id = {}
id2author = []
for new_id, old_id in enumerate(df_book['book_author'].unique()):
    author2id[old_id] = new_id
    id2author.append(old_id)

df_book['book_author'] = df_book['book_author'].map(author2id)

year2id = {}
id2year = []
for new_id, old_id in enumerate(df_book['year_of_publication'].unique()):
    year2id[old_id] = new_id
    id2year.append(old_id)

df_book['year_of_publication'] = df_book['year_of_publication'].map(year2id)

publisher2id = {}
id2publisher = []
for new_id, old_id in enumerate(df_book['publisher'].unique()):
    publisher2id[old_id] = new_id
    id2publisher.append(old_id)

df_book['publisher'] = df_book['publisher'].map(publisher2id)

language2id = {}
id2language = []
for new_id, old_id in enumerate(df_book['language'].unique()):
    language2id[old_id] = new_id
    id2language.append(old_id)

df_book['language'] = df_book['language'].map(language2id)

category2id = {}
id2category = []
for new_id, old_id in enumerate(df_book['category'].unique()):
    category2id[old_id] = new_id
    id2category.append(old_id)

df_book['category'] = df_book['category'].map(category2id)

print(df_user.head(3))
print(df_book.head(3))

   user_id  location  age
0        0         0    0
1        1         1    1
2        2         2    0
   isbn                                         book_title  book_author  \
0     0                                       Clara Callan            0   
1     1                               Decision in Normandy            1   
2     2  Flu: The Story of the Great Influenza Pandemic...            2   

   year_of_publication  publisher  \
0                    0          0   
1                    1          1   
2                    2          2   

                                             img_url  language  category  \
0  http://images.amazon.com/images/P/0002005018.0...         0         0   
1  http://images.amazon.com/images/P/0060973129.0...         0         1   
2  http://images.amazon.com/images/P/0374157065.0...         0         2   

                                             summary  \
0  In a small town in Canada, Clara Callan reluct...   
1  Here, for the first time i

In [5]:
del df_book['img_url']
del df_book['summary']
del df_book['img_path']
del df_book['book_title']

In [6]:
cols = [
    'isbn', 
    'book_author', 
    'year_of_publication', 
    'publisher', 
    'language', 
    'category'
]

df = df_train.merge(df_book[cols], on='isbn', how='left')
df = df.merge(df_user, on='user_id', how='left')

# add counts of user / location / age / book / author / year of publication / publisher / language / category 
stats_of_user = df.groupby('user_id')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_user.columns = ['user_id', 'user_count', 'user_mean', 'user_std']
df = df.merge(stats_of_user, on='user_id', how='left')

stats_of_location = df.groupby('location')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_location.columns = ['location', 'location_count', 'location_mean', 'location_std']
df = df.merge(stats_of_location, on='location', how='left')

mask = df['location'] == -1
df.loc[mask, 'location_count'] = 0
df.loc[mask, 'location_mean'] = df.loc[mask, 'rating']
df.loc[mask, 'location_std'] = 0

stats_of_age = df.groupby('age')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_age.columns = ['age', 'age_count', 'age_mean', 'age_std']
df = df.merge(stats_of_age, on='age', how='left')

mask = df['age'] == -1
df.loc[mask, 'age'] = 0
df.loc[mask, 'age'] = df.loc[mask, 'rating']
df.loc[mask, 'age'] = 0

stats_of_book = df.groupby('isbn')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_book.columns = ['isbn', 'book_count', 'book_mean', 'book_std']
df = df.merge(stats_of_book, on='isbn', how='left')

stats_of_author = df.groupby('book_author')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_author.columns = ['book_author', 'author_count', 'author_mean', 'author_std']
df = df.merge(stats_of_author, on='book_author', how='left')

mask = df['book_author'] == -1
df.loc[mask, 'book_author'] = 0
df.loc[mask, 'book_author'] = df.loc[mask, 'rating']
df.loc[mask, 'book_author'] = 0

stats_of_year = df.groupby('year_of_publication')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_year.columns = ['year_of_publication', 'year_count', 'year_mean', 'year_std']
df = df.merge(stats_of_year, on='year_of_publication', how='left')

mask = df['year_of_publication'] == -1
df.loc[mask, 'year_of_publication'] = 0
df.loc[mask, 'year_of_publication'] = df.loc[mask, 'rating']
df.loc[mask, 'year_of_publication'] = 0

stats_of_publisher = df.groupby('publisher')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_publisher.columns = ['publisher', 'publisher_count', 'publisher_mean', 'publisher_std']
df = df.merge(stats_of_publisher, on='publisher', how='left')

mask = df['publisher'] == -1
df.loc[mask, 'publisher'] = 0
df.loc[mask, 'publisher'] = df.loc[mask, 'rating']
df.loc[mask, 'publisher'] = 0

stats_of_language = df.groupby('language')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_language.columns = ['language', 'language_count', 'language_mean', 'language_std']
df = df.merge(stats_of_language, on='language', how='left')

mask = df['language'] == -1
df.loc[mask, 'language'] = 0
df.loc[mask, 'language'] = df.loc[mask, 'rating']
df.loc[mask, 'language'] = 0

stats_of_category = df.groupby('category')['rating'].agg(['count', 'mean', 'std']).reset_index().fillna(-1)
stats_of_category.columns=  ['category', 'category_count', 'category_mean', 'category_std']
df = df.merge(stats_of_category, on='category', how='left')

mask = df['category'] == -1
df.loc[mask, 'category'] = 0
df.loc[mask, 'category'] = df.loc[mask, 'rating']
df.loc[mask, 'category'] = 0

print(df)

        user_id    isbn  rating  book_author  year_of_publication  publisher  \
0             0       0       4            0                    0          0   
1             3       0       7            0                    0          0   
2             7       0       8            0                    0          0   
3             9       0       8            0                    0          0   
4            10       0       9            0                    0          0   
...         ...     ...     ...          ...                  ...        ...   
306790     4515  149564       7         1323                    5       2340   
306791     4744  149565       6           72                   12        228   
306792     4744  149567       7        62056                   19      11569   
306793     4744  149568       7        62057                   11       7026   
306794     4744  149569      10        62058                    2      11570   

        language  category  location  a

In [7]:
user_cols = ['user_id', 'user_count', 'user_mean',
       'user_std', 'location_count', 'location_mean', 'location_std',
       'age_count', 'age_mean', 'age_std']
df_user = df_user.merge(df[user_cols], on='user_id', how='left')

book_cols = ['isbn', 'book_count', 'book_mean',
       'book_std', 'author_count', 'author_mean', 'author_std', 'year_count',
       'year_mean', 'year_std', 'publisher_count', 'publisher_mean',
       'publisher_std', 'language_count', 'language_mean', 'language_std',
       'category_count', 'category_mean', 'category_std']
df_book = df_book.merge(df[book_cols], on='isbn', how='left')


print(df_user.columns)
print(df_book.columns)
print(df_user.isna().sum())
print(df_book.isna().sum())

Index(['user_id', 'location', 'age', 'user_count', 'user_mean', 'user_std',
       'location_count', 'location_mean', 'location_std', 'age_count',
       'age_mean', 'age_std'],
      dtype='object')
Index(['isbn', 'book_author', 'year_of_publication', 'publisher', 'language',
       'category', 'book_count', 'book_mean', 'book_std', 'author_count',
       'author_mean', 'author_std', 'year_count', 'year_mean', 'year_std',
       'publisher_count', 'publisher_mean', 'publisher_std', 'language_count',
       'language_mean', 'language_std', 'category_count', 'category_mean',
       'category_std'],
      dtype='object')
user_id              0
location             0
age                  0
user_count        8289
user_mean         8289
user_std          8289
location_count    8289
location_mean     8289
location_std      8289
age_count         8289
age_mean          8289
age_std           8289
dtype: int64
isbn                       0
book_author                0
year_of_publication       

In [8]:
df_user = df_user.drop_duplicates().fillna(0)
df_book = df_book.drop_duplicates().fillna(0)

print(df_user.isna().sum())
print(df_book.isna().sum())
print(df_user)
print(df_book)
print(len(df_user.columns))
print(len(df_book.columns))

user_id           0
location          0
age               0
user_count        0
user_mean         0
user_std          0
location_count    0
location_mean     0
location_std      0
age_count         0
age_mean          0
age_std           0
dtype: int64
isbn                   0
book_author            0
year_of_publication    0
publisher              0
language               0
category               0
book_count             0
book_mean              0
book_std               0
author_count           0
author_mean            0
author_std             0
year_count             0
year_mean              0
year_std               0
publisher_count        0
publisher_mean         0
publisher_std          0
language_count         0
language_mean          0
language_std           0
category_count         0
category_mean          0
category_std           0
dtype: int64
        user_id  location  age  user_count  user_mean  user_std  \
0             0         0    0         7.0   4.428571  1.988060   


## 학습 데이터셋 정의

In [9]:
class bookset(Dataset):
    def __init__(self, df_train, mode='train', device='mps'):
"""
        PyTorch Dataset: 추천 시스템 모델 학습을 위한 데이터셋 클래스입니다.
        상호작용에 따른 유저, 아이템, 특징들의 ID를 분할하고 torch.Tensor 형태로 변환하여 관리합니다.

        입력 데이터프레임은 모든 특징 컬럼이 [0, size-1] 범위로 인코딩된 상태(전처리가 완료된 상태)여야 합니다.

        (파라미터 Parameters)
        :param df_train: 모든 상호작용 정보와 인코딩된 특징 ID를 포함하는 데이터프레임입니다.
        :type df_train: pandas.DataFrame
        :param mode: 데이터 분할 모드를 결정합니다. 'train'일 경우 80%, 'test'일 경우 20%를 사용합니다.
        :type mode: str
        :param device: 데이터를 로드할 장치 (CPU, CUDA, MPS 등).
        :type device: str

        (클래스 속성 Attributes: 모두 torch.Tensor, dtype=torch.long)
        :attr user: 유저 고유 ID (user_id)
        :attr user_f0: 지역 고유 ID (location)
        :attr user_f1: 나이 고유 ID (age)
        :attr item: 책 고유 ID (isbn)
        :attr item_f0: 저자 고유 ID (book_author)
        :attr item_f1: 연도 고유 ID (year_of_publication)
        :attr item_f2: 출판사 고유 ID (publisher)
        :attr item_f3: 언어 고유 ID (language)
        :attr item_f4: 카테고리 고유 ID (category)
        :attr y: 정답 레이블 (rating). (dtype=torch.float)
        """

        
        train_cols = ['user_id', 'isbn', 'location', 'age',
                      'book_author', 'year_of_publication',
                      'publisher', 'language', 'category']
        
        X_train, X_test, y_train, y_test = train_test_split(
            df_train[train_cols],
            df_train['rating'],
            test_size=0.2,
            random_state=42
        )

        if mode == 'train':
            X = X_train
            y = y_train
        else:
            X = X_test
            y = y_test

        self.user = torch.tensor(X['user_id'].values, dtype=torch.long, device=device)
        self.user_f0 = torch.tensor(X['location'].values, dtype=torch.long, device=device)
        self.user_f1 = torch.tensor(X['age'].values, dtype=torch.long, device=device)

        self.item = torch.tensor(X['isbn'].values, dtype=torch.long, device=device)
        self.item_f0 = torch.tensor(X['book_author'].values, dtype=torch.long, device=device)
        self.item_f1 = torch.tensor(X['year_of_publication'].values, dtype=torch.long, device=device)
        self.item_f2 = torch.tensor(X['publisher'].values, dtype=torch.long, device=device)
        self.item_f3 = torch.tensor(X['language'].values, dtype=torch.long, device=device)
        self.item_f4 = torch.tensor(X['category'].values, dtype=torch.long, device=device)

        self.y = torch.tensor(y.values, dtype=torch.float, device=device)

    def __len__(self):
        """
        데이터셋에 포함된 총 상호작용 개수를 반환합니다.
        
        :returns: 상호작용 개수
        :rtype: int
        """
        
        return len(self.y)

    def __getitem__(self, idx):
        """
        특정 인덱스에 해당하는 상호작용의 특징들과 레이블을 반환합니다.

        :param idx: 데이터 인덱스
        :type idx: int
        :returns: (특징 튜플, 레이블 텐서)
        :rtype: tuple of (tuple of torch.Tensor, torch.Tensor)
        """
        
        return (
            self.user[idx],
            self.item[idx],
            self.user_f0[idx],
            self.user_f1[idx],
            self.item_f0[idx],
            self.item_f1[idx],
            self.item_f2[idx],
            self.item_f3[idx],
            self.item_f4[idx]
        ), self.y[idx]


## 모델 정의

In [None]:
class Model(nn.Module):
    """
    ID, 특징 ID 임베딩과 CLIP 이미지, 텍스트 임베딩을 통합해서 별점을 예측하는 모델입니다.
    
    특징 가중치 (self.w)와 아이템 빈도(freq) 기반의 가중치 메커니즘을 사용합니다.

    (파라미터 Parameters)
    :param user_cnt: 유저 ID의 총 개수 (임베딩 테이블 크기).
    :param item_cnt: 아이템 ID의 총 개수 (임베딩 테이블 크기).
    :param user_f_cnt: 유저 특징별 개수 (list of int, [location_cnt, age_cnt]).
    :param item_f_cnt: 아이템 특징별 개수 (list of int, [author_cnt, year_cnt, ..., category_cnt]).
    :param embedding_dim: 임베딩 벡터의 차원 수 (d).
    :param freq_of_item: 각 아이템의 통계량 (빈도, 평균, 표준편차)을 포함하는 numpy 배열.
    :param device: 모델이 올라갈 장치 (CPU, CUDA, MPS 등).

    (모듈 Modules)
    :attr user_emb, item_emb, item_fX_emb: 각 ID 및 특징별 임베딩 테이블.
    :attr img_proj, txt_proj: CLIP 512차원 임베딩을 모델의 embedding_dim으로 투영하는 MLP.
    :attr mlp: 최종 예측을 수행하는 Multi-Layer Perceptron.

    """
    def __init__(self, user_cnt, item_cnt, user_f_cnt, item_f_cnt, embedding_dim, freq_of_item, device):
        super().__init__()
        self.device = device

        # 임베딩 정의
        self.user_emb = nn.Embedding(user_cnt, embedding_dim)
        self.user_f0_emb = nn.Embedding(user_f_cnt[0], embedding_dim)
        self.user_f1_emb = nn.Embedding(user_f_cnt[1], embedding_dim)

        self.item_emb = nn.Embedding(item_cnt, embedding_dim)
        self.item_f0_emb = nn.Embedding(item_f_cnt[0], embedding_dim)
        self.item_f1_emb = nn.Embedding(item_f_cnt[1], embedding_dim)
        self.item_f2_emb = nn.Embedding(item_f_cnt[2], embedding_dim)
        self.item_f3_emb = nn.Embedding(item_f_cnt[3], embedding_dim)
        self.item_f4_emb = nn.Embedding(item_f_cnt[4], embedding_dim)

        # 초기화
        for emb in [self.user_emb, self.item_emb, self.user_f0_emb, self.user_f1_emb, self.item_f0_emb, self.item_f1_emb, self.item_f2_emb, self.item_f3_emb, self.item_f4_emb]:
            nn.init.normal_(emb.weight, mean=0, std=0.01)

        # CLIP 임베딩 불러오기
        img_emb = torch.from_numpy(np.load("data/img_emb_clip.npy")).float()
        txt_emb = torch.from_numpy(np.load("data/text_emb_clip.npy")).float()

        self.register_buffer("img_emb", img_emb)
        self.register_buffer("txt_emb", txt_emb)

        self.img_proj = nn.Sequential(
            nn.Linear(512, embedding_dim),
            nn.Dropout(0.95),
            nn.ReLU()
        )

        self.txt_proj = nn.Sequential(
            nn.Linear(512, embedding_dim),
            nn.Dropout(0.3),
            nn.ReLU()
        )

        # 통계량 (빈도, 평균, 표준편차)
        item_freq = torch.tensor(freq_of_item).float().reshape(-1, 3)
        self.freq_raw = nn.Parameter(torch.log(item_freq[:, 0] + 0.00001), requires_grad=False)
        self.mean_std = nn.Parameter(item_freq[:, 1:], requires_grad=False)

        # 9개 임베딩 가중치 (유저 3 + 아이템 6)
        self.w = nn.Parameter(torch.zeros(9))  # 학습 가능한 가중치

        # 통합 MLP
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 3 + 2, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 1)
        )

        self.to(device)

    def forward(self, user_ids, item_ids, uf0, uf1, if0, if1, if2, if3, if4):
        
        # 유저 임베딩
        u0 = self.user_emb(user_ids)
        u1 = self.user_f0_emb(uf0)
        u2 = self.user_f1_emb(uf1)
        user_feats = torch.stack([u0, u1, u2], dim=1)

        # 아이템 임베딩
        i0 = self.item_emb(item_ids)
        i1 = self.item_f0_emb(if0)
        i2 = self.item_f1_emb(if1)
        i3 = self.item_f2_emb(if2)
        i4 = self.item_f3_emb(if3)
        i5 = self.item_f4_emb(if4)
        item_feats = torch.stack([i0, i1, i2, i3, i4, i5], dim=1)
        
        # 유저 임베딩, 아이템 임베딩 concat
        feats = torch.cat([user_feats, item_feats], dim=1)
        feats = feats + torch.randn_like(feats).to(device) * 0.1

        # 빈도 기반 가중치 생성 (로그빈도 -> 가중치 element-wise 곱 -> 시그모이드)
        freq = self.freq_raw[item_ids].unsqueeze(1).repeat(1, 9)
        w = self.w.unsqueeze(0)
        alpha = torch.sigmoid(freq * w).unsqueeze(-1)

        # 빈도 기반 가중치와 concat된 임베딩 벡터 곱
        weighted_emb = torch.sum(alpha * feats, dim=1)

        # 통계량 concat
        ms = self.mean_std[item_ids]
        weighted_emb = torch.cat([weighted_emb, ms], dim=1)

        # CLIP 임베딩 변환
        img = self.img_emb[item_ids]
        txt = self.txt_emb[item_ids]

        img = self.img_proj(img) * (1 - w.max())
        txt = self.txt_proj(txt) * (1 - w.max())

        # 유저, 아이템, 특징, 통량 임베딩과 CLIP 임베딩 concat
        final_embedding = torch.cat([fused_emb, img, txt], dim=1)

        # 최종 레이어
        out = self.mlp(final_vec).squeeze(1)
        return out


## 손실함수 정의


In [18]:
class RMSELoss(nn.Module):
    """
    RMSE 손실함수 클래스
    """
    def __init__(self, eps=1e-8):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, pred, target):
        return torch.sqrt(self.mse(pred, target) + self.eps)

## 모델 인자로 넣을 준비

In [20]:
user_f_cnt = [len(user2id), len(location2id), len(age2id)]
item_f_cnt = [len(isbn2id), len(author2id), len(year2id), len(publisher2id), len(language2id), len(category2id)]

In [21]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

freq_of_user_np = df_user.values[:, 3:].copy()
user_low_freq_mask = (freq_of_user_np[..., 0] == 1) | (freq_of_user_np[..., 0] == 2)
observed_mean_user = freq_of_user_np[..., 1][user_low_freq_mask]
smoothed_mean_user = (7.06 + observed_mean_user) / 2.0
freq_of_user_np[..., 1][user_low_freq_mask] = smoothed_mean_user
freq_of_book_np = df_book.values[:, 6:].copy()
item_low_freq_mask = (freq_of_book_np[..., 0] == 1) | (freq_of_book_np[..., 0] == 2)
observed_mean_item = freq_of_book_np[..., 1][item_low_freq_mask]
smoothed_mean_item = (7.06 + observed_mean_item) / 2.0
freq_of_book_np[..., 1][item_low_freq_mask] = smoothed_mean_item

freq_of_user = torch.from_numpy(freq_of_user_np).to(torch.float).to(device)
freq_of_item = torch.from_numpy(freq_of_book_np).to(torch.float).to(device)

model = Model(len(df_user), len(df_book), user_f_cnt, item_f_cnt, 64, freq_of_user, freq_of_item, device)
model.to(device)


  item_stats = torch.tensor(freq_of_item).float().reshape(-1, 6, 3)
  user_stats = torch.tensor(freq_of_user).float().reshape(-1, 3, 3)


Model(
  (user_emb): Embedding(68092, 63)
  (user_f0_emb): Embedding(68092, 63)
  (user_f1_emb): Embedding(18368, 63)
  (item_emb): Embedding(149570, 63)
  (item_f0_emb): Embedding(149570, 63)
  (item_f1_emb): Embedding(62059, 63)
  (item_f2_emb): Embedding(95, 63)
  (item_f3_emb): Embedding(11571, 63)
  (item_f4_emb): Embedding(27, 63)
  (img_proj): Sequential(
    (0): Linear(in_features=512, out_features=64, bias=True)
    (1): Dropout(p=0.95, inplace=False)
    (2): ReLU()
  )
  (txt_proj): Sequential(
    (0): Linear(in_features=512, out_features=64, bias=True)
    (1): Dropout(p=0.3, inplace=False)
    (2): ReLU()
  )
  (mlp): Sequential(
    (0): Linear(in_features=704, out_features=320, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=320, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=256, out_features=1, bias=True)
  )
)

## 하이퍼파라미터 정의, 옵티마이저 및 손실함수 할당, 데이터 로더 할당

In [41]:
epochs = 50
lr = 0.035
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.0005)
criterion = RMSELoss()

train_set = bookset(df, mode='train', device=device)
train_loader = DataLoader(train_set, batch_size=2048, shuffle=True)
test_set = bookset(df, mode='test', device=device)
test_loader = DataLoader(test_set, batch_size=2048, shuffle=False)

## 학습

In [43]:
epoch_train_loss = []
epoch_test_loss = []

for epoch in range(epochs):

    running_train_loss = 0.0
    running_test_loss = 0.0

    # ---------- TRAIN ----------
    model.train()
    n_data = 0

    for batch, r in train_loader:
        u, i, uf0, uf1, if0, if1, if2, if3, if4 = batch

        # device 이동
        u = u.to(device)
        i = i.to(device)
        uf0 = uf0.to(device)
        uf1 = uf1.to(device)
        if0 = if0.to(device)
        if1 = if1.to(device)
        if2 = if2.to(device)
        if3 = if3.to(device)
        if4 = if4.to(device)
        r = r.to(device) + (torch.randn(len(r)) * 0.15).to(device)

        optimizer.zero_grad()

        # model forward (feature 포함)
        pred = model(u, i, uf0, uf1, if0, if1, if2, if3, if4)

        loss = criterion(pred, r)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item() * len(r)
        n_data += len(r)

    running_train_loss /= n_data


    # ---------- TEST ----------
    model.eval()
    n_data = 0

    with torch.no_grad():
        for batch, r in test_loader:
            u, i, uf0, uf1, if0, if1, if2, if3, if4 = batch

            u = u.to(device)
            i = i.to(device)
            uf0 = uf0.to(device)
            uf1 = uf1.to(device)
            if0 = if0.to(device)
            if1 = if1.to(device)
            if2 = if2.to(device)
            if3 = if3.to(device)
            if4 = if4.to(device)
            r = r.to(device)

            pred = model(u, i, uf0, uf1, if0, if1, if2, if3, if4)
            loss = criterion(pred, r)

            running_test_loss += loss.item() * len(r)
            n_data += len(r)

    running_test_loss /= n_data

    epoch_train_loss.append(running_train_loss)
    epoch_test_loss.append(running_test_loss)

    print(f"Epoch {epoch+1} train={running_train_loss:.4f} test={running_test_loss:.4f}")


Epoch 1 train=1.6694 test=1.5785
Epoch 2 train=1.6434 test=1.6562
Epoch 3 train=1.6281 test=1.5697
Epoch 4 train=1.6084 test=1.6063
Epoch 5 train=1.6344 test=1.5740
Epoch 6 train=1.6184 test=1.5737
Epoch 7 train=1.6086 test=1.5673
Epoch 8 train=1.5981 test=1.5731
Epoch 9 train=1.6298 test=1.5815
Epoch 10 train=1.6148 test=1.6572
Epoch 11 train=1.6100 test=1.5864
Epoch 12 train=1.6133 test=1.6090
Epoch 13 train=1.5916 test=1.5701
Epoch 14 train=1.6129 test=1.6324
Epoch 15 train=1.6032 test=1.5956
Epoch 16 train=1.5875 test=1.6037
Epoch 17 train=1.6224 test=1.5800
Epoch 18 train=1.7903 test=1.6441
Epoch 19 train=1.6343 test=1.6125
Epoch 20 train=1.6065 test=1.6133
Epoch 21 train=1.6197 test=1.5941
Epoch 22 train=1.6035 test=1.5769
Epoch 23 train=1.5974 test=1.6455
Epoch 24 train=1.6170 test=1.5889
Epoch 25 train=1.6032 test=1.5738
Epoch 26 train=1.6022 test=1.7595
Epoch 27 train=1.6122 test=1.6573
Epoch 28 train=1.6075 test=1.5980
Epoch 29 train=1.5985 test=1.5956
Epoch 30 train=1.5993 t

## 최종 제출 파일 생성 코드
* 최종 제출을 위한 데이터셋 정의
* 추론 및 csv 생성 후 저장

In [None]:
class valset(Dataset):
    def __init__(self, df_train, device='mps'):
        
        # ----------- USER -----------
        self.user = torch.tensor(df_train['user_id'].values, dtype=torch.long, device=device)
        self.user_f0 = torch.tensor(df_train['location'].values, dtype=torch.long, device=device)
        self.user_f1 = torch.tensor(df_train['age'].values, dtype=torch.long, device=device)

        # ----------- ITEM -----------
        self.item = torch.tensor(df_train['isbn'].values, dtype=torch.long, device=device)
        self.item_f0 = torch.tensor(df_train['book_author'].values, dtype=torch.long, device=device)
        self.item_f1 = torch.tensor(df_train['year_of_publication'].values, dtype=torch.long, device=device)
        self.item_f2 = torch.tensor(df_train['publisher'].values, dtype=torch.long, device=device)
        self.item_f3 = torch.tensor(df_train['language'].values, dtype=torch.long, device=device)
        self.item_f4 = torch.tensor(df_train['category'].values, dtype=torch.long, device=device)


    def __len__(self):
        return len(self.user)

    def __getitem__(self, idx):
        return self.user[idx],\
            self.item[idx],\
            self.user_f0[idx],\
            self.user_f1[idx],\
            self.item_f0[idx],\
            self.item_f1[idx],\
            self.item_f2[idx],\
            self.item_f3[idx],\
            self.item_f4[idx]


In [None]:
df_test = pd.read_csv("data/test_ratings.csv")
df_test['user_id'] = df_test['user_id'].map(user2id)
df_test['isbn'] = df_test['isbn'].map(isbn2id)

user_cols = ['user_id', 'location', 'age']

book_cols = ['isbn', 'book_author', 'year_of_publication', 'publisher', 'language',
             'category']

df_test = df_test.merge(df_book[book_cols], on='isbn', how='left')
df_test = df_test.merge(df_user[user_cols], on='user_id', how='left')

val_set = valset(df_test, device=device)
test_loader = DataLoader(val_set, batch_size=2048, shuffle=False, drop_last=False)

predictions = []
model.eval()
with torch.no_grad():
    for batch in test_loader:
        u, i, uf0, uf1, if0, if1, if2, if3, if4 = batch

        u = u.to(device).long()
        i = i.to(device).long()
        uf0 = uf0.to(device).long()
        uf1 = uf1.to(device).long()
        if0 = if0.to(device).long()
        if1 = if1.to(device).long()
        if2 = if2.to(device).long()
        if3 = if3.to(device).long()
        if4 = if4.to(device).long()

        output = model(u, i, uf0, uf1, if0, if1, if2, if3, if4)
        predictions.append(output.cpu().numpy())

final_predictions = np.concatenate([o.flatten() for o in predictions])
df_submission = pd.read_csv("data/test_ratings.csv")
df_submission['rating'] = final_predictions
df_submission.to_csv("submission.csv")

# 제출 파일 생성 끝