In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle

In [3]:
df = pd.read_csv('/content/drive/MyDrive/집교 2_Team P/user-track-listen_count.csv')

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_squared_error
# 가상의 데이터 프레임 생성 (예시)
# ...

# Label Encoding
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['track_id'] = item_encoder.fit_transform(df['track_id'])
df.loc[df['listen_count_bin'] == '10~2704', 'listen_count_bin'] = 10
df['listen_count_bin'] = df['listen_count_bin'].astype(int)

# PyTorch DataLoader에 맞게 데이터 변환
def df_to_tensor(dataset):
    users = torch.tensor(dataset['user_id'].values, dtype=torch.long)
    items = torch.tensor(dataset['track_id'].values, dtype=torch.long)
    ratings = torch.tensor(dataset['listen_count_bin'].values, dtype=torch.float)
    return users, items, ratings

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_users, train_items, train_ratings = df_to_tensor(train_df)
test_users, test_items, test_ratings = df_to_tensor(test_df)

train_data = TensorDataset(train_users, train_items, train_ratings)
test_data = TensorDataset(test_users, test_items, test_ratings)

train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)

In [29]:
class GMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(GMF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.output_layer = nn.Linear(embedding_size, 1)

    def forward(self, user, item):
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        elementwise_product = torch.mul(user_embedding, item_embedding)
        prediction = self.output_layer(elementwise_product)
        return prediction.view(-1)

# 데이터 전처리 및 DataLoader 생성
# ...
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
# 모델 초기화
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
def train_gmf(embedding_size):

  gmf_model = GMF(num_users, num_items, embedding_size).to(device)

  # 손실 함수 및 최적화 함수 정의
  criterion = nn.MSELoss()
  optimizer = optim.Adam(gmf_model.parameters(), lr=0.001)

  # 학습 및 평가
  num_epochs = 10
  for epoch in range(num_epochs):
      gmf_model.train()
      for batch in tqdm(train_loader):
          users, items, ratings = batch
          users = users.to(device)
          items = items.to(device)
          ratings = ratings.to(device)
          optimizer.zero_grad()
          predictions = gmf_model(users, items)
          loss = criterion(predictions, ratings.unsqueeze(1))
          loss.backward()
          optimizer.step()

      gmf_model.eval()
      all_predictions = []
      with torch.no_grad():

          for batch in tqdm(test_loader):
              users, items, ratings = batch
              users = users.to(device)
              items = items.to(device)
              predictions = gmf_model(users, items)
              all_predictions.append(predictions)

      predictions = torch.cat(all_predictions).squeeze().cpu().numpy()

      rmse = np.sqrt(mean_squared_error(test_df['listen_count_bin'].values, predictions))
      print(f'Epoch {epoch+1}/{num_epochs}, RMSE on test set: {rmse}')

Using device: cuda


In [30]:
train_gmf(32)

  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [00:59<00:00, 243.29it/s]
100%|██████████| 3636/3636 [00:10<00:00, 333.01it/s]


Epoch 1/10, RMSE on test set: 2.6626089501879733


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.69it/s]
100%|██████████| 3636/3636 [00:11<00:00, 330.49it/s]


Epoch 2/10, RMSE on test set: 2.6625966443370723


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.77it/s]
100%|██████████| 3636/3636 [00:10<00:00, 333.77it/s]


Epoch 3/10, RMSE on test set: 2.662587552912467


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.02it/s]
100%|██████████| 3636/3636 [00:10<00:00, 332.89it/s]


Epoch 4/10, RMSE on test set: 2.6626262742589457


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.77it/s]
100%|██████████| 3636/3636 [00:10<00:00, 331.80it/s]


Epoch 5/10, RMSE on test set: 2.662570871074185


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.88it/s]
100%|██████████| 3636/3636 [00:11<00:00, 330.20it/s]


Epoch 6/10, RMSE on test set: 2.6626276012879417


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.74it/s]
100%|██████████| 3636/3636 [00:10<00:00, 333.83it/s]


Epoch 7/10, RMSE on test set: 2.6625761122910983


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.49it/s]
100%|██████████| 3636/3636 [00:10<00:00, 333.44it/s]


Epoch 8/10, RMSE on test set: 2.6625956537413327


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.33it/s]
100%|██████████| 3636/3636 [00:11<00:00, 328.37it/s]


Epoch 9/10, RMSE on test set: 2.6626270989738576


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 237.04it/s]
100%|██████████| 3636/3636 [00:10<00:00, 333.98it/s]

Epoch 10/10, RMSE on test set: 2.662577747559464





In [31]:
train_gmf(128)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:00<00:00, 241.01it/s]
100%|██████████| 3636/3636 [00:10<00:00, 332.52it/s]


Epoch 1/10, RMSE on test set: 2.662685618898964


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.82it/s]
100%|██████████| 3636/3636 [00:10<00:00, 334.31it/s]


Epoch 2/10, RMSE on test set: 2.6626952871431815


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.41it/s]
100%|██████████| 3636/3636 [00:11<00:00, 329.66it/s]


Epoch 3/10, RMSE on test set: 2.6628105995860967


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.74it/s]
100%|██████████| 3636/3636 [00:10<00:00, 334.30it/s]


Epoch 4/10, RMSE on test set: 2.6626974433108175


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 235.32it/s]
100%|██████████| 3636/3636 [00:10<00:00, 330.72it/s]


Epoch 5/10, RMSE on test set: 2.6626610083521167


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 14544/14544 [01:01<00:00, 236.02it/s]
100%|██████████| 3636/3636 [00:11<00:00, 329.26it/s]


Epoch 6/10, RMSE on test set: 2.662656017361288


  return F.mse_loss(input, target, reduction=self.reduction)
  9%|▉         | 1317/14544 [00:05<00:56, 235.75it/s]


KeyboardInterrupt: ignored

In [36]:
!pip install pyro-ppl
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

# 가상의 데이터셋 생성 (예시: MovieLens 데이터셋)
# ...

# PyTorch DataLoader에 맞게 데이터 변환
def df_to_tensor(dataset):
    users = torch.tensor(dataset['user_id'].values, dtype=torch.long)
    items = torch.tensor(dataset['item_id'].values, dtype=torch.long)
    ratings = torch.tensor(dataset['rating'].values, dtype=torch.float)
    return users, items, ratings

# Bayesian SVD++ 모델 정의
class BayesianSVDppModel:
    def __init__(self, num_users, num_items, embedding_size):
        self.user_mean = torch.nn.Parameter(torch.zeros(num_users))
        self.item_mean = torch.nn.Parameter(torch.zeros(num_items))
        self.user_embedding = torch.nn.Embedding(num_users, embedding_size)
        self.item_embedding = torch.nn.Embedding(num_items, embedding_size)
        self.alpha_u = torch.nn.Embedding(num_users, embedding_size)
        self.alpha_i = torch.nn.Embedding(num_items, embedding_size)

    def model(self, users, items, ratings):
        user_mean = self.user_mean[users]
        item_mean = self.item_mean[items]
        user_embedding = self.user_embedding(users)
        item_embedding = self.item_embedding(items)
        alpha_u = self.alpha_u(users)
        alpha_i = self.alpha_i(items)

        prediction = user_mean + item_mean + torch.sum(user_embedding * item_embedding, dim=1) + torch.sum(alpha_u * alpha_i, dim=1)
        obs = pyro.sample("obs", dist.Normal(prediction, 1.0).to_event(1), obs=ratings)

    def guide(self, users, items, ratings):
        # Variational parameters
        user_mean_loc = pyro.param("user_mean_loc", torch.zeros(num_users))
        user_mean_scale = pyro.param("user_mean_scale", torch.ones(num_users), constraint=dist.constraints.positive)
        item_mean_loc = pyro.param("item_mean_loc", torch.zeros(num_items))
        item_mean_scale = pyro.param("item_mean_scale", torch.ones(num_items), constraint=dist.constraints.positive)
        user_embedding_loc = pyro.param("user_embedding_loc", torch.randn(num_users, embedding_size))
        user_embedding_scale = pyro.param("user_embedding_scale", torch.ones(num_users, embedding_size), constraint=dist.constraints.positive)
        item_embedding_loc = pyro.param("item_embedding_loc", torch.randn(num_items, embedding_size))
        item_embedding_scale = pyro.param("item_embedding_scale", torch.ones(num_items, embedding_size), constraint=dist.constraints.positive)
        alpha_u_loc = pyro.param("alpha_u_loc", torch.randn(num_users, embedding_size))
        alpha_u_scale = pyro.param("alpha_u_scale", torch.ones(num_users, embedding_size), constraint=dist.constraints.positive)
        alpha_i_loc = pyro.param("alpha_i_loc", torch.randn(num_items, embedding_size))
        alpha_i_scale = pyro.param("alpha_i_scale", torch.ones(num_items, embedding_size), constraint=dist.constraints.positive)

        # Sample from variational distribution
        pyro.sample("user_mean", dist.Normal(user_mean_loc, user_mean_scale))
        pyro.sample("item_mean", dist.Normal(item_mean_loc, item_mean_scale))
        pyro.sample("user_embedding", dist.Normal(user_embedding_loc, user_embedding_scale))
        pyro.sample("item_embedding", dist.Normal(item_embedding_loc, item_embedding_scale))
        pyro.sample("alpha_u", dist.Normal(alpha_u_loc, alpha_u_scale))
        pyro.sample("alpha_i", dist.Normal(alpha_i_loc, alpha_i_scale))

# 데이터 전처리 및 DataLoader 생성
# ...

# 모델 초기화
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
embedding_size = 32

bayesian_svdpp_model = BayesianSVDppModel(num_users, num_items, embedding_size)

# 손실 함수 및 최적화 함수 정의
optimizer = Adam({"lr": 0.01})
svi = SVI(bayesian_svdpp_model.model, bayesian_svdpp_model.guide, optimizer, loss=Trace_ELBO())

# 학습
num_epochs = 10
for epoch in range(num_epochs):
    loss = 0.0
    for batch in train_loader:
        users, items, ratings = batch
        loss += svi.step(users, items, ratings)

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss / len(train_loader)}")

# 예측
bayesian_svdpp_model.eval()
with torch.no_grad():
    all_predictions = []
    for batch in test_loader:
        users, items, ratings = batch
        predictions = bayesian_svdpp_model.model(users, items, ratings)
        all_predictions.extend(predictions.numpy())

test_rmse = mean_squared_error(test_ratings, all_predictions, squared=False)
print(f'Test RMSE: {test_rmse}')




ValueError: ignored