In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read in a later cell lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle

In [3]:
# Load the filtered user/track listen-count CSV from the mounted Drive folder.
# Later cells read its 'user_id', 'track_id' and 'listen_count_bin' columns.
df = pd.read_csv('/content/drive/MyDrive/집교 2_Team P/user-track-listen_count_filtered5.csv')

In [4]:
from sklearn.preprocessing import LabelEncoder
# One encoder per ID column; lyrics_encoder is created here but not fitted in this cell.
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()
lyrics_encoder = LabelEncoder()
# Replace the raw IDs with integer codes so they can index the embedding tables
# (the table sizes are later taken from nunique() of these same columns).
# NOTE(review): this overwrites df in place, so re-running the cell re-fits the
# encoders on already-encoded values and the original-ID mapping is lost.
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['track_id'] = track_encoder.fit_transform(df['track_id'])

In [5]:
# !pip install torch torchvision -U

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load the data (example: a CSV file)
# Label encoding
# Convert the data into the form the PyTorch DataLoader expects
def df_to_tensor(dataset):
    """Turn an interaction frame into ``(users, items, ratings)`` tensors.

    Reads the 'user_id' and 'track_id' columns as int32 tensors and the
    'listen_count_bin' column as a float32 tensor, and returns the three
    tensors as a tuple in that order.
    """
    # (column name, target dtype) in the order the tensors are returned.
    column_dtypes = (
        ('user_id', torch.int),
        ('track_id', torch.int),
        ('listen_count_bin', torch.float),
    )
    return tuple(
        torch.tensor(dataset[column].values, dtype=dtype)
        for column, dtype in column_dtypes
    )

# Hold out 30% of the interactions, then cut that hold-out in half:
# 70% train / 15% validation / 15% test.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(test_df, random_state=42, test_size=0.5)

# Convert every split into (user, item, rating) tensors.
(train_users, train_items, train_ratings) = df_to_tensor(train_df)
(val_users, val_items, val_ratings) = df_to_tensor(val_df)
(test_users, test_items, test_ratings) = df_to_tensor(test_df)

# Wrap the tensors so each dataset item is one (user, item, rating) triple.
train_data, val_data, test_data = (
    TensorDataset(train_users, train_items, train_ratings),
    TensorDataset(val_users, val_items, val_ratings),
    TensorDataset(test_users, test_items, test_ratings),
)

# Only the training batches are shuffled; validation and test keep frame order.
train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_data, batch_size=256, shuffle=False)
    for split_data in (val_data, test_data)
)

# Number of distinct users / tracks, used as the embedding table sizes.
num_users = df['user_id'].nunique()
num_items = df['track_id'].nunique()
print(num_users, num_items, sep='\n')

23761
28378


In [13]:
# NCF model definition
class NCF(nn.Module):
    """MLP-style neural collaborative filtering scorer.

    A user index and an item index are each looked up in their own embedding
    table. The two vectors are concatenated and pushed through a ReLU MLP
    whose widths shrink 2E -> E -> E/2 -> E/4 -> E/8 -> 1, where E is
    ``embedding_size``. The final layer has no activation.
    """

    def __init__(self, num_users, num_items, embedding_size):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)

        # Widths of the hidden stack, starting from the concatenated embeddings.
        widths = [embedding_size * 2, embedding_size]
        widths += [int(embedding_size / divisor) for divisor in (2, 4, 8)]

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Single-unit regression head.
        stack.append(nn.Linear(widths[-1], 1))
        self.fc_layers = nn.Sequential(*stack)

    def forward(self, user, item):
        """Return one score per (user, item) pair, shaped ``(batch, 1)``."""
        pair = torch.cat(
            (self.user_embedding(user), self.item_embedding(item)), dim=1
        )
        return self.fc_layers(pair)

# Select the CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')
def _run_eval(model, loader, criterion, desc):
    """Score every batch of ``loader`` with gradients disabled.

    Returns ``(mean per-batch loss, predictions, targets)``; the last two are
    1-D float64 NumPy arrays in the order the loader yielded them.
    """
    model.eval()
    batch_losses, preds, targets = [], [], []
    with torch.no_grad():
        for user, item, rating in tqdm(loader, desc=desc):
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            output = model(user, item)
            batch_losses.append(criterion(output, rating.unsqueeze(1)).item())
            # squeeze(1) (not squeeze()) keeps a 1-D result even for a single row;
            # moving to CPU per batch avoids accumulating predictions on the GPU.
            preds.append(output.squeeze(1).cpu())
            targets.append(rating.cpu())
    avg_loss = sum(batch_losses) / len(batch_losses)
    predictions = torch.cat(preds).numpy().astype(np.float64)
    labels = torch.cat(targets).numpy().astype(np.float64)
    return avg_loss, predictions, labels


def train_(embedding, n_epoch, patience=2, lr=0.001, weight_decay=1e-5, seed=None):
    """Train an NCF model with early stopping and report the test RMSE.

    Relies on module-level state defined in earlier cells: ``NCF``,
    ``num_users``, ``num_items``, ``device``, ``train_loader``, ``val_loader``
    and ``test_loader``.

    Args:
        embedding: Embedding size passed to ``NCF``.
        n_epoch: Maximum number of training epochs.
        patience: Stop after this many consecutive epochs without a new best
            validation loss (default 2, the previously hard-coded value).
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        seed: If given, passed to ``torch.manual_seed`` before the model is
            built so initialisation and batch shuffling are repeatable.

    Returns:
        None. Progress and the final test RMSE are printed.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Build the NCF model and move it to the selected device.
    model = NCF(num_users=num_users, num_items=num_items, embedding_size=embedding)
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    num_epochs = n_epoch
    # Start from +inf so the first epoch always counts as an improvement,
    # whatever the scale of the loss.
    min_loss = float('inf')
    best_state = None
    cnt = 0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for user, item, rating in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            optimizer.zero_grad()
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            output = model(user, item)
            loss = criterion(output, rating.unsqueeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_loss:.4f}')

        # Validation: labels are taken from the loader itself so predictions and
        # targets stay aligned regardless of how the loader orders its rows.
        avg_val_loss, predictions, targets = _run_eval(
            model, val_loader, criterion, f'Validation Epoch {epoch+1}'
        )
        rmse = np.sqrt(mean_squared_error(targets, predictions))
        print(f'Validation Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_val_loss:.4f}')
        print(f'Validation Epoch {epoch+1}/{num_epochs}, RMSE on validation set: {rmse}')

        if avg_val_loss < min_loss:
            min_loss = avg_val_loss
            # Snapshot the best weights; state_dict() returns live references,
            # so each tensor must be cloned.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            cnt = 0
        else:
            cnt += 1
        if cnt >= patience:
            print("val_loss did not decrease")
            break

    # Evaluate the best-validation weights rather than the last (worse) epoch.
    if best_state is not None:
        model.load_state_dict(best_state)

    # Test
    _, predictions, targets = _run_eval(model, test_loader, criterion, 'Testing')
    rmse = np.sqrt(mean_squared_error(targets, predictions))
    print(f'Final RMSE on test set: {rmse}')

Using device: cuda


In [None]:
# Train and evaluate NCF with embedding size 64 for at most 30 epochs.
train_(embedding=64, n_epoch=30)

In [None]:
# Train and evaluate NCF with embedding size 256 for at most 30 epochs.
train_(embedding=256, n_epoch=30)

In [None]:
# Train and evaluate NCF with embedding size 512 for at most 30 epochs.
train_(embedding=512, n_epoch=30)

In [None]:
# Train and evaluate NCF with embedding size 768 for at most 30 epochs.
train_(embedding=768, n_epoch=30)