In [1]:
# Mount Google Drive at /content/drive so the data files read by later cells are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle

In [3]:
# Load the filtered user/track listen-count table from Drive.
# Later cells read its user_id, track_id and listen_count_bin columns.
df = pd.read_csv('/content/drive/MyDrive/집교 2_Team P/user-track-listen_count_filtered5.csv')

In [4]:
# Load the pickled lyric embeddings from Drive.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open files that come from a trusted source.
with open('/content/drive/MyDrive/집교 2_Team P/lyrics_Embedding/lstm_outputs_merged.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Build a two-column frame: the embedding and the track it belongs to.
df_embedding = pd.DataFrame(data, columns=['embedding', 'track_id'])

# Make sure the join key is an integer.
df_embedding['track_id'] = df_embedding['track_id'].astype(int)

# The embedding is kept as a single object column. The disabled lines below
# would instead expand it into 768 separate columns and drop the original.
# df_embedding[['embedding_{}'.format(i) for i in range(768)]] = pd.DataFrame(df_embedding['embedding'].tolist(), index=df_embedding.index)
# df_embedding = df_embedding.drop(['embedding'], axis=1)

# Preview the first rows.
print(df_embedding.head())


                                           embedding  track_id
0  [-0.20179361, 0.18552369, -0.11386657, 0.00375...         2
1  [-0.16733141, 0.19308521, -0.1055927, 0.136779...         8
2  [-0.19286135, 0.13035919, -0.13566753, 0.02671...      1524
3  [-0.19150019, 0.15346457, -0.1114238, 0.093935...      1785
4  [-0.21802717, 0.16726568, -0.12696394, 0.06847...      1787


In [5]:
# Attach each track's lyric embedding to its interaction rows.
# The merge is guarded so that re-running this cell is a no-op: merging a second
# time would produce embedding_x / embedding_y columns (both frames would then
# carry an 'embedding' column) and break every later df['embedding'] access.
print(df.shape)
if 'embedding' not in df.columns:
    df = pd.merge(df, df_embedding, on='track_id', how='inner')
df.shape

(4645010, 4)


(4645010, 5)

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map raw user / track ids onto contiguous integer codes (0 .. n-1).
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['track_id'] = track_encoder.fit_transform(df['track_id'])

# Encode the lyric table with the SAME mapping that was applied to df['track_id'].
# A separately fitted encoder assigns codes by the sorted position of each id
# within its own table, so the two codings only agree when both tables contain
# exactly the same set of tracks. Any extra track in df_embedding would shift
# the codes and make the encoded ids point at the wrong embeddings.
lyrics_encoder = track_encoder  # same object; name kept for code that refers to it

# LabelEncoder.transform raises on labels it has not seen, so first drop the
# embedding rows whose track never occurs in df.
known_tracks = df_embedding['track_id'].isin(track_encoder.classes_)
df_embedding = df_embedding.loc[known_tracks].copy()
df_embedding['track_id'] = track_encoder.transform(df_embedding['track_id'])

In [7]:
# Lookup table: encoded track_id -> lyric embedding.
lyrics_dict = {
    track_id: embedding
    for track_id, embedding in zip(df_embedding['track_id'], df_embedding['embedding'])
}

In [8]:
# !pip install torch torchvision -U

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from tqdm import tqdm

# Convert a DataFrame split into the tensors consumed by the PyTorch DataLoader.
def df_to_tensor(dataset):
    """Turn an interaction DataFrame into model-ready tensors.

    Args:
        dataset: DataFrame with the columns ``user_id``, ``track_id``,
            ``listen_count_bin`` and ``embedding`` (one array per row).

    Returns:
        Tuple ``(users, items, ratings, lyrics_embeddings)``:
        ``users`` and ``items`` are int32 tensors, ``ratings`` is a float32
        tensor, and ``lyrics_embeddings`` is the float32 result of stacking
        every row's embedding with ``np.vstack``.
    """
    users = torch.tensor(dataset['user_id'].values, dtype=torch.int)
    items = torch.tensor(dataset['track_id'].values, dtype=torch.int)
    ratings = torch.tensor(dataset['listen_count_bin'].values, dtype=torch.float)
    # np.vstack already allocates a fresh array, so wrap it with as_tensor
    # instead of torch.tensor: the latter always copies, which would briefly
    # hold the full embedding matrix in memory twice.
    stacked = np.vstack(dataset['embedding'].values)
    lyrics_embeddings = torch.as_tensor(stacked, dtype=torch.float)
    return users, items, ratings, lyrics_embeddings

# Hold out 20% of the interactions for evaluation; the seed fixes the split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert each split into tensors.
(train_users, train_items,
 train_ratings, train_lyrics_embeddings) = df_to_tensor(train_df)
(test_users, test_items,
 test_ratings, test_lyrics_embeddings) = df_to_tensor(test_df)

# Bundle the tensors so that one index yields (user, item, rating, lyrics).
train_data = TensorDataset(train_users, train_items, train_ratings, train_lyrics_embeddings)
test_data = TensorDataset(test_users, test_items, test_ratings, test_lyrics_embeddings)

# Only training batches are shuffled; test batches keep the row order of test_df.
train_loader = DataLoader(train_data, shuffle=True, batch_size=256)
test_loader = DataLoader(test_data, shuffle=False, batch_size=256)

# Number of distinct users / tracks, used to size the embedding tables.
num_users = df['user_id'].nunique()
num_items = df['track_id'].nunique()
print(num_users, num_items, sep='\n')

23761
28378


In [9]:
# Neural collaborative filtering (NCF) model with an extra lyric-embedding input.
class NCF_embedding(nn.Module):
    """MLP-style NCF scorer over user id, item id and a lyric vector.

    The user and item ids are looked up in learned embedding tables, the lyric
    vector is projected to the same width by a linear layer, and the three
    vectors are concatenated and passed through a funnel of fully connected
    layers (E -> E/2 -> E/4 -> E/8 -> 1) that outputs one score per row.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_size: Width E of each of the three embedded inputs.
        lyrics_dim: Length of the incoming lyric vector. Defaults to 768,
            the value that used to be hard-coded.
    """

    def __init__(self, num_users, num_items, embedding_size, lyrics_dim=768):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.lyrics_embedding = nn.Linear(lyrics_dim, embedding_size)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_size * 3, embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, embedding_size // 2),
            nn.ReLU(),
            nn.Linear(embedding_size // 2, embedding_size // 4),
            nn.ReLU(),
            nn.Linear(embedding_size // 4, embedding_size // 8),
            nn.ReLU(),
            nn.Linear(embedding_size // 8, 1)
        )

    def forward(self, user, item, lyrics_embedding):
        """Score a batch of (user, item, lyrics) triples.

        Args:
            user: Integer tensor of user indices, one per row.
            item: Integer tensor of item indices, one per row.
            lyrics_embedding: Float tensor holding ``lyrics_dim`` values per
                row; it is reshaped to ``(batch, lyrics_dim)``, so e.g.
                ``(batch, 1, lyrics_dim)`` is accepted as well.

        Returns:
            Tensor of shape ``(batch, 1)`` with the predicted score.
        """
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        # Flatten to (batch, lyrics_dim); the width is read from the projection
        # layer so it always matches what the layer was built with.
        lyrics_flat = lyrics_embedding.reshape(
            lyrics_embedding.shape[0], self.lyrics_embedding.in_features
        )
        lyrics_projected = self.lyrics_embedding(lyrics_flat)
        x = torch.cat((user_embedding, item_embedding, lyrics_projected), dim=1)
        x = self.fc_layers(x)
        return x

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
def train_embedding(embedding,n_epoch):
    """Train an ``NCF_embedding`` model and print the test RMSE after every epoch.

    Uses the notebook-level globals ``num_users``, ``num_items``, ``device``,
    ``train_loader`` and ``test_loader``.

    Args:
        embedding: Embedding size passed to ``NCF_embedding``.
        n_epoch: Number of passes over ``train_loader``.

    Returns:
        The trained model, still on ``device``. Previously nothing was
        returned, so the trained weights were lost when the call finished.
        End the call with ``;`` in a notebook cell to suppress the model repr.
    """
    # Build the model and move it to the selected device.
    model = NCF_embedding(num_users=num_users, num_items=num_items, embedding_size=embedding)
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    num_epochs = n_epoch
    for epoch in range(num_epochs):
        # ---- training pass (tqdm shows per-batch progress) ----
        model.train()
        total_loss = 0
        for user, item, rating, lyrics_embedding in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
            optimizer.zero_grad()
            user = user.to(device)
            item = item.to(device)
            rating = rating.to(device)
            lyrics_embedding = lyrics_embedding.to(device)
            output = model(user, item, lyrics_embedding.float())
            # The model emits (batch, 1); give the target the same shape.
            loss = criterion(output, rating.unsqueeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_loss:.4f}')

        # ---- evaluation pass on the held-out loader after each epoch ----
        model.eval()
        all_predictions = []
        all_targets = []
        with torch.no_grad():
            for user, item, rating, lyrics_embedding in tqdm(test_loader, desc=f'Testing Epoch {epoch+1}'):
                user = user.to(device)
                item = item.to(device)
                lyrics_embedding = lyrics_embedding.to(device)
                output = model(user, item, lyrics_embedding.float())
                # reshape(-1) keeps a 1-D result even for a single-row batch,
                # where a bare squeeze() would collapse it to a 0-d value.
                all_predictions.append(output.reshape(-1).cpu())
                # Take the targets from the same batches as the predictions
                # rather than from a separate global frame, so the two stay
                # aligned however the loader was built or ordered.
                all_targets.append(rating.reshape(-1))

        # RMSE over the whole test loader, computed on the CPU as numpy arrays.
        # Targets are widened to float64 so the metric is accumulated in double precision.
        predictions = torch.cat(all_predictions).numpy()
        targets = torch.cat(all_targets).double().numpy()
        rmse = np.sqrt(mean_squared_error(targets, predictions))
        print(f'Epoch {epoch+1}/{num_epochs}, RMSE on test set: {rmse}')

    return model


Using device: cuda


In [10]:
# Run 1: embedding size 256, 15 epochs.
# In the recorded output below, test RMSE is lowest at epoch 7 (~1.130) and rises
# afterwards while the training loss keeps falling.
train_embedding(256,15)

Epoch 1/15: 100%|██████████| 14516/14516 [02:29<00:00, 97.12it/s]


Epoch 1/15, Avg. Loss: 1.3960


Testing Epoch 1: 100%|██████████| 3629/3629 [00:16<00:00, 220.10it/s]


Epoch 1/15, RMSE on test set: 1.1551428258345198


Epoch 2/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.43it/s]


Epoch 2/15, Avg. Loss: 1.3219


Testing Epoch 2: 100%|██████████| 3629/3629 [00:15<00:00, 230.43it/s]


Epoch 2/15, RMSE on test set: 1.1494479858054807


Epoch 3/15: 100%|██████████| 14516/14516 [02:31<00:00, 96.06it/s]


Epoch 3/15, Avg. Loss: 1.2973


Testing Epoch 3: 100%|██████████| 3629/3629 [00:15<00:00, 228.47it/s]


Epoch 3/15, RMSE on test set: 1.1429243545510706


Epoch 4/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.39it/s]


Epoch 4/15, Avg. Loss: 1.2695


Testing Epoch 4: 100%|██████████| 3629/3629 [00:15<00:00, 229.88it/s]


Epoch 4/15, RMSE on test set: 1.1385210380571094


Epoch 5/15: 100%|██████████| 14516/14516 [02:31<00:00, 96.06it/s]


Epoch 5/15, Avg. Loss: 1.2371


Testing Epoch 5: 100%|██████████| 3629/3629 [00:16<00:00, 214.52it/s]


Epoch 5/15, RMSE on test set: 1.136758482663746


Epoch 6/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.29it/s]


Epoch 6/15, Avg. Loss: 1.1916


Testing Epoch 6: 100%|██████████| 3629/3629 [00:16<00:00, 224.12it/s]


Epoch 6/15, RMSE on test set: 1.1328031510232786


Epoch 7/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.41it/s]


Epoch 7/15, Avg. Loss: 1.1363


Testing Epoch 7: 100%|██████████| 3629/3629 [00:16<00:00, 224.98it/s]


Epoch 7/15, RMSE on test set: 1.130089657231166


Epoch 8/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.35it/s]


Epoch 8/15, Avg. Loss: 1.0791


Testing Epoch 8: 100%|██████████| 3629/3629 [00:15<00:00, 227.37it/s]


Epoch 8/15, RMSE on test set: 1.133493292482769


Epoch 9/15: 100%|██████████| 14516/14516 [02:30<00:00, 96.45it/s]


Epoch 9/15, Avg. Loss: 1.0243


Testing Epoch 9: 100%|██████████| 3629/3629 [00:15<00:00, 228.80it/s]


Epoch 9/15, RMSE on test set: 1.1379027251289173


Epoch 10/15: 100%|██████████| 14516/14516 [02:29<00:00, 96.85it/s]


Epoch 10/15, Avg. Loss: 0.9738


Testing Epoch 10: 100%|██████████| 3629/3629 [00:16<00:00, 224.00it/s]


Epoch 10/15, RMSE on test set: 1.142928239608011


Epoch 11/15: 100%|██████████| 14516/14516 [02:33<00:00, 94.59it/s]


Epoch 11/15, Avg. Loss: 0.9287


Testing Epoch 11: 100%|██████████| 3629/3629 [00:16<00:00, 226.80it/s]


Epoch 11/15, RMSE on test set: 1.1486422160771907


Epoch 12/15: 100%|██████████| 14516/14516 [02:32<00:00, 95.14it/s]


Epoch 12/15, Avg. Loss: 0.8874


Testing Epoch 12: 100%|██████████| 3629/3629 [00:16<00:00, 222.94it/s]


Epoch 12/15, RMSE on test set: 1.1556731038562404


Epoch 13/15: 100%|██████████| 14516/14516 [02:32<00:00, 94.88it/s]


Epoch 13/15, Avg. Loss: 0.8498


Testing Epoch 13: 100%|██████████| 3629/3629 [00:16<00:00, 220.72it/s]


Epoch 13/15, RMSE on test set: 1.1614412994440937


Epoch 14/15: 100%|██████████| 14516/14516 [02:34<00:00, 94.19it/s]


Epoch 14/15, Avg. Loss: 0.8160


Testing Epoch 14: 100%|██████████| 3629/3629 [00:16<00:00, 220.15it/s]


Epoch 14/15, RMSE on test set: 1.1634745256724568


Epoch 15/15: 100%|██████████| 14516/14516 [02:42<00:00, 89.37it/s]


Epoch 15/15, Avg. Loss: 0.7857


Testing Epoch 15: 100%|██████████| 3629/3629 [00:17<00:00, 205.00it/s]

Epoch 15/15, RMSE on test set: 1.1792196736933527





In [11]:
# Run 2: embedding size 512, 15 epochs.
# In the recorded output below, test RMSE is lowest at epoch 8 (~1.124) and rises
# afterwards while the training loss keeps falling.
train_embedding(512,15)

Epoch 1/15: 100%|██████████| 14516/14516 [03:53<00:00, 62.15it/s]


Epoch 1/15, Avg. Loss: 1.3881


Testing Epoch 1: 100%|██████████| 3629/3629 [00:17<00:00, 202.44it/s]


Epoch 1/15, RMSE on test set: 1.1568221511962649


Epoch 2/15: 100%|██████████| 14516/14516 [04:06<00:00, 58.90it/s]


Epoch 2/15, Avg. Loss: 1.3189


Testing Epoch 2: 100%|██████████| 3629/3629 [00:17<00:00, 204.02it/s]


Epoch 2/15, RMSE on test set: 1.1506079957650446


Epoch 3/15: 100%|██████████| 14516/14516 [04:06<00:00, 58.97it/s]


Epoch 3/15, Avg. Loss: 1.2851


Testing Epoch 3: 100%|██████████| 3629/3629 [00:18<00:00, 201.35it/s]


Epoch 3/15, RMSE on test set: 1.140142759150556


Epoch 4/15: 100%|██████████| 14516/14516 [04:02<00:00, 59.76it/s]


Epoch 4/15, Avg. Loss: 1.2511


Testing Epoch 4: 100%|██████████| 3629/3629 [00:16<00:00, 219.29it/s]


Epoch 4/15, RMSE on test set: 1.1336910103192137


Epoch 5/15: 100%|██████████| 14516/14516 [03:43<00:00, 64.92it/s]


Epoch 5/15, Avg. Loss: 1.2048


Testing Epoch 5: 100%|██████████| 3629/3629 [00:16<00:00, 221.77it/s]


Epoch 5/15, RMSE on test set: 1.1325256770578502


Epoch 6/15: 100%|██████████| 14516/14516 [03:41<00:00, 65.46it/s]


Epoch 6/15, Avg. Loss: 1.1404


Testing Epoch 6: 100%|██████████| 3629/3629 [00:16<00:00, 219.53it/s]


Epoch 6/15, RMSE on test set: 1.1270001316354632


Epoch 7/15: 100%|██████████| 14516/14516 [03:44<00:00, 64.73it/s]


Epoch 7/15, Avg. Loss: 1.0717


Testing Epoch 7: 100%|██████████| 3629/3629 [00:16<00:00, 223.73it/s]


Epoch 7/15, RMSE on test set: 1.1253333585713383


Epoch 8/15: 100%|██████████| 14516/14516 [03:39<00:00, 66.01it/s]


Epoch 8/15, Avg. Loss: 1.0059


Testing Epoch 8: 100%|██████████| 3629/3629 [00:16<00:00, 225.20it/s]


Epoch 8/15, RMSE on test set: 1.1239250139301327


Epoch 9/15: 100%|██████████| 14516/14516 [03:40<00:00, 65.93it/s]


Epoch 9/15, Avg. Loss: 0.9458


Testing Epoch 9: 100%|██████████| 3629/3629 [00:16<00:00, 220.05it/s]


Epoch 9/15, RMSE on test set: 1.13464333595831


Epoch 10/15: 100%|██████████| 14516/14516 [03:43<00:00, 64.85it/s]


Epoch 10/15, Avg. Loss: 0.8921


Testing Epoch 10: 100%|██████████| 3629/3629 [00:18<00:00, 200.02it/s]


Epoch 10/15, RMSE on test set: 1.1397496752791838


Epoch 11/15: 100%|██████████| 14516/14516 [03:49<00:00, 63.33it/s]


Epoch 11/15, Avg. Loss: 0.8431


Testing Epoch 11: 100%|██████████| 3629/3629 [00:17<00:00, 207.02it/s]


Epoch 11/15, RMSE on test set: 1.1467706823476629


Epoch 12/15: 100%|██████████| 14516/14516 [03:47<00:00, 63.78it/s]


Epoch 12/15, Avg. Loss: 0.7996


Testing Epoch 12: 100%|██████████| 3629/3629 [00:18<00:00, 201.42it/s]


Epoch 12/15, RMSE on test set: 1.1536775086121727


Epoch 13/15: 100%|██████████| 14516/14516 [03:46<00:00, 63.96it/s]


Epoch 13/15, Avg. Loss: 0.7610


Testing Epoch 13: 100%|██████████| 3629/3629 [00:16<00:00, 215.95it/s]


Epoch 13/15, RMSE on test set: 1.1620036814906396


Epoch 14/15: 100%|██████████| 14516/14516 [03:41<00:00, 65.50it/s]


Epoch 14/15, Avg. Loss: 0.7267


Testing Epoch 14: 100%|██████████| 3629/3629 [00:16<00:00, 222.03it/s]


Epoch 14/15, RMSE on test set: 1.1750405574963119


Epoch 15/15: 100%|██████████| 14516/14516 [03:40<00:00, 65.90it/s]


Epoch 15/15, Avg. Loss: 0.6955


Testing Epoch 15: 100%|██████████| 3629/3629 [00:16<00:00, 219.05it/s]


Epoch 15/15, RMSE on test set: 1.1698114226369525


In [12]:
# Run 3: embedding size 768, 15 epochs.
# In the recorded output below, test RMSE is lowest at epoch 6 (~1.118) and trends
# upwards afterwards while the training loss keeps falling.
train_embedding(768,15)

Epoch 1/15: 100%|██████████| 14516/14516 [04:49<00:00, 50.09it/s]


Epoch 1/15, Avg. Loss: 1.3875


Testing Epoch 1: 100%|██████████| 3629/3629 [00:16<00:00, 215.05it/s]


Epoch 1/15, RMSE on test set: 1.1552204464044353


Epoch 2/15: 100%|██████████| 14516/14516 [04:54<00:00, 49.29it/s]


Epoch 2/15, Avg. Loss: 1.3184


Testing Epoch 2: 100%|██████████| 3629/3629 [00:16<00:00, 219.67it/s]


Epoch 2/15, RMSE on test set: 1.1499093709460027


Epoch 3/15: 100%|██████████| 14516/14516 [04:53<00:00, 49.49it/s]


Epoch 3/15, Avg. Loss: 1.2833


Testing Epoch 3: 100%|██████████| 3629/3629 [00:16<00:00, 221.36it/s]


Epoch 3/15, RMSE on test set: 1.1379131549710932


Epoch 4/15: 100%|██████████| 14516/14516 [04:53<00:00, 49.52it/s]


Epoch 4/15, Avg. Loss: 1.2476


Testing Epoch 4: 100%|██████████| 3629/3629 [00:16<00:00, 214.49it/s]


Epoch 4/15, RMSE on test set: 1.1304087086937062


Epoch 5/15: 100%|██████████| 14516/14516 [04:53<00:00, 49.50it/s]


Epoch 5/15, Avg. Loss: 1.1985


Testing Epoch 5: 100%|██████████| 3629/3629 [00:16<00:00, 214.83it/s]


Epoch 5/15, RMSE on test set: 1.1213107868043974


Epoch 6/15: 100%|██████████| 14516/14516 [04:56<00:00, 48.88it/s]


Epoch 6/15, Avg. Loss: 1.1347


Testing Epoch 6: 100%|██████████| 3629/3629 [00:16<00:00, 218.40it/s]


Epoch 6/15, RMSE on test set: 1.1175968681583988


Epoch 7/15: 100%|██████████| 14516/14516 [04:55<00:00, 49.09it/s]


Epoch 7/15, Avg. Loss: 1.0648


Testing Epoch 7: 100%|██████████| 3629/3629 [00:16<00:00, 218.06it/s]


Epoch 7/15, RMSE on test set: 1.1257906898424168


Epoch 8/15: 100%|██████████| 14516/14516 [04:55<00:00, 49.14it/s]


Epoch 8/15, Avg. Loss: 0.9941


Testing Epoch 8: 100%|██████████| 3629/3629 [00:16<00:00, 222.35it/s]


Epoch 8/15, RMSE on test set: 1.1199523770225819


Epoch 9/15: 100%|██████████| 14516/14516 [04:55<00:00, 49.18it/s]


Epoch 9/15, Avg. Loss: 0.9272


Testing Epoch 9: 100%|██████████| 3629/3629 [00:16<00:00, 220.22it/s]


Epoch 9/15, RMSE on test set: 1.1306638461034675


Epoch 10/15: 100%|██████████| 14516/14516 [04:55<00:00, 49.17it/s]


Epoch 10/15, Avg. Loss: 0.8655


Testing Epoch 10: 100%|██████████| 3629/3629 [00:16<00:00, 220.69it/s]


Epoch 10/15, RMSE on test set: 1.1357129168671445


Epoch 11/15: 100%|██████████| 14516/14516 [04:54<00:00, 49.28it/s]


Epoch 11/15, Avg. Loss: 0.8098


Testing Epoch 11: 100%|██████████| 3629/3629 [00:16<00:00, 222.85it/s]


Epoch 11/15, RMSE on test set: 1.1413270770421586


Epoch 12/15: 100%|██████████| 14516/14516 [04:54<00:00, 49.31it/s]


Epoch 12/15, Avg. Loss: 0.7593


Testing Epoch 12: 100%|██████████| 3629/3629 [00:16<00:00, 220.96it/s]


Epoch 12/15, RMSE on test set: 1.1512296674284093


Epoch 13/15: 100%|██████████| 14516/14516 [04:55<00:00, 49.19it/s]


Epoch 13/15, Avg. Loss: 0.7143


Testing Epoch 13: 100%|██████████| 3629/3629 [00:16<00:00, 218.98it/s]


Epoch 13/15, RMSE on test set: 1.1613064549034815


Epoch 14/15: 100%|██████████| 14516/14516 [04:56<00:00, 48.95it/s]


Epoch 14/15, Avg. Loss: 0.6740


Testing Epoch 14: 100%|██████████| 3629/3629 [00:16<00:00, 217.58it/s]


Epoch 14/15, RMSE on test set: 1.1691534628078486


Epoch 15/15: 100%|██████████| 14516/14516 [04:56<00:00, 49.02it/s]


Epoch 15/15, Avg. Loss: 0.6379


Testing Epoch 15: 100%|██████████| 3629/3629 [00:16<00:00, 218.97it/s]

Epoch 15/15, RMSE on test set: 1.180706578298664



