In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle

In [3]:
df = pd.read_csv('/content/drive/MyDrive/집교 2_Team P/user-track-listen_count_filtered5.csv')

In [5]:
# Pickle 파일 읽기
with open('/content/drive/MyDrive/집교 2_Team P/lyrics_Embedding/all_roberta-no_duplicate_full.pkl의 사본', 'rb') as file:
    data = pickle.load(file)

# DataFrame으로 변환
df_embedding = pd.DataFrame(data, columns=['embedding', 'track_id'])

# track_id를 정수로 변환 (필요하다면)
df_embedding['track_id'] = df_embedding['track_id'].astype(int)

# 'embedding' 열을 768차원의 각 차원으로 나누기
# df_embedding[['embedding_{}'.format(i) for i in range(768)]] = pd.DataFrame(df_embedding['embedding'].tolist(), index=df_embedding.index)

# 'embedding' 열 삭제
# df_embedding = df_embedding.drop(['embedding'], axis=1)

# DataFrame 확인
print(df_embedding.head())


                                           embedding  track_id
0  [-0.10998139, 0.004360262, -0.013837141, -0.12...         2
1  [-0.09321758, 0.026769742, -0.014905005, -0.17...         8
2  [-0.096894965, 0.013164954, -0.009375575, -0.0...      1524
3  [-0.098773316, 0.038653724, -0.0150319515, -0....      1785
4  [-0.09078942, -0.005732551, -0.007893708, -0.1...      1787


In [6]:
print(df.shape)
df = pd.merge(df, df_embedding, on='track_id', how='inner')
df.shape

(4645010, 4)


(4644051, 5)

In [7]:
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()
lyrics_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['track_id'] = track_encoder.fit_transform(df['track_id'])
df_embedding['track_id'] = lyrics_encoder.fit_transform(df_embedding['track_id'])

In [8]:
lyrics_dict = dict(zip(df_embedding['track_id'], df_embedding['embedding']))

In [9]:
# !pip install torch torchvision -U

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from tqdm import tqdm

# 데이터 불러오기 (예시: CSV 파일)
# Label Encoding
# PyTorch DataLoader에 맞게 데이터 변환
def df_to_tensor(dataset):
    users = torch.tensor(dataset['user_id'].values, dtype=torch.int)
    items = torch.tensor(dataset['track_id'].values, dtype=torch.int)
    ratings = torch.tensor(dataset['listen_count_bin'].values, dtype=torch.float)
    lyrics_embeddings = torch.tensor(np.vstack(dataset['embedding'].values), dtype=torch.float)
    return users, items, ratings, lyrics_embeddings

train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(test_df,test_size=0.5, random_state=42)

train_users, train_items, train_ratings, train_lyrics_embeddings = df_to_tensor(train_df)
val_users,val_items,val_ratings,val_lyrics_embeddings = df_to_tensor(val_df)
test_users, test_items, test_ratings,test_lyrics_embeddings = df_to_tensor(test_df)


train_data = TensorDataset(train_users, train_items, train_ratings,train_lyrics_embeddings)
val_data = TensorDataset(val_users,val_items,val_ratings,val_lyrics_embeddings)
test_data = TensorDataset(test_users, test_items, test_ratings,test_lyrics_embeddings)

train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
val_loader = DataLoader(val_data, batch_size=256, shuffle=True)
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)
num_users = (df['user_id'].nunique())
num_items = (df['track_id'].nunique())
print(num_users)
print(num_items)

23761
28309


In [10]:
# NCF 모델 정의
class NCF_embedding(nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(NCF_embedding, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.lyrics_embedding = nn.Linear(768, embedding_size)
        self.fc_layers = nn.Sequential(
            nn.Linear(embedding_size * 3, embedding_size),
            nn.ReLU(),
            nn.Linear(embedding_size, int(embedding_size/2)),
            nn.ReLU(),
            nn.Linear(int(embedding_size/2), int(embedding_size/4)),
            nn.ReLU(),
            nn.Linear(int(embedding_size/4), int(embedding_size/8)),
            nn.ReLU(),
            nn.Linear(int(embedding_size/8), 1)
        )

    def forward(self, user, item, lyrics_embedding):
        user_embedding = self.user_embedding(user)
        item_embedding = self.item_embedding(item)
        lyrics_embedding = self.lyrics_embedding(lyrics_embedding.reshape(lyrics_embedding.shape[0],768))
        x = torch.cat((user_embedding, item_embedding,lyrics_embedding), dim=1)
        x = self.fc_layers(x)
        return x

# CUDA 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
def train_embedding(embedding,n_epoch):
  # NCF 모델 정의 및 GPU로 이동
  model = NCF_embedding(num_users=num_users, num_items=num_items, embedding_size=embedding)
  model.to(device)
  criterion = nn.MSELoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)
  # tqdm을 사용하여 학습 및 테스트 진행 상황 확인
  num_epochs = n_epoch
  min_loss = 100
  cnt = 0
  for epoch in range(num_epochs):
      model.train()
      total_loss = 0
      for user, item, rating,embedding in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
          optimizer.zero_grad()
          user, item, rating,embedding = user.to(device), item.to(device), rating.to(device),embedding.to(device)  # GPU로 이동
          output = model(user, item,embedding)
          loss = criterion(output, rating.unsqueeze(1))
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

      avg_loss = total_loss / len(train_loader)
      print(f'Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_loss:.4f}')

     # Validation
      model.eval()
      total_val_loss = 0
      all_predictions = []
      with torch.no_grad():
          for user, item, rating, embedding in tqdm(val_loader, desc=f'Validation Epoch {epoch+1}'):
              user, item, rating, embedding = user.to(device), item.to(device), rating.to(device), embedding.to(device)  # GPU로 이동
              output = model(user, item, embedding)
              val_loss = criterion(output, rating.unsqueeze(1))
              total_val_loss += val_loss.item()
              all_predictions.append(output)

      # RMSE 계산
      predictions = torch.cat(all_predictions).squeeze().cpu().numpy()  # CPU로 이동 후 numpy로 변환
      rmse = np.sqrt(mean_squared_error(val_df['listen_count_bin'].values, predictions))
      avg_val_loss = total_val_loss / len(val_loader)
      print(f'Validation Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_val_loss:.4f}')
      print(f'Validation Epoch {epoch+1}/{num_epochs}, RMSE on validation set: {rmse}')
      if avg_val_loss < min_loss:
          min_loss = avg_val_loss
          cnt = 0
      else:
          cnt += 1
      if cnt == 2:
        print("val_loss did not decrease")
        break

  # Test
  model.eval()
  all_predictions = []
  with torch.no_grad():
      for user, item, _ , embedding in tqdm(test_loader, desc=f'Testing'):
          user, item, embedding = user.to(device), item.to(device), embedding.to(device)  # GPU로 이동
          output = model(user, item,embedding)
          all_predictions.append(output)

  # RMSE 계산
  predictions = torch.cat(all_predictions).squeeze().cpu().numpy()  # CPU로 이동 후 numpy로 변환
  rmse = np.sqrt(mean_squared_error(test_df['listen_count_bin'].values, predictions))
  print(f'Final RMSE on test set: {rmse}')

Using device: cuda


In [11]:
train_embedding(64,15)

Epoch 1/15: 100%|██████████| 12699/12699 [01:34<00:00, 134.41it/s]


Epoch 1/15, Avg. Loss: 1.4259


Validation Epoch 1: 100%|██████████| 2722/2722 [00:13<00:00, 202.12it/s]


Validation Epoch 1/15, Avg. Loss: 1.3365
Validation Epoch 1/15, RMSE on validation set: 1.3920209715852623


Epoch 2/15: 100%|██████████| 12699/12699 [01:36<00:00, 131.79it/s]


Epoch 2/15, Avg. Loss: 1.3235


Validation Epoch 2: 100%|██████████| 2722/2722 [00:12<00:00, 211.18it/s]


Validation Epoch 2/15, Avg. Loss: 1.3260
Validation Epoch 2/15, RMSE on validation set: 1.3944129337236013


Epoch 3/15: 100%|██████████| 12699/12699 [01:35<00:00, 133.42it/s]


Epoch 3/15, Avg. Loss: 1.3124


Validation Epoch 3: 100%|██████████| 2722/2722 [00:12<00:00, 210.88it/s]


Validation Epoch 3/15, Avg. Loss: 1.3239
Validation Epoch 3/15, RMSE on validation set: 1.3922486864689905


Epoch 4/15: 100%|██████████| 12699/12699 [01:35<00:00, 133.15it/s]


Epoch 4/15, Avg. Loss: 1.3055


Validation Epoch 4: 100%|██████████| 2722/2722 [00:12<00:00, 212.00it/s]


Validation Epoch 4/15, Avg. Loss: 1.3229
Validation Epoch 4/15, RMSE on validation set: 1.4184358128018442


Epoch 5/15: 100%|██████████| 12699/12699 [01:36<00:00, 132.27it/s]


Epoch 5/15, Avg. Loss: 1.2966


Validation Epoch 5: 100%|██████████| 2722/2722 [00:13<00:00, 206.37it/s]


Validation Epoch 5/15, Avg. Loss: 1.3166
Validation Epoch 5/15, RMSE on validation set: 1.4062279246861176


Epoch 6/15: 100%|██████████| 12699/12699 [01:35<00:00, 132.85it/s]


Epoch 6/15, Avg. Loss: 1.2839


Validation Epoch 6: 100%|██████████| 2722/2722 [00:13<00:00, 208.90it/s]


Validation Epoch 6/15, Avg. Loss: 1.3102
Validation Epoch 6/15, RMSE on validation set: 1.4060438653150298


Epoch 7/15: 100%|██████████| 12699/12699 [01:36<00:00, 131.93it/s]


Epoch 7/15, Avg. Loss: 1.2672


Validation Epoch 7: 100%|██████████| 2722/2722 [00:13<00:00, 208.24it/s]


Validation Epoch 7/15, Avg. Loss: 1.3070
Validation Epoch 7/15, RMSE on validation set: 1.3976500543748938


Epoch 8/15: 100%|██████████| 12699/12699 [01:36<00:00, 131.66it/s]


Epoch 8/15, Avg. Loss: 1.2486


Validation Epoch 8: 100%|██████████| 2722/2722 [00:13<00:00, 207.72it/s]


Validation Epoch 8/15, Avg. Loss: 1.3026
Validation Epoch 8/15, RMSE on validation set: 1.4327991256557373


Epoch 9/15: 100%|██████████| 12699/12699 [01:36<00:00, 131.98it/s]


Epoch 9/15, Avg. Loss: 1.2279


Validation Epoch 9: 100%|██████████| 2722/2722 [00:13<00:00, 209.32it/s]


Validation Epoch 9/15, Avg. Loss: 1.3002
Validation Epoch 9/15, RMSE on validation set: 1.4383646400034038


Epoch 10/15: 100%|██████████| 12699/12699 [01:35<00:00, 132.35it/s]


Epoch 10/15, Avg. Loss: 1.2060


Validation Epoch 10: 100%|██████████| 2722/2722 [00:13<00:00, 208.92it/s]


Validation Epoch 10/15, Avg. Loss: 1.2987
Validation Epoch 10/15, RMSE on validation set: 1.4473405609194616


Epoch 11/15: 100%|██████████| 12699/12699 [01:36<00:00, 132.11it/s]


Epoch 11/15, Avg. Loss: 1.1834


Validation Epoch 11: 100%|██████████| 2722/2722 [00:13<00:00, 202.30it/s]


Validation Epoch 11/15, Avg. Loss: 1.2892
Validation Epoch 11/15, RMSE on validation set: 1.437532421501751


Epoch 12/15: 100%|██████████| 12699/12699 [01:35<00:00, 132.65it/s]


Epoch 12/15, Avg. Loss: 1.1605


Validation Epoch 12: 100%|██████████| 2722/2722 [00:13<00:00, 206.89it/s]


Validation Epoch 12/15, Avg. Loss: 1.2959
Validation Epoch 12/15, RMSE on validation set: 1.4539776102888309


Epoch 13/15: 100%|██████████| 12699/12699 [01:35<00:00, 132.35it/s]


Epoch 13/15, Avg. Loss: 1.1372


Validation Epoch 13: 100%|██████████| 2722/2722 [00:13<00:00, 206.58it/s]


Validation Epoch 13/15, Avg. Loss: 1.2968
Validation Epoch 13/15, RMSE on validation set: 1.4511107724614296
val_loss did not decrease


Testing: 100%|██████████| 2722/2722 [00:12<00:00, 222.20it/s]

Final RMSE on test set: 1.13773050042389





In [12]:
train_embedding(256,15)

Epoch 1/15: 100%|██████████| 12699/12699 [02:15<00:00, 93.86it/s]


Epoch 1/15, Avg. Loss: 1.4037


Validation Epoch 1: 100%|██████████| 2722/2722 [00:13<00:00, 207.90it/s]


Validation Epoch 1/15, Avg. Loss: 1.3364
Validation Epoch 1/15, RMSE on validation set: 1.388221091928835


Epoch 2/15: 100%|██████████| 12699/12699 [02:18<00:00, 91.60it/s]


Epoch 2/15, Avg. Loss: 1.3245


Validation Epoch 2: 100%|██████████| 2722/2722 [00:13<00:00, 208.02it/s]


Validation Epoch 2/15, Avg. Loss: 1.3280
Validation Epoch 2/15, RMSE on validation set: 1.3927184206540373


Epoch 3/15: 100%|██████████| 12699/12699 [02:18<00:00, 91.96it/s]


Epoch 3/15, Avg. Loss: 1.3050


Validation Epoch 3: 100%|██████████| 2722/2722 [00:13<00:00, 206.87it/s]


Validation Epoch 3/15, Avg. Loss: 1.3140
Validation Epoch 3/15, RMSE on validation set: 1.3881117646376209


Epoch 4/15: 100%|██████████| 12699/12699 [02:18<00:00, 91.49it/s]


Epoch 4/15, Avg. Loss: 1.2719


Validation Epoch 4: 100%|██████████| 2722/2722 [00:13<00:00, 203.06it/s]


Validation Epoch 4/15, Avg. Loss: 1.2994
Validation Epoch 4/15, RMSE on validation set: 1.4177576668123313


Epoch 5/15: 100%|██████████| 12699/12699 [02:17<00:00, 92.13it/s]


Epoch 5/15, Avg. Loss: 1.2241


Validation Epoch 5: 100%|██████████| 2722/2722 [00:13<00:00, 207.87it/s]


Validation Epoch 5/15, Avg. Loss: 1.2928
Validation Epoch 5/15, RMSE on validation set: 1.4314307531527033


Epoch 6/15: 100%|██████████| 12699/12699 [02:17<00:00, 92.02it/s]


Epoch 6/15, Avg. Loss: 1.1588


Validation Epoch 6: 100%|██████████| 2722/2722 [00:13<00:00, 208.38it/s]


Validation Epoch 6/15, Avg. Loss: 1.2892
Validation Epoch 6/15, RMSE on validation set: 1.4400332358367312


Epoch 7/15: 100%|██████████| 12699/12699 [02:18<00:00, 91.98it/s]


Epoch 7/15, Avg. Loss: 1.0825


Validation Epoch 7: 100%|██████████| 2722/2722 [00:13<00:00, 209.27it/s]


Validation Epoch 7/15, Avg. Loss: 1.2966
Validation Epoch 7/15, RMSE on validation set: 1.4464444871159507


Epoch 8/15: 100%|██████████| 12699/12699 [02:18<00:00, 91.76it/s]


Epoch 8/15, Avg. Loss: 1.0062


Validation Epoch 8: 100%|██████████| 2722/2722 [00:12<00:00, 210.53it/s]


Validation Epoch 8/15, Avg. Loss: 1.3162
Validation Epoch 8/15, RMSE on validation set: 1.4840152869555
val_loss did not decrease


Testing: 100%|██████████| 2722/2722 [00:12<00:00, 222.45it/s]

Final RMSE on test set: 1.1463451886581018





In [13]:
train_embedding(512,15)

Epoch 1/15: 100%|██████████| 12699/12699 [03:13<00:00, 65.49it/s]


Epoch 1/15, Avg. Loss: 1.3969


Validation Epoch 1: 100%|██████████| 2722/2722 [00:13<00:00, 198.85it/s]


Validation Epoch 1/15, Avg. Loss: 1.3390
Validation Epoch 1/15, RMSE on validation set: 1.4043070621438214


Epoch 2/15: 100%|██████████| 12699/12699 [03:17<00:00, 64.21it/s]


Epoch 2/15, Avg. Loss: 1.3243


Validation Epoch 2: 100%|██████████| 2722/2722 [00:13<00:00, 207.95it/s]


Validation Epoch 2/15, Avg. Loss: 1.3243
Validation Epoch 2/15, RMSE on validation set: 1.4077598531593816


Epoch 3/15: 100%|██████████| 12699/12699 [03:18<00:00, 64.12it/s]


Epoch 3/15, Avg. Loss: 1.2939


Validation Epoch 3: 100%|██████████| 2722/2722 [00:13<00:00, 207.77it/s]


Validation Epoch 3/15, Avg. Loss: 1.3133
Validation Epoch 3/15, RMSE on validation set: 1.3866871579470645


Epoch 4/15: 100%|██████████| 12699/12699 [03:17<00:00, 64.23it/s]


Epoch 4/15, Avg. Loss: 1.2470


Validation Epoch 4: 100%|██████████| 2722/2722 [00:13<00:00, 207.32it/s]


Validation Epoch 4/15, Avg. Loss: 1.2853
Validation Epoch 4/15, RMSE on validation set: 1.4255562823986396


Epoch 5/15: 100%|██████████| 12699/12699 [03:16<00:00, 64.49it/s]


Epoch 5/15, Avg. Loss: 1.1754


Validation Epoch 5: 100%|██████████| 2722/2722 [00:13<00:00, 207.08it/s]


Validation Epoch 5/15, Avg. Loss: 1.2685
Validation Epoch 5/15, RMSE on validation set: 1.4458135450739613


Epoch 6/15: 100%|██████████| 12699/12699 [03:17<00:00, 64.45it/s]


Epoch 6/15, Avg. Loss: 1.0823


Validation Epoch 6: 100%|██████████| 2722/2722 [00:13<00:00, 208.40it/s]


Validation Epoch 6/15, Avg. Loss: 1.2733
Validation Epoch 6/15, RMSE on validation set: 1.462488376479321


Epoch 7/15: 100%|██████████| 12699/12699 [03:16<00:00, 64.68it/s]


Epoch 7/15, Avg. Loss: 0.9860


Validation Epoch 7: 100%|██████████| 2722/2722 [00:12<00:00, 210.31it/s]


Validation Epoch 7/15, Avg. Loss: 1.2896
Validation Epoch 7/15, RMSE on validation set: 1.5025257052014833
val_loss did not decrease


Testing: 100%|██████████| 2722/2722 [00:12<00:00, 223.05it/s]

Final RMSE on test set: 1.1355729258850877





In [14]:
train_embedding(768,15)

Epoch 1/15: 100%|██████████| 12699/12699 [04:16<00:00, 49.43it/s]


Epoch 1/15, Avg. Loss: 1.3935


Validation Epoch 1: 100%|██████████| 2722/2722 [00:13<00:00, 196.80it/s]


Validation Epoch 1/15, Avg. Loss: 1.3372
Validation Epoch 1/15, RMSE on validation set: 1.3939804992764802


Epoch 2/15: 100%|██████████| 12699/12699 [04:21<00:00, 48.54it/s]


Epoch 2/15, Avg. Loss: 1.3236


Validation Epoch 2: 100%|██████████| 2722/2722 [00:13<00:00, 196.16it/s]


Validation Epoch 2/15, Avg. Loss: 1.3221
Validation Epoch 2/15, RMSE on validation set: 1.4059452570717683


Epoch 3/15: 100%|██████████| 12699/12699 [04:22<00:00, 48.30it/s]


Epoch 3/15, Avg. Loss: 1.2879


Validation Epoch 3: 100%|██████████| 2722/2722 [00:13<00:00, 194.95it/s]


Validation Epoch 3/15, Avg. Loss: 1.2962
Validation Epoch 3/15, RMSE on validation set: 1.4186362327355753


Epoch 4/15: 100%|██████████| 12699/12699 [04:23<00:00, 48.26it/s]


Epoch 4/15, Avg. Loss: 1.2364


Validation Epoch 4: 100%|██████████| 2722/2722 [00:13<00:00, 200.07it/s]


Validation Epoch 4/15, Avg. Loss: 1.2745
Validation Epoch 4/15, RMSE on validation set: 1.4266485444732655


Epoch 5/15: 100%|██████████| 12699/12699 [04:22<00:00, 48.32it/s]


Epoch 5/15, Avg. Loss: 1.1606


Validation Epoch 5: 100%|██████████| 2722/2722 [00:13<00:00, 200.11it/s]


Validation Epoch 5/15, Avg. Loss: 1.2582
Validation Epoch 5/15, RMSE on validation set: 1.4465433226611144


Epoch 6/15: 100%|██████████| 12699/12699 [04:21<00:00, 48.50it/s]


Epoch 6/15, Avg. Loss: 1.0628


Validation Epoch 6: 100%|██████████| 2722/2722 [00:13<00:00, 197.95it/s]


Validation Epoch 6/15, Avg. Loss: 1.2618
Validation Epoch 6/15, RMSE on validation set: 1.477442903406309


Epoch 7/15: 100%|██████████| 12699/12699 [04:22<00:00, 48.46it/s]


Epoch 7/15, Avg. Loss: 0.9594


Validation Epoch 7: 100%|██████████| 2722/2722 [00:13<00:00, 200.84it/s]


Validation Epoch 7/15, Avg. Loss: 1.2869
Validation Epoch 7/15, RMSE on validation set: 1.5105513948670217
val_loss did not decrease


Testing: 100%|██████████| 2722/2722 [00:12<00:00, 225.14it/s]

Final RMSE on test set: 1.1344034364133844



