In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json
import pickle

In [4]:
df = pd.read_csv('/content/drive/MyDrive/집교 2_Team P/user-track-listen_count_filtered5.csv')

In [5]:
# Pickle 파일 읽기
with open('/content/drive/MyDrive/집교 2_Team P/lyrics_Embedding/all_embeddings_full.pkl', 'rb') as file:
    data = pickle.load(file)

# DataFrame으로 변환
df_embedding = pd.DataFrame(data, columns=['embedding', 'track_id'])

# track_id를 정수로 변환 (필요하다면)
df_embedding['track_id'] = df_embedding['track_id'].astype(int)

# 'embedding' 열을 768차원의 각 차원으로 나누기
# df_embedding[['embedding_{}'.format(i) for i in range(768)]] = pd.DataFrame(df_embedding['embedding'].tolist(), index=df_embedding.index)

# 'embedding' 열 삭제
# df_embedding = df_embedding.drop(['embedding'], axis=1)

# DataFrame 확인
print(df_embedding.head())


                                           embedding  track_id
0  [0.012072664, 0.17292306, 0.0061238254, 0.0707...         2
1  [-0.17554894, 0.24209566, 0.4195969, -0.185033...         8
2  [-0.096951924, 0.0034472912, 0.005701333, 0.01...      1524
3  [-0.21775067, 0.244962, 0.24090661, 0.1647732,...      1785
4  [-0.069424234, -0.016805744, 0.21406727, -0.27...      1787


In [6]:
print(df.shape)
df = pd.merge(df, df_embedding, on='track_id', how='inner')
df.shape

(4645010, 4)


(4644051, 5)

In [7]:
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
track_encoder = LabelEncoder()
lyrics_encoder = LabelEncoder()
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['track_id'] = track_encoder.fit_transform(df['track_id'])
df_embedding['track_id'] = lyrics_encoder.fit_transform(df_embedding['track_id'])

In [8]:
lyrics_dict = dict(zip(df_embedding['track_id'], df_embedding['embedding']))

In [9]:
# !pip install torch torchvision -U

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from tqdm import tqdm

# 데이터 불러오기 (예시: CSV 파일)
# Label Encoding
df.loc[df['listen_count_bin'] == '10~2704', 'listen_count_bin'] = 10
df['listen_count_bin'] = df['listen_count_bin'].astype(int)
# PyTorch DataLoader에 맞게 데이터 변환
def df_to_tensor(dataset):
    users = torch.tensor(dataset['user_id'].values, dtype=torch.int)
    items = torch.tensor(dataset['track_id'].values, dtype=torch.int)
    ratings = torch.tensor(dataset['listen_count_bin'].values, dtype=torch.float)
    lyrics_embeddings = torch.tensor(np.vstack(dataset['embedding'].values), dtype=torch.float)
    return users, items, ratings, lyrics_embeddings

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_users, train_items, train_ratings, train_lyrics_embeddings = df_to_tensor(train_df)
test_users, test_items, test_ratings,test_lyrics_embeddings = df_to_tensor(test_df)

train_data = TensorDataset(train_users, train_items, train_ratings,train_lyrics_embeddings)
test_data = TensorDataset(test_users, test_items, test_ratings,test_lyrics_embeddings)

train_loader = DataLoader(train_data, batch_size=256, shuffle=True)
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)
num_users = (df['user_id'].nunique())
num_items = (df['track_id'].nunique())
print(num_users)
print(num_items)

23761
28309


In [10]:
class NeuMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size, mlp_hidden_size):
        super(NeuMF, self).__init__()
        # Matrix Factorization
        self.user_embedding_mf = nn.Embedding(num_users, embedding_size)
        self.item_embedding_mf = nn.Embedding(num_items, embedding_size)
        # Multi-Layer Perceptron
        self.user_embedding_mlp = nn.Embedding(num_users, mlp_hidden_size)
        self.item_embedding_mlp = nn.Embedding(num_items, mlp_hidden_size)
        self.lyrics_embedding = nn.Linear(768, embedding_size)
        self.mlp_layers = nn.Sequential(
            nn.Linear(3 * mlp_hidden_size, mlp_hidden_size),
            nn.ReLU(),
            nn.Linear(mlp_hidden_size, int(mlp_hidden_size/2)),
            nn.ReLU(),
            nn.Linear(int(mlp_hidden_size/2), int(mlp_hidden_size/4)),
            nn.ReLU(),
            nn.Linear(int(mlp_hidden_size/4), int(mlp_hidden_size/8)),

        )
        # Final Layer
        self.final_layer = nn.Linear(int(mlp_hidden_size/8+embedding_size), 1)

    def forward(self, user, item,lyrics_embedding):
        # Matrix Factorization
        user_embedding_mf = self.user_embedding_mf(user)
        item_embedding_mf = self.item_embedding_mf(item)
        mf_output = torch.mul(user_embedding_mf, item_embedding_mf)

        # Multi-Layer Perceptron
        user_embedding_mlp = self.user_embedding_mlp(user)
        item_embedding_mlp = self.item_embedding_mlp(item)
        lyrics_embedding = self.lyrics_embedding(lyrics_embedding.reshape(lyrics_embedding.shape[0],768))
        mlp_input = torch.cat((user_embedding_mlp, item_embedding_mlp,lyrics_embedding), dim=1)
        mlp_output = self.mlp_layers(mlp_input)
        # Concatenate MF and MLP outputs
        final_input = torch.cat((mf_output, mlp_output), dim=1)

        # Final prediction
        prediction = self.final_layer(final_input)
        return prediction.view(-1)

# 학습 및 평가 코드는 이전과 유사
# ...
# CUDA 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
def train_(emb,mlp,n_epoch):
  embedding_size = emb
  mlp_hidden_size = mlp

  neumf_model = NeuMF(num_users, num_items, embedding_size, mlp_hidden_size)

  # 손실 함수 및 최적화 함수 정의
  criterion = nn.MSELoss()
  optimizer = optim.Adam(neumf_model.parameters(), lr=0.001)
  # NCF 모델 정의 및 GPU로 이동
  # model = NCF(num_users=len(user_encoder.classes_), num_items=len(item_encoder.classes_), embedding_size=embedding_size)
  neumf_model.to(device)
  criterion = nn.MSELoss()
  optimizer = optim.Adam(neumf_model.parameters(), lr=0.001)
  # tqdm을 사용하여 학습 및 테스트 진행 상황 확인
  num_epochs = n_epoch
  for epoch in range(num_epochs):
      neumf_model.train()
      total_loss = 0
      for user, item, rating,lyrics_embedding in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
          optimizer.zero_grad()
          user, item, rating,lyrics_embedding = user.to(device), item.to(device), rating.to(device),lyrics_embedding.to(device)  # GPU로 이동
          output = neumf_model(user, item,lyrics_embedding)
          loss = criterion(output, rating.unsqueeze(1))
          loss.backward()
          optimizer.step()
          total_loss += loss.item()

      avg_loss = total_loss / len(train_loader)
      print(f'Epoch {epoch+1}/{num_epochs}, Avg. Loss: {avg_loss:.4f}')

      # 각 에폭이 끝날 때마다 테스트 데이터에 대한 예측 수행
      neumf_model.eval()
      all_predictions = []
      with torch.no_grad():
          for user, item, _,lyrics_embedding in tqdm(test_loader, desc=f'Testing Epoch {epoch+1}'):
              user, item,lyrics_embedding = user.to(device), item.to(device),lyrics_embedding.to(device)  # GPU로 이동
              output = neumf_model(user, item,lyrics_embedding)
              all_predictions.append(output)

      # RMSE 계산
      predictions = torch.cat(all_predictions).squeeze().cpu().numpy()  # CPU로 이동 후 numpy로 변환
      rmse = np.sqrt(mean_squared_error(test_df['listen_count_bin'].values, predictions))
      print(f'Epoch {epoch+1}/{num_epochs}, RMSE on test set: {rmse}')

Using device: cuda


In [11]:
train_(64,64,10)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1/10: 100%|██████████| 14513/14513 [01:39<00:00, 145.17it/s]


Epoch 1/10, Avg. Loss: 1.6541


Testing Epoch 1: 100%|██████████| 3629/3629 [00:15<00:00, 229.75it/s]


Epoch 1/10, RMSE on test set: 1.280509463407143


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 2/10: 100%|██████████| 14513/14513 [01:40<00:00, 144.93it/s]


Epoch 2/10, Avg. Loss: 1.6459


Testing Epoch 2: 100%|██████████| 3629/3629 [00:15<00:00, 231.01it/s]


Epoch 2/10, RMSE on test set: 1.2808157682407189


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 3/10: 100%|██████████| 14513/14513 [01:40<00:00, 144.35it/s]


Epoch 3/10, Avg. Loss: 1.6437


Testing Epoch 3: 100%|██████████| 3629/3629 [00:15<00:00, 236.23it/s]


Epoch 3/10, RMSE on test set: 1.2803792505809994


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 4/10: 100%|██████████| 14513/14513 [01:41<00:00, 143.02it/s]


Epoch 4/10, Avg. Loss: 1.6430


Testing Epoch 4: 100%|██████████| 3629/3629 [00:15<00:00, 234.06it/s]


Epoch 4/10, RMSE on test set: 1.2803727214617995


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 5/10: 100%|██████████| 14513/14513 [01:40<00:00, 144.22it/s]


Epoch 5/10, Avg. Loss: 1.6429


Testing Epoch 5: 100%|██████████| 3629/3629 [00:15<00:00, 235.58it/s]


Epoch 5/10, RMSE on test set: 1.2804588615049213


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 6/10: 100%|██████████| 14513/14513 [01:42<00:00, 141.47it/s]


Epoch 6/10, Avg. Loss: 1.6430


Testing Epoch 6: 100%|██████████| 3629/3629 [00:15<00:00, 228.09it/s]


Epoch 6/10, RMSE on test set: 1.2804467843130043


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 7/10: 100%|██████████| 14513/14513 [01:42<00:00, 141.96it/s]


Epoch 7/10, Avg. Loss: 1.6429


Testing Epoch 7: 100%|██████████| 3629/3629 [00:16<00:00, 224.47it/s]


Epoch 7/10, RMSE on test set: 1.2803689573820536


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 8/10: 100%|██████████| 14513/14513 [01:41<00:00, 143.59it/s]


Epoch 8/10, Avg. Loss: 1.6429


Testing Epoch 8: 100%|██████████| 3629/3629 [00:15<00:00, 227.24it/s]


Epoch 8/10, RMSE on test set: 1.280348984062504


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 9/10: 100%|██████████| 14513/14513 [01:40<00:00, 144.30it/s]


Epoch 9/10, Avg. Loss: 1.6429


Testing Epoch 9: 100%|██████████| 3629/3629 [00:16<00:00, 225.29it/s]


Epoch 9/10, RMSE on test set: 1.2804121432886597


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 10/10: 100%|██████████| 14513/14513 [01:40<00:00, 143.90it/s]


Epoch 10/10, Avg. Loss: 1.6429


Testing Epoch 10: 100%|██████████| 3629/3629 [00:15<00:00, 230.43it/s]

Epoch 10/10, RMSE on test set: 1.2803697066472184





In [12]:
train_(128,128,10)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 1/10: 100%|██████████| 14513/14513 [01:46<00:00, 136.33it/s]


Epoch 1/10, Avg. Loss: 1.6521


Testing Epoch 1: 100%|██████████| 3629/3629 [00:15<00:00, 231.18it/s]


Epoch 1/10, RMSE on test set: 1.28039627000155


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 2/10: 100%|██████████| 14513/14513 [01:49<00:00, 132.38it/s]


Epoch 2/10, Avg. Loss: 1.6439


Testing Epoch 2: 100%|██████████| 3629/3629 [00:15<00:00, 229.34it/s]


Epoch 2/10, RMSE on test set: 1.2804043953236999


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 3/10: 100%|██████████| 14513/14513 [01:49<00:00, 132.42it/s]


Epoch 3/10, Avg. Loss: 1.6432


Testing Epoch 3: 100%|██████████| 3629/3629 [00:16<00:00, 222.36it/s]


Epoch 3/10, RMSE on test set: 1.2804900084152726


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 4/10: 100%|██████████| 14513/14513 [01:49<00:00, 132.56it/s]


Epoch 4/10, Avg. Loss: 1.6432


Testing Epoch 4: 100%|██████████| 3629/3629 [00:16<00:00, 226.10it/s]


Epoch 4/10, RMSE on test set: 1.2804177635930978


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 5/10: 100%|██████████| 14513/14513 [01:50<00:00, 131.71it/s]


Epoch 5/10, Avg. Loss: 1.6432


Testing Epoch 5: 100%|██████████| 3629/3629 [00:15<00:00, 232.14it/s]


Epoch 5/10, RMSE on test set: 1.2806646852732335


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 6/10: 100%|██████████| 14513/14513 [01:50<00:00, 131.24it/s]


Epoch 6/10, Avg. Loss: 1.6431


Testing Epoch 6: 100%|██████████| 3629/3629 [00:15<00:00, 234.75it/s]


Epoch 6/10, RMSE on test set: 1.2804060386160356


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
Epoch 7/10: 100%|██████████| 14513/14513 [01:50<00:00, 131.78it/s]


Epoch 7/10, Avg. Loss: 1.6431


Testing Epoch 7:  40%|███▉      | 1439/3629 [00:06<00:09, 225.60it/s]


KeyboardInterrupt: ignored

In [None]:
train_(256,256,10)