# Movie & Game Embedding 결합 및 Latent Transformation
1. 영화 & 게임 임베딩(.npy) 불러오기
2. TruncatedSVD로 차원 축소
3. MLP를 이용한 latent embedding 생성 (128차원)
4. 사용자 latent embedding placeholder 생성
5. 영화 + 게임 + 사용자 embedding 결합
6. 추천 시스템에서 바로 사용할 수 있는 구조 생성

* 현재 user latent는 랜덤 placeholder → 실제 NCF 학습 후 교체 필요

* embedding 수는 min_len 기준으로 맞춤 → 전체 영화/게임 수와 다를 수 있음

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import torch
import torch.nn as nn

In [15]:
# ======================================================
# 1. 영화 & 게임 임베딩 불러오기
# ======================================================
movie_path = '/content/drive/MyDrive/2025Bigdata/3/hybrid_movie_embeddings.npy'
game_path  = '/content/drive/MyDrive/2025Bigdata/3/hybrid_game_embeddings_BERT_final.npy'
movies_csv = '/content/drive/MyDrive/2025Bigdata/3/movies.csv'
games_csv  = '/content/drive/MyDrive/2025Bigdata/3/cleaned_games.csv'

movie_embeddings = np.load(movie_path)
game_embeddings  = np.load(game_path)

movie_df = pd.read_csv(movies_csv)
game_df  = pd.read_csv(games_csv)

movie_titles = movie_df['title'].tolist()
game_titles  = game_df['title'].tolist()

# 길이 맞추기
min_len = min(len(movie_embeddings), len(game_embeddings))
movie_embeddings = movie_embeddings[:min_len]
game_embeddings  = game_embeddings[:min_len]

# ======================================================
# 2. 차원 축소 (TruncatedSVD)
# ======================================================
target_dim = 512
svd_movie = TruncatedSVD(n_components=target_dim, random_state=42)
svd_game  = TruncatedSVD(n_components=target_dim, random_state=42)

movie_reduced = svd_movie.fit_transform(movie_embeddings)
game_reduced  = svd_game.fit_transform(game_embeddings)

# ======================================================
# 3. MLP latent embedding 생성
# ======================================================
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class SimpleMLP(nn.Module):
    def __init__(self, input_dim, latent_dim=128):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, latent_dim)
        )
    def forward(self, x):
        return self.model(x)

mlp = SimpleMLP(input_dim=target_dim, latent_dim=128).to(DEVICE)

# 사용자 전용 MLP(32차원)
user_mlp = SimpleMLP(input_dim=user_latent_raw.shape[1], latent_dim=128).to(DEVICE)

mlp.eval()
user_mlp.eval()

# 사용자 임베딩 불러오기
user_latent_raw = np.load('/content/drive/MyDrive/2025Bigdata/3/user_latent.npy')

if user_latent_raw.shape[1] < target_dim:
  user_reduced = user_latent_raw
else:
  svd_user = TruncatedSVD(n_components=target_dim, random_state=42)
  user_reduced = svd_user.fit_transform(user_latent_raw)

with torch.no_grad():
  movie_tensor = torch.tensor(movie_reduced, dtype=torch.float32).to(DEVICE)
  game_tensor  = torch.tensor(game_reduced, dtype=torch.float32).to(DEVICE)
  user_tensor = torch.tensor(user_reduced, dtype=torch.float32).to(DEVICE)

  movie_latent = mlp(movie_tensor).cpu().numpy()
  game_latent  = mlp(game_tensor).cpu().numpy()
  user_latent = user_mlp(user_tensor).cpu().numpy()

# 유사도 정확도를 위해 정규화
movie_latent = normalize(movie_latent)
game_latent = normalize(game_latent)
user_latent = normalize(user_latent)

movie_ids = movie_df['movieId'].values
movie_to_index = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

ratings_df = pd.read_csv('/content/drive/MyDrive/2025Bigdata/3/ratings.csv')
ratings_df['movie_index'] = ratings_df['movieId'].map(movie_to_index)

# ======================================================
# 4. 전체 embedding 결합
# ======================================================
latent_embeddings = np.vstack([movie_latent, game_latent])
final_embeddings = np.vstack([latent_embeddings, user_latent])

# item types & titles mapping
item_types = ['movie']*movie_latent.shape[0] + ['game']*game_latent.shape[0] + ['user']*user_latent.shape[0]
item_titles = movie_titles[:min_len] + game_titles[:min_len] + [f"user_{i}" for i in range(user_latent.shape[0])]

print("Final embeddings shape:", final_embeddings.shape)
print("Item types length:", len(item_types))
print("Item titles length:", len(item_titles))

# ======================================================
# 5. 추천 함수 정의
# ======================================================
def recommend_games(user_id, ratings_df, movie_latent, game_latent, game_titles, min_rating=4.0, top_k=3):
  # 평점 min_rating(=4.0) 이상만 필터링
  user_ratings = ratings_df[ratings_df['userId'] == user_id]
  liked_movies = user_ratings[user_ratings['rating'] >= min_rating]

  if liked_movies.empty:
    print(f"사용자 {user_id}는 평점 {min_rating} 이상을 준 영화가 없습니다.")
    return []

  # 영화 인덱스 추출 (movie_latent와 매핑되는 인덱스)
  liked_indices = liked_movies['movie_index'].values

  # 영화 임베딩 평균 -> 사용자 영화 취향 벡터
  liked_embeddings = movie_latent[liked_indices]
  avg_movie_vec = liked_embeddings.mean(axis=0).reshape(1, -1)

  # 모든 게임과 유사도 계산
  sims = cosine_similarity(avg_movie_vec, game_latent).flatten()
  top_indices = sims.argsort()[::-1][:top_k]

  # 추천 게임 리스트 반환
  results = []
  for idx in top_indices:
    results.append({
        'title': game_titles[idx],
        'score': sims[idx]
    })
  return results

# ======================================================
# 6. 테스트 실행
# ======================================================
print("사용자 5번이 좋아할 게임 3개 추천")
recs = recommend_games(
    user_id = 5,
    ratings_df = ratings_df,
    movie_latent = movie_latent,
    game_latent = game_latent,
    game_titles = game_titles,
    min_rating = 4.0,
    top_k=3)

for r in recs:
  print(f"[{r['title']} (score: {r['score']:.4f})")

Final embeddings shape: (10610, 128)
Item types length: 10610
Item titles length: 10610
사용자 5번이 좋아할 게임 3개 추천
[Motor Mechanic (score: 0.2528)
[Rocksmith 2014 Edition - Remastered (score: 0.2396)
[Predator: Hunting Grounds (score: 0.2366)
