In [None]:
import sys
# Put the project directory first on the import path so the local `config`
# module imported below can be found from this notebook.
# NOTE(review): this is an absolute, machine-specific path — adjust per environment.
sys.path.insert(0, "/Users/jisoo/projects/thesis/carte_test")

# RAW / PROCESSED are project-local objects; the cells below read input/output
# path attributes from them (e.g. RAW.TMDB_MOVIES_JSONL).
from config import RAW, PROCESSED

In [None]:
import json

# Peek at the first record of the TMDB JSONL file to inspect its structure.
with RAW.TMDB_MOVIES_JSONL.open("r", encoding="utf-8") as f:
    # Only the first line is read; next() raises StopIteration on an empty file.
    line = next(f).strip()
    obj = json.loads(line)

# Indent the output, keep non-ASCII text (e.g. Korean) readable instead of
# \u-escaped, and sort the keys (optional).
pretty = json.dumps(obj, indent=2, ensure_ascii=False, sort_keys=True)
print(pretty)

In [None]:
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple

import pandas as pd
from tqdm.auto import tqdm


# =========================================================
# 공통 유틸
# =========================================================
def safe_str(v: Any) -> Optional[str]:
    """
    - Purpose
      - Normalize a value of any type into a stripped, non-empty string.
    - Rules
      - None -> None
      - str values are stripped; any other value is converted with str() first
      - An empty result after stripping ("" or whitespace only) -> None
      - Otherwise the stripped string is returned
    """
    if v is None:
        return None
    text = v if isinstance(v, str) else str(v)
    stripped = text.strip()
    # An empty string is falsy, so `or` maps it to None.
    return stripped or None


def parse_release_year(release_date: Any) -> Optional[int]:
    """
    - Purpose
      - Parse the year (YYYY) out of a TMDB release_date string ("YYYY-MM-DD").
    - Args
      - release_date: expected to be a string starting with a 4-digit year;
        any other type is treated as missing.
    - Returns
      - The year as an int, or None when the value is not a string or does not
        start with exactly four ASCII digits.
    - Note
      - The prefix is validated before int() is called, because int() also
        accepts surrounding whitespace, a sign, underscores and non-ASCII
        digits (" 199", "+123", "1_00"), which would yield bogus years for
        malformed dates.
    """
    if not isinstance(release_date, str):
        return None
    year_text = release_date[:4]
    if len(year_text) != 4 or not (year_text.isascii() and year_text.isdigit()):
        return None
    return int(year_text)


def iter_tmdb_jsonl(jsonl_path: Path) -> Iterable[Dict[str, Any]]:
    """
    - Purpose
      - Stream the merged TMDB jsonl file one line at a time.
    - jsonl line structure (example)
      {"tmdb_id": int, "fetched_at_utc": "...", "data": {...}}
    - Args
      - jsonl_path: path of the UTF-8 encoded jsonl file.
    - Yields
      - One dict per line that parses to a JSON object.
    - Skipped lines (best-effort)
      - blank lines
      - lines that fail to parse as JSON
      - lines that parse to something other than a JSON object (list, string,
        number, ...), so consumers can rely on the declared Dict type
    """
    with jsonl_path.open("r", encoding="utf-8") as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue
            # Only the parse itself is guarded; the yield stays outside the try
            # so exceptions thrown into the generator are never swallowed.
            try:
                record = json.loads(text)
            except (ValueError, RecursionError):
                # json.JSONDecodeError is a ValueError; RecursionError covers
                # pathologically nested input. Either way the line is skipped.
                continue
            if not isinstance(record, dict):
                continue
            yield record


def take_top_unique(values: Sequence[Any], k: int) -> List[str]:
    """
    - Purpose
      - Pick the first k distinct strings from the candidates, keeping order.
    - Processing
      - Each candidate is normalized with safe_str (None / blank are dropped)
      - Duplicates are removed, keeping the first occurrence
      - At most k items are returned
    - Args
      - values: candidate values in priority order.
      - k: maximum number of items to return; k <= 0 yields an empty list.
    - Returns
      - A new list with at most k unique, stripped strings.
    """
    out: List[str] = []
    # Guard first: the length check inside the loop only runs after an append,
    # so without this a non-positive k would still return one element.
    if k <= 0:
        return out

    seen: Set[str] = set()
    for v in values:
        s = safe_str(v)
        if s is None or s in seen:
            continue
        seen.add(s)
        out.append(s)
        if len(out) >= k:
            break
    return out


def pad_to_k(values: List[str], k: int) -> List[Optional[str]]:
    """
    - Purpose
      - Force a list to a fixed length k.
    - Rules
      - Shorter than k -> padded with None at the end
      - Longer than k -> truncated to the first k items
      - k <= 0 -> empty list (a negative k is clamped to 0 so it is never
        interpreted as a "drop from the end" slice)
    - Returns
      - A new list of length max(k, 0); the input list is not modified.
    """
    width = max(k, 0)
    fixed: List[Optional[str]] = list(values[:width])
    fixed.extend([None] * (width - len(fixed)))
    return fixed


# =========================================================
# MovieLens: tmdbId -> movieId 매핑
# =========================================================
def build_tmdb_to_movielens_movie_map(
    movielens_movies_csv: Path,
    movielens_links_csv: Path,
) -> Tuple[Set[int], Dict[int, int]]:
    """
    - Purpose
      - Build the tmdbId -> movieId mapping from MovieLens movies.csv and
        links.csv.
    - Processing
      - movies.csv (movieId) is left-joined with links.csv (movieId, tmdbId)
      - tmdbId is coerced to a nullable integer; rows without a usable tmdbId
        are dropped
      - When several movieIds share a tmdbId, the later row wins
    - Returns
      - needed_tmdb_ids: set of tmdb_ids to pull out of the TMDB jsonl
      - tmdb_to_movie: dict mapping tmdb_id -> movieId
    """
    movies = pd.read_csv(movielens_movies_csv, encoding="utf-8", usecols=["movieId"])
    links = pd.read_csv(movielens_links_csv, encoding="utf-8", usecols=["movieId", "tmdbId"])

    joined = movies.merge(links, on="movieId", how="left")
    joined["tmdbId"] = pd.to_numeric(joined["tmdbId"], errors="coerce").astype("Int64")

    # Only movies that actually have a TMDB link take part in the mapping.
    linked = joined.dropna(subset=["tmdbId"])

    tmdb_to_movie: Dict[int, int] = {
        int(tmdb_id): int(movie_id)
        for tmdb_id, movie_id in zip(linked["tmdbId"].tolist(), linked["movieId"].tolist())
    }
    # Every linked tmdbId is a key of the mapping, so the key set is the id set.
    needed_tmdb_ids: Set[int] = set(tmdb_to_movie)

    return needed_tmdb_ids, tmdb_to_movie


# =========================================================
# TMDB payload에서 필요한 feature만 "평탄화" 추출
# =========================================================
@dataclass(frozen=True)
class ExtractConfig:
    """
    - Purpose
      - Keep the rules for cutting multi-valued fields down to top-k in one place.
    - Notes
      - Immutable (frozen dataclass).
      - Each top_* value is the k passed to take_top_unique / pad_to_k for the
        corresponding field.
    """
    # Number of production companies to keep.
    top_company: int = 2
    # Number of production countries to keep.
    top_country: int = 2
    # Number of spoken languages to keep.
    top_language: int = 2
    # Number of cast members to keep.
    top_actor: int = 3
    # Number of directors to keep.
    top_director: int = 1
    # Number of writers to keep.
    top_writer: int = 1
    # Number of genres to keep.
    top_genre: int = 3

    # Priority order of crew jobs that count as "writer".
    # - Purpose: take Writer first when available, then Screenplay, then Story.
    writer_job_priority: Tuple[str, ...] = ("Writer", "Screenplay", "Story")

    # Crew job name that identifies a director.
    director_job: str = "Director"


def extract_people_from_credits(
    tmdb_data: Dict[str, Any],
    cfg: ExtractConfig,
) -> Tuple[List[str], List[str], List[str]]:
    """
    - Purpose
      - Extract actor / director / writer names from tmdb_data["credits"].
    - Args
      - tmdb_data: TMDB movie payload; "credits" is expected to be a dict with
        "cast" and "crew" lists of dicts. Anything else is treated as empty.
      - cfg: top-k sizes, writer job priority and director job name.
    - Returns
      - (actors, directors, writers): each a list of at most top-k unique
        names (strings). The lists are NOT padded.
    - Rules
      - actors: cast entries with an id and a non-blank name, ordered by
        ascending "order"; entries without a usable numeric order go last
      - directors: crew entries whose job equals cfg.director_job
      - writers: crew entries whose job is in cfg.writer_job_priority, ordered
        by that priority (original crew order is kept within the same job)
    """
    credits = tmdb_data.get("credits")
    if not isinstance(credits, dict):
        return ([], [], [])

    def dict_entries(key: str) -> List[Dict[str, Any]]:
        # Only list-like containers of dicts are usable; a malformed value
        # (scalar, dict, ...) is treated as "no entries" instead of crashing.
        raw = credits.get(key)
        if not isinstance(raw, (list, tuple)):
            return []
        return [entry for entry in raw if isinstance(entry, dict)]

    def cast_order_key(member: Dict[str, Any]) -> Tuple[int, Any]:
        # Explicit null, missing and non-numeric orders all share one sort key.
        # Comparing a raw None against a number would raise TypeError when
        # such entries are mixed.
        order = member.get("order")
        if isinstance(order, (int, float)):
            return (0, order)
        return (1, 0)

    # -----------------------------------------------------
    # actors: top-k by ascending cast.order
    # -----------------------------------------------------
    cast_valid = [
        member
        for member in dict_entries("cast")
        if member.get("id") is not None and safe_str(member.get("name")) is not None
    ]
    cast_valid.sort(key=cast_order_key)  # stable: ties keep payload order
    actors = take_top_unique([member.get("name") for member in cast_valid], cfg.top_actor)

    crew = dict_entries("crew")

    # -----------------------------------------------------
    # directors: crew.job == director_job, top-k (usually 1)
    # -----------------------------------------------------
    director_names: List[str] = []
    for member in crew:
        if safe_str(member.get("job")) != cfg.director_job:
            continue
        name = safe_str(member.get("name"))
        if name is not None:
            director_names.append(name)
    directors = take_top_unique(director_names, cfg.top_director)

    # -----------------------------------------------------
    # writers: top-k, higher-priority jobs of writer_job_priority first
    # -----------------------------------------------------
    job_rank = {job: rank for rank, job in enumerate(cfg.writer_job_priority)}
    writer_candidates: List[Tuple[int, str]] = []
    for member in crew:
        job = safe_str(member.get("job"))
        if job is None or job not in job_rank:
            continue
        name = safe_str(member.get("name"))
        if name is not None:
            writer_candidates.append((job_rank[job], name))

    # Sort on rank only so that names never influence the ordering.
    writer_candidates.sort(key=lambda candidate: candidate[0])
    writers = take_top_unique([name for _, name in writer_candidates], cfg.top_writer)

    return (actors, directors, writers)


def extract_flat_catalog_row(
    movie_id: int,
    tmdb_id: int,
    tmdb_data: Dict[str, Any],
    cfg: ExtractConfig,
) -> Dict[str, Any]:
    """
    - Purpose
      - Flatten the nested TMDB payload into a single row holding only the
        features used as graphlet input.
    - Args
      - movie_id: MovieLens movieId stored in the "movieId" column.
      - tmdb_id: TMDB id stored in the "tmdbId" column.
      - tmdb_data: TMDB movie payload (the "data" object of a jsonl record).
      - cfg: top-k sizes; they decide how many numbered columns each
        multi-valued field gets.
    - Returns
      - Dict with movieId, tmdbId, release_year, original_title, tagline,
        overview, followed by numbered columns "<prefix>_1" .. "<prefix>_k"
        for companies, countries, languages, actors, director, writer and
        genres (missing slots are None).
    - Notes
      - Text columns may become fastText embedding targets.
      - Numbered columns are generated from cfg instead of being hard-coded.
        With the default config the keys and their order are unchanged; a
        smaller k no longer raises IndexError and a larger k is no longer
        silently truncated.
    """

    def names_of(field: str, *name_keys: str) -> List[Any]:
        # Collect one name per dict entry of a multi-valued field: the first
        # truthy key wins, otherwise the value of the last key is used.
        collected: List[Any] = []
        for entry in (tmdb_data.get(field) or []):
            if not isinstance(entry, dict):
                continue
            picked = None
            for key in name_keys:
                picked = entry.get(key)
                if picked:
                    break
            collected.append(picked)
        return collected

    def numbered(prefix: str, top_values: List[str], k: int) -> Dict[str, Optional[str]]:
        # Pad to exactly k slots and label them "<prefix>_1" .. "<prefix>_k".
        padded = pad_to_k(top_values, k)
        return {f"{prefix}_{i}": value for i, value in enumerate(padded, start=1)}

    # -----------------------------------------------------
    # Numeric
    # -----------------------------------------------------
    release_year = parse_release_year(safe_str(tmdb_data.get("release_date")))

    row: Dict[str, Any] = {
        "movieId": movie_id,
        "tmdbId": tmdb_id,

        # Numeric
        "release_year": release_year,

        # Text: single-valued (fastText)
        "original_title": safe_str(tmdb_data.get("original_title")),

        # Sentence-like text kept in the catalog (can be excluded/limited in
        # post-processing)
        "tagline": safe_str(tmdb_data.get("tagline")),
        "overview": safe_str(tmdb_data.get("overview")),
    }

    # -----------------------------------------------------
    # Text: multi-valued -> flattened top-k (fastText)
    # -----------------------------------------------------
    companies = names_of("production_companies", "name")
    # Country names rather than codes: more natural for fastText.
    countries = names_of("production_countries", "name")
    # english_name is often cleaner; fall back to name when it is missing.
    languages = names_of("spoken_languages", "english_name", "name")
    genres = names_of("genres", "name")

    # -----------------------------------------------------
    # credits: actors / director / writer (fastText)
    # -----------------------------------------------------
    actors, directors, writers = extract_people_from_credits(tmdb_data, cfg)

    # Insertion order below defines the column order of the catalog.
    row.update(numbered("produced_by_company", take_top_unique(companies, cfg.top_company), cfg.top_company))
    row.update(numbered("produced_in_country", take_top_unique(countries, cfg.top_country), cfg.top_country))
    row.update(numbered("spoken_language", take_top_unique(languages, cfg.top_language), cfg.top_language))
    row.update(numbered("actor", actors, cfg.top_actor))
    row.update(numbered("director", directors, cfg.top_director))
    row.update(numbered("writer", writers, cfg.top_writer))
    row.update(numbered("genre", take_top_unique(genres, cfg.top_genre), cfg.top_genre))
    return row


# =========================================================
# 빌드: "단일 카탈로그 parquet 1개"만 생성
# =========================================================
def build_movie_catalog_flat(
    movielens_movies_csv: Path,
    movielens_links_csv: Path,
    tmdb_merged_jsonl: Path,
    output_parquet: Path,
    cfg: Optional[ExtractConfig] = None,
    preview_n: int = 5,
) -> pd.DataFrame:
    """
    - Purpose
      - Build the single flat movie catalog and save it as one parquet file.
    - Output (1 file)
      - output_parquet (movie_catalog_flat.parquet): one row per movieId with
        the features flattened so they can be used directly as graphlet input
    - Args
      - movielens_movies_csv / movielens_links_csv: MovieLens inputs used to
        map tmdbId -> movieId.
      - tmdb_merged_jsonl: merged TMDB jsonl ({"tmdb_id", "data", ...} per line).
      - output_parquet: destination path; parent directories are created.
      - cfg: extraction config; ExtractConfig() defaults when None.
      - preview_n: number of rows printed as a preview.
    - Returns
      - The catalog DataFrame that was written.
    - Notes
      - When a movieId appears more than once, the last record in the jsonl
        is kept.
      - When no record matches, an empty catalog with the regular columns is
        written instead of failing on the missing "movieId" column.
    """
    if cfg is None:
        cfg = ExtractConfig()

    needed_tmdb_ids, tmdb_to_movie = build_tmdb_to_movielens_movie_map(
        movielens_movies_csv=movielens_movies_csv,
        movielens_links_csv=movielens_links_csv,
    )

    rows: List[Dict[str, Any]] = []

    for obj in tqdm(iter_tmdb_jsonl(tmdb_merged_jsonl), desc="Build movie catalog(flat)"):
        tmdb_id = obj.get("tmdb_id")
        # Skip records whose id is missing/malformed or not linked to MovieLens.
        if not isinstance(tmdb_id, int) or tmdb_id not in needed_tmdb_ids:
            continue

        tmdb_data = obj.get("data")
        if not isinstance(tmdb_data, dict) or not tmdb_data:
            continue

        movie_id = tmdb_to_movie.get(tmdb_id)
        if movie_id is None:
            continue

        rows.append(extract_flat_catalog_row(movie_id, tmdb_id, tmdb_data, cfg))

    if rows:
        df = pd.DataFrame(rows).drop_duplicates(subset=["movieId"], keep="last")
    else:
        # pd.DataFrame([]) has no columns, so drop_duplicates(subset=["movieId"])
        # would raise KeyError. Take the column layout from a row extracted
        # from an empty payload so the empty catalog keeps the usual schema.
        template_columns = list(extract_flat_catalog_row(0, 0, {}, cfg))
        df = pd.DataFrame(columns=template_columns)

    # Normalize release_year to a nullable integer (<NA> when missing).
    if "release_year" in df.columns:
        df["release_year"] = pd.to_numeric(df["release_year"], errors="coerce").astype("Int64")

    # Save
    output_parquet.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_parquet, index=False)

    print(f"[OK] Saved: {output_parquet} rows={len(df):,} cols={len(df.columns):,}")
    if len(df):
        print("\n[PREVIEW]")
        print(df.head(preview_n).to_string(index=False))

    return df


# =============================================================================
# Run
# =============================================================================
# Build the flat movie catalog from the project-configured input paths and write
# it to PROCESSED.MOVIE_CATALOG_PARQUET. The ExtractConfig values are spelled out
# explicitly here; they are the same as the dataclass defaults.
build_movie_catalog_flat(
    movielens_movies_csv=RAW.MOVIES_CSV,
    movielens_links_csv=RAW.LINKS_CSV,
    tmdb_merged_jsonl=RAW.TMDB_MOVIES_JSONL,
    output_parquet=PROCESSED.MOVIE_CATALOG_PARQUET,
    cfg=ExtractConfig(
        top_company=2,
        top_country=2,
        top_language=2,
        top_actor=3,
        top_director=1,
        top_writer=1,
        top_genre=3,
        writer_job_priority=("Writer", "Screenplay", "Story"),
        director_job="Director",
    ),
    preview_n=5,
)