First, set up a Python virtual environment and install the dependencies with:
```
uv sync
```
Then select `.venv` as the notebook kernel.

# 0. Trailers Inspection
Before we do anything else, let's inspect our trailers for metadata relevant to our presentation. We will:
- Print each movie's title and the corresponding YouTube ID
- Print the genres each movie belongs to

In [1]:
import pandas as pd
import json
import os
from pathlib import Path

# 1. Get all downloaded trailer files
# The trailers folder is resolved relative to the working directory, like the
# other data paths in this cell ('data/metadata.json',
# 'data/sampled_trailers.csv'), so the notebook is not tied to one machine's
# home directory.
trailers_dir = Path('data/trailers')
trailer_files = list(trailers_dir.glob('*.mp4'))
# The file stem (file name without '.mp4') is treated as the YouTube ID.
youtube_ids = [f.stem for f in trailer_files]

print(f"Found {len(trailer_files)} downloaded trailers:")
for youtube_id in youtube_ids:
    print(f"  - {youtube_id}.mp4")

# 2. Load metadata and sampled trailers data
print("\nLoading metadata...")
# Pass the encoding explicitly so that parsing the JSON file does not depend
# on the platform's default locale encoding.
with open('data/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Table of sampled trailers; later steps read its 'mid' column and the
# per-genre columns.
sampled_df = pd.read_csv('data/sampled_trailers.csv')

# 3. Build the reverse lookup: YouTube ID -> database ID
print("Creating mappings...")

# Only the first trailer listed for each database entry is indexed; entries
# lacking the expected keys or with an empty trailer list are skipped.
youtube_to_db_id = {}
for db_id, entry in metadata['trailers12k'].items():
    try:
        yt_id = entry['youtube']['trailers'][0]['id']
    except (KeyError, IndexError):
        continue
    youtube_to_db_id[yt_id] = db_id

# 4. Extract information for each downloaded trailer
# Genre columns read from sampled_df; a value of 1 marks the genre as present.
genre_columns = ['action', 'adventure', 'comedy', 'crime', 'drama',
                 'fantasy', 'horror', 'romance', 'sci-fi', 'thriller']
trailer_info = []

for youtube_id in youtube_ids:
    # Guard: a trailer whose YouTube ID is not in the mapping is reported
    # and skipped.
    if youtube_id not in youtube_to_db_id:
        print(f"Warning: YouTube ID {youtube_id} not found in metadata")
        continue

    db_id = youtube_to_db_id[youtube_id]
    movie_data = metadata['trailers12k'][db_id]
    imdb_record = movie_data['imdb']

    # The sampled data is matched on 'mid' using the IMDB ID with a 'tt' prefix.
    imdb_id_full = f"tt{imdb_record['id']}"
    movie_title = imdb_record['title']

    # Guard: a movie with no row in the sampled data is reported and skipped.
    movie_row = sampled_df[sampled_df['mid'] == imdb_id_full]
    if movie_row.empty:
        print(f"Warning: Movie {imdb_id_full} not found in sampled data")
        continue

    # Only the first matching row is consulted for the genre flags.
    movie_genres = [
        genre for genre in genre_columns
        if movie_row[genre].iloc[0] == 1
    ]

    trailer_info.append({
        'youtube_id': youtube_id,
        'imdb_id': imdb_id_full,
        'movie_title': movie_title,
        'genres': movie_genres,
        'genre_string': ', '.join(movie_genres),
        'file_path': f'data/trailers/{youtube_id}.mp4',
    })

# 5. Display results
# Print a banner, then one numbered record per extracted trailer.
banner = '=' * 80
print(f"\n{banner}")
print("DOWNLOADED TRAILERS ANALYSIS")
print(banner)

# (label shown to the reader, key in the trailer_info record)
detail_fields = (
    ('YouTube ID', 'youtube_id'),
    ('IMDB ID', 'imdb_id'),
    ('Genres', 'genre_string'),
    ('File', 'file_path'),
)

for position, entry in enumerate(trailer_info, start=1):
    print(f"\n{position}. {entry['movie_title']}")
    for label, key in detail_fields:
        print(f"   {label}: {entry[key]}")

# 6. Create a summary DataFrame
# With no extracted trailers there is nothing to tabulate or save.
if not trailer_info:
    print("\nNo trailer information could be extracted.")
else:
    summary_df = pd.DataFrame(trailer_info)
    summary_path = 'data/downloaded_trailers_summary.csv'
    table_columns = ['movie_title', 'youtube_id', 'imdb_id', 'genre_string']

    separator = '=' * 80
    print(f"\n{separator}")
    print("SUMMARY TABLE")
    print(separator)
    # Only the human-readable columns are printed; the CSV keeps every column.
    print(summary_df[table_columns].to_string(index=False))

    # Save to CSV for easy reference
    summary_df.to_csv(summary_path, index=False)
    print(f"\nSummary saved to: {summary_path}")

# 7. Genre distribution of downloaded trailers
if trailer_info:
    rule = '=' * 80
    print(f"\n{rule}")
    print("GENRE DISTRIBUTION")
    print(rule)

    # Flatten the per-trailer genre lists into one list, then count how often
    # each genre occurs.
    all_genres = [
        genre
        for record in trailer_info
        for genre in record['genres']
    ]

    genre_counts = pd.Series(all_genres).value_counts()
    print(genre_counts.to_string())

Found 24 downloaded trailers:
  - p-pVxwaFuBs.mp4
  - 5-_9AFwMDmQ.mp4
  - oMDXmDOyWE8.mp4
  - Yt7ofokzn04.mp4
  - VA7LT1hym1M.mp4
  - lioWzrpCtGQ.mp4
  - suJhUxvLUDE.mp4
  - hrt7uSLIiyk.mp4
  - Xj8FyGLhLwo.mp4
  - c4vCtn5RMKo.mp4
  - YdTqAevPejU.mp4
  - pBwcksgKcFs.mp4
  - LMkkJbRJdiA.mp4
  - 9scyo1fQXbc.mp4
  - UmkuYy5tbFM.mp4
  - MyiHqXGsD8c.mp4
  - QGerKfRYeq4.mp4
  - o-mGxa-_85M.mp4
  - jtnqVZEktyg.mp4
  - X_2hHR9W8uk.mp4
  - Fu75D5tvxBA.mp4
  - LSrBBhedXVE.mp4
  - MNQiLB2Rdjs.mp4
  - HXI_xAJFi4Q.mp4

Loading metadata...
Creating mappings...

DOWNLOADED TRAILERS ANALYSIS

1. Now You See Me
   YouTube ID: p-pVxwaFuBs
   IMDB ID: tt1670345
   Genres: crime, thriller
   File: data/trailers/p-pVxwaFuBs.mp4

2. Far from the Madding Crowd
   YouTube ID: 5-_9AFwMDmQ
   IMDB ID: tt2935476
   Genres: drama, romance
   File: data/trailers/5-_9AFwMDmQ.mp4

3. 40 Days and 40 Nights
   YouTube ID: oMDXmDOyWE8
   IMDB ID: tt0243736
   Genres: comedy, romance
   File: data/trailers/oMDXmDOyWE8.mp