# Movie Recommender System - Phase 1: Setup & Data Loading


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
import json
import warnings
warnings.filterwarnings('ignore')

# Cáº¥u hÃ¬nh hiá»ƒn thá»‹
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# 1. LOAD DATA


In [2]:
print("\n[1] Loading datasets...")

# Try to load verified files first, fallback to original
try:
    credits = pd.read_csv('tmdb_5000_credits_verified.csv')
    movies = pd.read_csv('tmdb_5000_movies_verified.csv')
    print("âœ“ Loaded verified datasets")
except FileNotFoundError:
    print("âš  Verified files not found, loading original datasets...")
    try:
        credits = pd.read_csv('../data/tmdb_5000_credits.csv')
        movies = pd.read_csv('../data/tmdb_5000_movies.csv')
        print("âœ“ Loaded original datasets")
    except FileNotFoundError:
        print("âœ— ERROR: Dataset files not found!")
        print("Please ensure you have either:")
        print("  - tmdb_5000_credits_verified.csv & tmdb_5000_movies_verified.csv")
        print("  OR")
        print("  - tmdb_5000_credits.csv & tmdb_5000_movies.csv")
        print("\nðŸ‘‰ Run 'data_check_fix.py' script first to verify your data!")
        exit(1)

print(f"Credits shape: {credits.shape}")
print(f"Movies shape: {movies.shape}")

# Hiá»ƒn thá»‹ thÃ´ng tin cÆ¡ báº£n
print("\n[Credits Columns]:", credits.columns.tolist())
print("[Movies Columns]:", movies.columns.tolist())

# Auto-fix common column name variations
column_fixes = [
    ('movie_title', 'title'),
    ('original_title', 'title'),
    ('movie_id', 'id')
]

for old_col, new_col in column_fixes:
    if old_col in credits.columns and new_col not in credits.columns:
        credits.rename(columns={old_col: new_col}, inplace=True)
        print(f"âœ“ Credits: Renamed '{old_col}' â†’ '{new_col}'")

    if old_col in movies.columns and new_col not in movies.columns:
        movies.rename(columns={old_col: new_col}, inplace=True)
        print(f"âœ“ Movies: Renamed '{old_col}' â†’ '{new_col}'")


[1] Loading datasets...
âš  Verified files not found, loading original datasets...
âœ“ Loaded original datasets
Credits shape: (4803, 4)
Movies shape: (4803, 20)

[Credits Columns]: ['movie_id', 'title', 'cast', 'crew']
[Movies Columns]: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count']
âœ“ Credits: Renamed 'movie_id' â†’ 'id'


# 2. DATA MERGING


In [3]:
print("\n[2] Merging datasets...")

# Rename columns Ä‘á»ƒ merge
credits.rename(columns={'movie_id': 'id'}, inplace=True)

# Merge datasets
df = movies.merge(credits, on='id')

print(f"Merged dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")


[2] Merging datasets...
Merged dataset shape: (4803, 23)
Columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average', 'vote_count', 'title_y', 'cast', 'crew']


# 3. INITIAL DATA EXPLORATION


In [4]:
print("\n[3] Initial Data Exploration")
print("-" * 60)

# Kiá»ƒm tra missing values
print("\nMissing Values:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Kiá»ƒm tra duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")

# Basic statistics
print("\nBasic Statistics:")
print(df[['budget', 'revenue', 'runtime', 'vote_average', 'vote_count']].describe())


[3] Initial Data Exploration
------------------------------------------------------------

Missing Values:
homepage        3091
overview           3
release_date       1
runtime            2
tagline          844
dtype: int64

Duplicate rows: 0

Basic Statistics:
             budget       revenue      runtime  vote_average    vote_count
count  4.803000e+03  4.803000e+03  4801.000000   4803.000000   4803.000000
mean   2.904504e+07  8.226064e+07   106.875859      6.092172    690.217989
std    4.072239e+07  1.628571e+08    22.611935      1.194612   1234.585891
min    0.000000e+00  0.000000e+00     0.000000      0.000000      0.000000
25%    7.900000e+05  0.000000e+00    94.000000      5.600000     54.000000
50%    1.500000e+07  1.917000e+07   103.000000      6.200000    235.000000
75%    4.000000e+07  9.291719e+07   118.000000      6.800000    737.000000
max    3.800000e+08  2.787965e+09   338.000000     10.000000  13752.000000


# 4. DATA PREPROCESSING FUNCTIONS


In [5]:
print("\n[4] Creating preprocessing functions...")

def parse_json_column(column):
    """Parse JSON-formatted columns"""
    try:
        data = json.loads(column)
        return [item['name'] for item in data] if isinstance(data, list) else []
    except:
        return []

def extract_director(crew_json):
    """Extract director from crew"""
    try:
        crew = json.loads(crew_json)
        for member in crew:
            if member['job'] == 'Director':
                return member['name']
        return np.nan
    except:
        return np.nan

def extract_top_cast(cast_json, n=5):
    """Extract top N cast members"""
    try:
        cast = json.loads(cast_json)
        return [member['name'] for member in cast[:n]]
    except:
        return []


[4] Creating preprocessing functions...


# 5. FEATURE ENGINEERING


In [6]:
print("\n[5] Feature Engineering...")

# Parse JSON columns
df['genres'] = df['genres'].apply(parse_json_column)
df['keywords'] = df['keywords'].apply(parse_json_column)
df['production_companies'] = df['production_companies'].apply(parse_json_column)
df['production_countries'] = df['production_countries'].apply(parse_json_column)

# Extract cast vÃ  director
df['cast'] = df['cast'].apply(lambda x: extract_top_cast(x, 5))
df['director'] = df['crew'].apply(extract_director)

# Convert release_date to datetime
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year

# Create decade
df['decade'] = (df['release_year'] // 10) * 10

print("Feature engineering completed!")
print(f"New columns: {[col for col in df.columns if col in ['genres', 'keywords', 'cast', 'director', 'release_year', 'decade']]}")


[5] Feature Engineering...
Feature engineering completed!
New columns: ['genres', 'keywords', 'cast', 'director', 'release_year', 'decade']


# 6. DATA CLEANING


In [7]:
print("\n[6] Data Cleaning...")

# Check columns exist before cleaning
print(f"Available columns: {df.columns.tolist()}")

initial_count = len(df)

# Clean data - vá»›i error handling
try:
    # Check vÃ  clean overview
    if 'overview' in df.columns:
        df = df[df['overview'].notna()]
        print(f"Removed movies with missing overview")

    # Check vÃ  clean title (cÃ³ thá»ƒ lÃ  'title', 'original_title', hoáº·c 'movie_title')
    title_col = None
    for col in ['title', 'original_title', 'movie_title']:
        if col in df.columns:
            title_col = col
            break

    if title_col:
        df = df[df[title_col].notna()]
        # Rename to 'title' if different
        if title_col != 'title':
            df.rename(columns={title_col: 'title'}, inplace=True)
        print(f"Removed movies with missing {title_col}")

    # Check vÃ  clean genres
    if 'genres' in df.columns:
        df = df[df['genres'].apply(len) > 0]
        print(f"Removed movies with no genres")

    print(f"Removed {initial_count - len(df)} movies with missing critical info")
    print(f"Final dataset shape: {df.shape}")

except Exception as e:
    print(f"Warning during cleaning: {e}")
    print(f"Continuing with {len(df)} movies")


[6] Data Cleaning...
Available columns: ['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average', 'vote_count', 'title_y', 'cast', 'crew', 'director', 'release_year', 'decade']
Removed movies with missing overview
Removed movies with missing original_title
Removed movies with no genres
Removed 31 movies with missing critical info
Final dataset shape: (4772, 26)


# 7. SAVE PROCESSED DATA


In [8]:
print("\n[7] Saving processed data...")

# Save to CSV
df.to_csv('../data/processed/movies_processed.csv', index=False)
print("Saved: movies_processed.csv")

# Save essential columns for quick loading
essential_cols = ['id', 'title', 'overview', 'genres', 'keywords',
                  'cast', 'director', 'vote_average', 'vote_count',
                  'popularity', 'release_year', 'runtime']
df[essential_cols].to_csv('../data/processed/movies_essential.csv', index=False)
print("Saved: movies_essential.csv")


[7] Saving processed data...
Saved: movies_processed.csv
Saved: movies_essential.csv


In [9]:

# ============================================================================


# ============================================================================
# ============================================================================


print("\n" + "=" * 60)
print("DATA LOADING & PREPROCESSING COMPLETED!")
print("=" * 60)

# ============================================================================
# 8. QUICK DATA SUMMARY
# ============================================================================
print("\n[SUMMARY]")
print(f"Total movies: {len(df)}")
print(f"Date range: {df['release_year'].min():.0f} - {df['release_year'].max():.0f}")
print(f"Unique genres: {len(set([g for genres in df['genres'] for g in genres]))}")
print(f"Average rating: {df['vote_average'].mean():.2f}")
print(f"Average votes: {df['vote_count'].mean():.0f}")

# Display sample
print("\n[SAMPLE DATA]")
print(df[['title', 'genres', 'vote_average', 'release_year']].head(10))


DATA LOADING & PREPROCESSING COMPLETED!

[SUMMARY]
Total movies: 4772
Date range: 1916 - 2017
Unique genres: 20
Average rating: 6.11
Average votes: 695

[SAMPLE DATA]
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   
5                              Spider-Man 3   
6                                   Tangled   
7                   Avengers: Age of Ultron   
8    Harry Potter and the Half-Blood Prince   
9        Batman v Superman: Dawn of Justice   

                                          genres  vote_average  release_year  
0  [Action, Adventure, Fantasy, Science Fiction]           7.2        2009.0  
1                   [Adventure, Fantasy, Action]           6.9        2007.0  
2                     [Action, Adventure, Crime]           6.3        2015.0 