In [1]:
#Movie Recommender

In [2]:
#Library Imports
import os
import sys
import ssl
import subprocess

# Make sure NLTK is importable; if it is missing, install it into the
# interpreter that is running this notebook and import it afterwards.
try:
    import nltk
except ImportError:
    print("📦 Installing NLTK...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk"])
    import nltk
    print("✅ NLTK installed successfully")
else:
    print("✅ NLTK already installed")

# Fetch the tokenizer and stop-word corpora (reported as up-to-date when
# they are already present). The last call's return value is the cell output.
nltk.download('punkt')
nltk.download('stopwords')

✅ NLTK already installed


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/charleygregory/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charleygregory/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import pandas as pd
import numpy as np
import pickle
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, which also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')

# The NLTK corpora are already downloaded in the setup cell; these calls are
# intentionally left disabled.
# nltk.download('punkt')
# nltk.download('stopwords')

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
#Load Data
# Read the two TMDB 5000 CSV exports; paths are relative to the working directory.
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')
# NOTE(review): the recorded run reports credits with shape (3863, 1265) and a
# long tail of 'Unnamed: N' columns, which suggests the credits CSV on disk is
# malformed (rows spilling into extra columns). Verify the file before trusting
# anything derived from 'cast' / 'crew'.

print("Dataset shapes:")
print(f"Movies: {movies.shape}")
print(f"Credits: {credits.shape}")

print("\nMovies dataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
movies.info()

print("\nFirst few rows of movies dataset:")
print(movies.head())

Dataset shapes:
Movies: (4803, 20)
Credits: (3863, 1265)

Movies dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float6

In [5]:
#Data Exploration 
# Column names of both raw frames.
for heading, frame in (("Movies dataset columns:", movies),
                       ("\nCredits dataset columns:", credits)):
    print(heading)
    print(frame.columns.tolist())

# Summary statistics of the numeric movie columns.
print("\nBasic statistics:")
print(movies.describe())

# Per-column null counts of both raw frames.
for heading, frame in (("\nMissing values in movies:", movies),
                       ("\nMissing values in credits:", credits)):
    print(heading)
    print(frame.isnull().sum())

Movies dataset columns:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count']

Credits dataset columns:
['movie_id', 'title', 'cast', 'crew', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnam

In [6]:
#Merge Datasets
# Join the credits onto the movie metadata through the shared 'title' column
# (pandas' default inner join: only titles present in both frames survive).
movies = pd.merge(movies, credits, on='title')
merged_columns = movies.columns.tolist()
print(f"Merged dataset shape: {movies.shape}")
print("\nColumns after merging:")
print(merged_columns)

Merged dataset shape: (1492, 1284)

Columns after merging:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'movie_id', 'cast', 'crew', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unname

In [7]:
#Data Cleaning and Preprocessing 
# Keep only relevant columns for recommendation and drop rows with missing
# values. Chaining produces a fresh frame instead of mutating a column slice
# in place (which is what triggers pandas' SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].dropna()
print(f"Dataset shape after removing null values: {movies.shape}")

# Check for duplicates
print(f"Duplicate rows: {movies.duplicated().sum()}")

# Remove duplicates if any, then renumber the rows 0..n-1.
# Later cells read a row *label* via `.index[0]` and use it as a *position*
# (row of the similarity matrix, argument to `.iloc`). Once rows have been
# dropped, labels and positions only coincide after the index is reset.
movies = movies.drop_duplicates().reset_index(drop=True)
print(f"Final dataset shape: {movies.shape}")

print("\nSample data:")
print(movies.head())

Dataset shape after removing null values: (1489, 7)
Duplicate rows: 0
Final dataset shape: (1489, 7)

Sample data:
  movie_id                                     title  \
0    19995                                    Avatar   
1      285  Pirates of the Caribbean: At World's End   
2   206647                                   Spectre   
3    49026                     The Dark Knight Rises   
4    49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  [{"id": 28, "name": "Action"}, {"id": 12

In [8]:
#Feature Extraction
# Exceptions meaning "this cell is not a usable list of records".
# ast.literal_eval is documented to raise ValueError, TypeError, SyntaxError,
# MemoryError and RecursionError on malformed input; KeyError covers a record
# that lacks the requested key. Anything else (e.g. KeyboardInterrupt) is
# deliberately allowed to propagate instead of being swallowed.
_PARSE_ERRORS = (ValueError, TypeError, SyntaxError,
                 MemoryError, RecursionError, KeyError)


def _extract_names(text, keep=None, limit=None):
    """Parse a Python-literal list of dicts and collect their 'name' values.

    Args:
        text: String holding a literal such as '[{"id": 1, "name": "X"}]'.
        keep: Optional predicate applied to each record; only records for
            which it returns a truthy value contribute a name.
        limit: Optional maximum number of names to collect.

    Returns:
        list: The collected names, or [] if the text cannot be parsed or any
        inspected record is malformed (all-or-nothing, as before).
    """
    try:
        names = []
        for record in ast.literal_eval(text):
            if limit is not None and len(names) >= limit:
                break
            if keep is None or keep(record):
                names.append(record['name'])
    except _PARSE_ERRORS:
        return []
    return names


def convert_genres(text):
    """Extract genre names from JSON-like string"""
    return _extract_names(text)


def convert_keywords(text):
    """Extract keyword names from JSON-like string"""
    return _extract_names(text)


def convert_cast(text, top_n=3):
    """Extract the first `top_n` cast member names (default: top 3)"""
    return _extract_names(text, limit=top_n)


def fetch_director(text):
    """Extract the name of every crew entry whose job is 'Director'"""
    return _extract_names(text, keep=lambda member: member['job'] == 'Director')


print("Feature extraction functions defined!")

Feature extraction functions defined!


In [9]:
#Feature Extraction
print("Applying feature extraction...")

# Column -> parser. Each raw JSON-like string column is replaced by the list
# of names its parser extracts.
extractors = {
    'genres': convert_genres,
    'keywords': convert_keywords,
    'cast': convert_cast,
    'crew': fetch_director,
}
for column, extractor in extractors.items():
    movies[column] = movies[column].apply(extractor)

print("Feature extraction completed!")
print("\nSample extracted features:")
print(movies[['title', *extractors]].head())

Applying feature extraction...
Feature extraction completed!

Sample extracted features:
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                          genres  \
0  [Action, Adventure, Fantasy, Science Fiction]   
1                   [Adventure, Fantasy, Action]   
2                     [Action, Adventure, Crime]   
3               [Action, Crime, Drama, Thriller]   
4           [Action, Adventure, Science Fiction]   

                                            keywords  \
0  [culture clash, future, space war, space colon...   
1  [ocean, drug abuse, exotic island, east india ...   
2  [spy, based on novel, secret agent, sequel, mi...   
3  [dc comics, crime fighter, terrorist, secret i...   
4  [based on novel, mars, medallion

In [10]:
#Text Preprocessing
def collapse_list_of_words(L):
    """Convert list of words to space-separated string.

    Args:
        L: A list of strings, an already-collapsed string, or anything else
            (e.g. NaN / None).

    Returns:
        str: The items joined by single spaces for a list; the input itself
        for a string; '' for any other type.
    """
    # A string means the value was already collapsed. Returning it unchanged
    # keeps the cell idempotent: previously, re-running the cell turned every
    # feature column into empty strings.
    if isinstance(L, str):
        return L
    if isinstance(L, list):
        return ' '.join(L)
    return ''

# Flatten every list-valued feature column into one space-separated string
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(collapse_list_of_words)

# Missing overviews become empty strings
movies['overview'] = movies['overview'].fillna('')

print("Text preprocessing completed!")
print("\nSample processed features:")
preview_columns = ['title', 'genres', 'keywords', 'cast', 'crew', 'overview']
print(movies[preview_columns].head())

Text preprocessing completed!

Sample processed features:
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                     genres  \
0  Action Adventure Fantasy Science Fiction   
1                  Adventure Fantasy Action   
2                    Action Adventure Crime   
3               Action Crime Drama Thriller   
4          Action Adventure Science Fiction   

                                            keywords  \
0  culture clash future space war space colony so...   
1  ocean drug abuse exotic island east india trad...   
2  spy based on novel secret agent sequel mi6 bri...   
3  dc comics crime fighter terrorist secret ident...   
4  based on novel mars medallion space travel pri...   

                                       

In [11]:
#Tags Column
# One free-text "tags" field per movie: overview first, then genres,
# keywords, cast and crew, each separated by a single space.
tag_parts = [movies[column] for column in ('overview', 'genres', 'keywords', 'cast', 'crew')]
combined = tag_parts[0]
for part in tag_parts[1:]:
    combined = combined + ' ' + part
movies['tags'] = combined

print("Tags column created!")
print("\nSample tags:")
# Preview the first three movies, truncating each tag string to 200 characters
for position in range(3):
    sample = movies.iloc[position]
    print(f"\nMovie: {sample['title']}")
    print(f"Tags: {sample['tags'][:200]}...")

Tags column created!

Sample tags:

Movie: Avatar
Tags: In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy...

Movie: Pirates of the Caribbean: At World's End
Tags: Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. Adventure Fantasy Actio...

Movie: Spectre
Tags: A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles political forces to keep the secret service alive, Bond peels back the layers of deceit to r...


In [12]:
#Text Preprocessing
def preprocess_text(text):
    """Normalise a tag string for vectorisation.

    The text is lower-cased and split on whitespace; every token is stripped
    of non-alphanumeric characters, tokens that end up empty are discarded,
    and the remainder are stemmed with a PorterStemmer.

    Args:
        text (str): Raw tag text.

    Returns:
        str: The stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()
    tokens = []

    for raw_token in text.lower().split():
        # Keep letters and digits only (drops punctuation inside the token too)
        cleaned = ''.join(filter(str.isalnum, raw_token))
        if not cleaned:
            continue
        tokens.append(stemmer.stem(cleaned))

    return ' '.join(tokens)

# Replace each movie's tags with their normalised (lower-cased,
# punctuation-free, stemmed) form
print("Applying advanced text preprocessing...")
movies['tags'] = movies['tags'].map(preprocess_text)

print("Advanced preprocessing completed!")
print("\nSample processed tags:")
# Preview the first two movies, truncating each tag string to 150 characters
for position in range(2):
    sample = movies.iloc[position]
    print(f"\nMovie: {sample['title']}")
    print(f"Processed tags: {sample['tags'][:150]}...")

Applying advanced text preprocessing...
Advanced preprocessing completed!

Sample processed tags:

Movie: Avatar
Processed tags: in the 22nd centuri a parapleg marin is dispatch to the moon pandora on a uniqu mission but becom torn between follow order and protect an alien civil...

Movie: Pirates of the Caribbean: At World's End
Processed tags: captain barbossa long believ to be dead ha come back to life and is head to the edg of the earth with will turner and elizabeth swann but noth is quit...


In [13]:
#Vectorisation
print("Creating feature vectors...")

# Bag-of-words counts capped at 5000 features, English stop words removed.
# To experiment with TF-IDF instead, swap in:
# cv = TfidfVectorizer(max_features=5000, stop_words='english')
cv = CountVectorizer(stop_words='english', max_features=5000)

# Build the dense movie-by-term matrix
try:
    vector = cv.fit_transform(movies['tags']).toarray()
    print(f"Vector shape: {vector.shape}")
    print("Vectorization successful!")
except Exception as err:
    print(f"Error in vectorization: {err}")
    # Best-effort retry with a smaller feature cap
    cv = CountVectorizer(stop_words='english', max_features=1000)
    vector = cv.fit_transform(movies['tags']).toarray()
    print(f"Fallback vector shape: {vector.shape}")

Creating feature vectors...
Vector shape: (1489, 5000)
Vectorization successful!


In [14]:
#Compute Similarity Matrix
print("Computing cosine similarity matrix...")

# Pairwise cosine similarity between all movie tag vectors
similarity = cosine_similarity(vector)
print(f"Similarity matrix shape: {similarity.shape}")

print("Similarity matrix computation completed!")

# Peek at the five largest scores in the first movie's row
first_title = movies.iloc[0]['title']
print(f"\nSample similarities for movie '{first_title}':")
similarities_sample = similarity[0]
top_scores = sorted(similarities_sample, reverse=True)[:5]
print(f"Top 5 similarity scores: {top_scores}")

Computing cosine similarity matrix...
Similarity matrix shape: (1489, 1489)
Similarity matrix computation completed!

Sample similarities for movie 'Avatar':
Top 5 similarity scores: [np.float64(0.9999999999999999), np.float64(0.3643962629999657), np.float64(0.349232531718729), np.float64(0.3198086379715893), np.float64(0.31863296370911354)]


In [15]:
#Recommendation Function
def recommend(movie, n=5):
    """
    Recommend movies based on content similarity

    Args:
        movie (str): Title of the movie
        n (int): Number of recommendations to return (default 5)

    Returns:
        list: List of recommended movie titles, most similar first.
            ["Movie not found in database"] if the title is unknown, or
            ["Error: ..."] if anything else goes wrong.
    """
    try:
        # Locate the movie by *position*. The row label from `.index` is not
        # a valid row number of `similarity` once rows have been dropped from
        # `movies` without resetting the index.
        matches = np.flatnonzero((movies['title'] == movie).to_numpy())
        if matches.size == 0:
            return ["Movie not found in database"]
        position = int(matches[0])

        # Rank all movies by descending similarity; the stable sort keeps the
        # original row order among ties.
        scores = np.asarray(similarity[position])
        ranked = np.argsort(-scores, kind='stable')

        # Exclude the query explicitly instead of assuming it sorts first
        # (another movie with identical tags would tie with it).
        picks = [int(p) for p in ranked if p != position][:n]

        return movies['title'].iloc[picks].tolist()

    except Exception as e:
        return [f"Error: {str(e)}"]

print("Recommendation function created!")

Recommendation function created!


In [16]:
#Test Rec Func
print("Testing the recommendation system...")

# Try a handful of well-known titles
test_movies = ['Avatar', 'The Dark Knight', 'Titanic']
known_titles = movies['title'].values

for movie in test_movies:
    if movie not in known_titles:
        print(f"\n'{movie}' not found in database")
        continue

    print(f"\nRecommendations for '{movie}':")
    for rank, title in enumerate(recommend(movie), start=1):
        print(f"{rank}. {title}")

# List the first 20 titles that can be queried
print("\nFirst 20 movies in database:")
print(movies['title'].iloc[:20].tolist())

Testing the recommendation system...

Recommendations for 'Avatar':
1. Mission to Mars
2. Alien³
3. Treasure Planet
4. Planet of the Apes
5. Starship Troopers

Recommendations for 'The Dark Knight':
1. The Dark Knight Rises
2. Batman Begins
3. Batman
4. Batman Returns
5. Batman Forever

Recommendations for 'Titanic':
1. Captain Phillips
2. Poseidon
3. Pirates of the Caribbean: On Stranger Tides
4. Love in the Time of Cholera
5. In the Heart of the Sea

First 20 movies in database:
['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of 

In [17]:
#Model Eval
def evaluate_recommendations(movie_title, expected_genres=None):
    """Print a movie's genres next to the genres of its recommendations.

    Args:
        movie_title (str): Title to evaluate.
        expected_genres: Currently unused; accepted for interface
            compatibility.

    Returns:
        "Movie not found" if the title is unknown, otherwise None (the
        comparison is printed).
    """
    titles = movies['title']
    if movie_title not in titles.values:
        return "Movie not found"

    def genres_of(title):
        # Select by boolean mask and take the first match. The previous code
        # passed a row *label* to `.iloc`, which returns a different movie's
        # genres whenever the index is not 0..n-1.
        return movies.loc[titles == title, 'genres'].iloc[0]

    # Get original movie details
    original_genres = genres_of(movie_title)

    print(f"Original movie: {movie_title}")
    print(f"Original genres: {original_genres}")

    # Get recommendations
    recommendations = recommend(movie_title)

    print(f"\nRecommendations and their genres:")
    for rec in recommendations:
        # Skip entries that are not titles (e.g. error messages from recommend)
        if rec in titles.values:
            print(f"- {rec}: {genres_of(rec)}")

# Test evaluation
# Smoke-test the helper on one title; the comparison is printed by the
# function itself.
print("Evaluating recommendations...")
evaluate_recommendations('Avatar')

Evaluating recommendations...
Original movie: Avatar
Original genres: Action Adventure Fantasy Science Fiction

Recommendations and their genres:
- Mission to Mars: Animation Adventure Comedy Family
- Alien³: Adventure
- Treasure Planet: Adventure Comedy Science Fiction
- Planet of the Apes: Action Thriller Science Fiction
- Starship Troopers: Drama Science Fiction


In [18]:
#Save Model
print("Saving the model and similarity matrix...")

# Create a clean dataset for the model
new_df = movies[['movie_id', 'title', 'tags']].copy()

try:
    # Save the processed dataframe. The `with` block closes (and therefore
    # flushes) the file; the previous bare open() calls never closed their
    # handles.
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(new_df, model_file)

    # Save the similarity matrix
    with open('similarity.pkl', 'wb') as similarity_file:
        pickle.dump(similarity, similarity_file)

    print("Model and similarity matrix saved successfully!")
    print("Files created:")
    print("- model.pkl")
    print("- similarity.pkl")

except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print(f"Error saving files: {e}")

Saving the model and similarity matrix...
Model and similarity matrix saved successfully!
Files created:
- model.pkl
- similarity.pkl


In [19]:
#Load/test model
print("Testing the saved model...")

try:
    # Load the saved model. `with` closes the file handles that the previous
    # bare open() calls leaked.
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # written by this notebook.
    with open('model.pkl', 'rb') as model_file:
        loaded_movies = pickle.load(model_file)
    with open('similarity.pkl', 'rb') as similarity_file:
        loaded_similarity = pickle.load(similarity_file)

    print("Model loaded successfully!")
    print(f"Loaded movies shape: {loaded_movies.shape}")
    print(f"Loaded similarity shape: {loaded_similarity.shape}")

    # Test with loaded model
    def recommend_from_saved(movie):
        """Recommendation function using saved model.

        Returns the five most similar titles, or ["Movie not found"] when the
        title is not in the saved frame.
        """
        # Look the movie up by position: the saved frame keeps whatever index
        # labels it was pickled with, and those need not be valid row numbers
        # of the similarity matrix.
        matches = np.flatnonzero((loaded_movies['title'] == movie).to_numpy())
        if matches.size == 0:
            return ["Movie not found"]
        position = int(matches[0])

        scores = np.asarray(loaded_similarity[position])
        ranked = np.argsort(-scores, kind='stable')
        # Drop the query itself explicitly rather than assuming it ranks first
        picks = [int(p) for p in ranked if p != position][:5]

        return loaded_movies['title'].iloc[picks].tolist()

    # Test the loaded model
    test_movie = 'Avatar'
    if test_movie in loaded_movies['title'].values:
        print(f"\nTesting with loaded model - Recommendations for '{test_movie}':")
        recs = recommend_from_saved(test_movie)
        for i, rec in enumerate(recs, 1):
            print(f"{i}. {rec}")

except Exception as e:
    print(f"Error loading model: {e}")

Testing the saved model...
Model loaded successfully!
Loaded movies shape: (1489, 3)
Loaded similarity shape: (1489, 1489)

Testing with loaded model - Recommendations for 'Avatar':
1. Mission to Mars
2. Alien³
3. Treasure Planet
4. Planet of the Apes
5. Starship Troopers


In [20]:
#Analysis and Insights 
print("Additional Analysis:")

# Dataset statistics
words_per_movie = movies['tags'].str.split().str.len()
print(f"Total movies in dataset: {len(movies)}")
print(f"Average number of words in tags: {words_per_movie.mean():.2f}")

# Most common words in tags
from collections import Counter
all_words = [word for tags in movies['tags'] for word in tags.split()]
word_freq = Counter(all_words)
print("\nMost common words in tags:")
for word, freq in word_freq.most_common(10):
    print(f"{word}: {freq}")

# Similarity distribution analysis
print("\nSimilarity matrix statistics:")
for label, value in (("Mean", similarity.mean()),
                     ("Std", similarity.std()),
                     ("Max", similarity.max()),
                     ("Min", similarity.min())):
    print(f"{label} similarity: {value:.4f}")

# Closing banner and follow-up notes
banner = "=" * 50
for line in (
    "\n" + banner,
    "ANALYSIS COMPLETE!",
    banner,
    "\nFiles generated:",
    "- model.pkl (processed movie data)",
    "- similarity.pkl (similarity matrix)",
    "\nNext steps:",
    "1. Use these files in your Flask app (app.py)",
    "2. Test the web interface",
    "3. Consider implementing collaborative filtering",
    "4. Deploy with Docker",
):
    print(line)

Additional Analysis:
Total movies in dataset: 1489
Average number of words in tags: 77.85

Most common words in tags:
the: 4747
a: 3119
to: 2679
and: 2403
of: 2332
in: 1327
hi: 1291
is: 1009
on: 886
with: 751

Similarity matrix statistics:
Mean similarity: 0.0530
Std similarity: 0.0531
Max similarity: 1.0000
Min similarity: 0.0000

ANALYSIS COMPLETE!

Files generated:
- model.pkl (processed movie data)
- similarity.pkl (similarity matrix)

Next steps:
1. Use these files in your Flask app (app.py)
2. Test the web interface
3. Consider implementing collaborative filtering
4. Deploy with Docker


In [24]:
#Collab Filtering
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')

print("🎬 Implementing Collaborative Filtering (Simplified Version)...")
print("Note: This version doesn't require the Surprise library")

# Load existing movie data.
# `with` closes the file handle; `except Exception` (instead of a bare
# `except:`) no longer swallows KeyboardInterrupt/SystemExit, and the reason
# for the failure is reported.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# written by this notebook.
try:
    with open('model.pkl', 'rb') as model_file:
        movies_df = pickle.load(model_file)
    print(f"✅ Loaded {len(movies_df)} movies from content-based model")
except Exception as err:
    print("❌ Could not load existing movie data")
    print(f"   Reason: {err}")
    movies_df = None

if movies_df is not None:
    # Create synthetic user ratings for demonstration
    np.random.seed(42)  # For reproducibility

    print("📊 Creating synthetic user-movie rating data...")

    # Sample movies for the rating matrix.
    # Note: the "movie ids" used below are the row labels of movies_df, not
    # the values of its 'movie_id' column.
    sample_movies = movies_df.sample(min(200, len(movies_df)))
    movie_ids = sample_movies.index.tolist()

    # Create synthetic users and ratings
    n_users = 100
    n_movies = len(sample_movies)

    # Bounds for the number of ratings per user. np.random.randint excludes
    # the upper bound, so with 50+ movies each user rates 15-49 of them.
    # The bounds are clamped so that a sample of 15 or fewer movies no longer
    # makes randint raise (low >= high).
    max_ratings = min(50, n_movies)
    min_ratings = min(15, max_ratings)

    # Generate realistic rating patterns
    ratings_data = []

    for user_id in range(1, n_users + 1):
        if min_ratings < max_ratings:
            n_ratings = np.random.randint(min_ratings, max_ratings)
        else:
            n_ratings = max_ratings
        rated_movies = np.random.choice(movie_ids, n_ratings, replace=False)

        # NOTE(review): the drawn preference is never used when generating
        # ratings. The draw is kept so the seeded random stream, and therefore
        # the generated ratings, stay unchanged.
        user_preference = np.random.choice(['action', 'comedy', 'drama', 'romance', 'horror'])

        for movie_idx in rated_movies:
            # Base rating between 1-5, skewed towards 3 and 4
            base_rating = np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.3, 0.4, 0.15])

            # Add Gaussian noise, clamp to [1, 5], round to a whole star
            rating = max(1, min(5, base_rating + np.random.normal(0, 0.5)))
            rating = round(rating)

            ratings_data.append({
                'user_id': user_id,
                'movie_id': movie_idx,
                'rating': rating,
                'movie_title': movies_df.loc[movie_idx, 'title']
            })

    ratings_df = pd.DataFrame(ratings_data)
    print(f"✅ Created rating dataset: {len(ratings_df)} ratings")
    print(f"📊 Users: {ratings_df['user_id'].nunique()}, Movies: {ratings_df['movie_id'].nunique()}")
    print(f"📈 Rating distribution:\n{ratings_df['rating'].value_counts().sort_index()}")

    # Save ratings
    ratings_df.to_csv('synthetic_ratings.csv', index=False)
    print("💾 Saved ratings to synthetic_ratings.csv")

🎬 Implementing Collaborative Filtering (Simplified Version)...
Note: This version doesn't require the Surprise library
✅ Loaded 1489 movies from content-based model
📊 Creating synthetic user-movie rating data...
✅ Created rating dataset: 3221 ratings
📊 Users: 100, Movies: 200
📈 Rating distribution:
rating
1     185
2     368
3     959
4    1098
5     611
Name: count, dtype: int64
💾 Saved ratings to synthetic_ratings.csv


In [25]:
#User-Movie Matrix
class SimpleCollaborativeFilter:
    """Simple collaborative filtering implementation"""
    
    def __init__(self, ratings_df, movies_df):
        self.ratings_df = ratings_df
        self.movies_df = movies_df
        self.user_movie_matrix = None
        self.user_similarity = None
        self.item_similarity = None
        self.svd_model = None
        
        self._build_matrices()
    
    def _build_matrices(self):
        """Build user-movie matrix and similarity matrices"""
        print("🔧 Building user-movie matrix...")
        
        # Create user-movie matrix
        self.user_movie_matrix = self.ratings_df.pivot_table(
            index='user_id', 
            columns='movie_id', 
            values='rating',
            fill_value=0
        )
        
        print(f"✅ User-movie matrix: {self.user_movie_matrix.shape}")
        
        # Calculate user similarity (user-based CF)
        print("🔧 Calculating user similarities...")
        user_matrix = self.user_movie_matrix.values
        self.user_similarity = cosine_similarity(user_matrix)
        
        # Calculate item similarity (item-based CF)
        print("🔧 Calculating item similarities...")
        item_matrix = self.user_movie_matrix.T.values
        self.item_similarity = cosine_similarity(item_matrix)
        
        # Simple SVD for matrix factorization
        print("🔧 Training SVD model...")
        # Convert to binary matrix (rated/not rated) for SVD
        binary_matrix = (self.user_movie_matrix > 0).astype(int)
        
        self.svd_model = TruncatedSVD(n_components=50, random_state=42)
        self.user_factors = self.svd_model.fit_transform(binary_matrix)
        self.item_factors = self.svd_model.components_.T
        
        print("✅ Collaborative filtering models built!")
    
    def predict_user_based(self, user_id, movie_id, k=5):
        """Predict rating using user-based collaborative filtering"""
        if user_id not in self.user_movie_matrix.index:
            return 3.0  # Default rating
        
        if movie_id not in self.user_movie_matrix.columns:
            return 3.0
        
        user_idx = list(self.user_movie_matrix.index).index(user_id)
        movie_idx = list(self.user_movie_matrix.columns).index(movie_id)
        
        # Find k most similar users who rated this movie
        user_sims = self.user_similarity[user_idx]
        movie_ratings = self.user_movie_matrix.iloc[:, movie_idx]
        
        # Get users who rated this movie
        rated_users = movie_ratings > 0
        
        if rated_users.sum() == 0:
            return 3.0
        
        # Get similarities for users who rated this movie
        valid_sims = user_sims[rated_users]
        valid_ratings = movie_ratings[rated_users]
        
        # Get top-k similar users
        if len(valid_sims) > k:
            top_k_indices = np.argsort(valid_sims)[-k:]
            valid_sims = valid_sims[top_k_indices]
            valid_ratings = valid_ratings.iloc[top_k_indices]
        
        # Calculate weighted average
        if valid_sims.sum() == 0:
            return 3.0
        
        prediction = np.average(valid_ratings, weights=valid_sims)
        return max(1.0, min(5.0, prediction))
    
    def predict_item_based(self, user_id, movie_id, k=5):
        """Predict rating using item-based collaborative filtering"""
        if user_id not in self.user_movie_matrix.index:
            return 3.0
        
        if movie_id not in self.user_movie_matrix.columns:
            return 3.0
        
        user_ratings = self.user_movie_matrix.loc[user_id]
        movie_idx = list(self.user_movie_matrix.columns).index(movie_id)
        
        # Find movies rated by this user
        rated_movies = user_ratings > 0
        
        if rated_movies.sum() == 0:
            return 3.0
        
        # Get similarities to target movie
        movie_sims = self.item_similarity[movie_idx]
        
        # Get similarities for movies rated by user
        valid_sims = movie_sims[rated_movies]
        valid_ratings = user_ratings[rated_movies]
        
        # Get top-k similar movies
        if len(valid_sims) > k:
            top_k_indices = np.argsort(valid_sims)[-k:]
            valid_sims = valid_sims[top_k_indices]
            valid_ratings = valid_ratings.iloc[top_k_indices]
        
        # Calculate weighted average
        if valid_sims.sum() == 0:
            return 3.0
        
        prediction = np.average(valid_ratings, weights=valid_sims)
        return max(1.0, min(5.0, prediction))
    
    def predict_svd(self, user_id, movie_id):
        """Predict rating using SVD matrix factorization"""
        if user_id not in self.user_movie_matrix.index:
            return 3.0
        
        if movie_id not in self.user_movie_matrix.columns:
            return 3.0
        
        user_idx = list(self.user_movie_matrix.index).index(user_id)
        movie_idx = list(self.user_movie_matrix.columns).index(movie_id)
        
        # Predict using dot product of factors
        prediction = np.dot(self.user_factors[user_idx], self.item_factors[movie_idx])
        
        # Scale prediction to 1-5 range
        # SVD gives values around 0-1, so we scale to rating range
        scaled_prediction = 1 + 4 * max(0, min(1, prediction))
        
        return scaled_prediction
    
    def get_recommendations(self, user_id, method='user_based', n_recommendations=5):
        """Rank the movies a user has not rated by predicted rating.

        Args:
            user_id: Row label of ``self.user_movie_matrix``.
            method: ``'user_based'``, ``'item_based'`` or ``'svd'``; any
                other value scores every candidate with the neutral 3.0.
            n_recommendations: Maximum number of entries to return.

        Returns:
            A list of ``(title, predicted_rating, movie_id)`` tuples sorted
            by predicted rating, highest first. The list is empty when the
            user is not in the matrix.
        """
        if user_id not in self.user_movie_matrix.index:
            return []

        # A stored rating of 0 marks a movie the user has not rated yet.
        ratings_row = self.user_movie_matrix.loc[user_id]
        candidates = ratings_row[ratings_row == 0].index

        # Resolve the prediction routine once instead of per candidate.
        if method == 'user_based':
            predictor = self.predict_user_based
        elif method == 'item_based':
            predictor = self.predict_item_based
        elif method == 'svd':
            predictor = self.predict_svd
        else:
            predictor = None

        scored = []
        for movie_id in candidates:
            if predictor is None:
                score = 3.0
            else:
                score = predictor(user_id, movie_id)

            # Movies missing from the catalogue get a placeholder title.
            if movie_id in self.movies_df.index:
                title = self.movies_df.loc[movie_id, 'title']
            else:
                title = "Unknown"
            scored.append((title, score, movie_id))

        # Stable sort: equal scores keep their column order.
        ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
        return ranked[:n_recommendations]
    
    def evaluate_model(self, test_size=0.2, max_samples=100):
        """Report RMSE and MAE of each prediction method on sampled ratings.

        Args:
            test_size: Fraction of ``self.ratings_df`` to sample (with a
                fixed ``random_state`` of 42, so the sample is repeatable).
            max_samples: Upper bound on how many sampled rows are scored,
                kept for speed. ``None`` scores every sampled row.

        Returns:
            ``{method: {'RMSE': float, 'MAE': float}}`` for ``'user_based'``,
            ``'item_based'`` and ``'svd'``, in that order. Both metrics are
            NaN when the sample is empty.

        NOTE(review): the rows are sampled from ``self.ratings_df`` and no
        model is refit here; if the models were built from that same frame
        the sampled ratings are not held out and the errors will look
        optimistic -- confirm against the constructor.
        """
        print("📊 Evaluating collaborative filtering models...")

        # Draw the evaluation sample once and share it across all methods.
        test_data = self.ratings_df.sample(frac=test_size, random_state=42)
        if max_samples is not None:
            test_data = test_data.head(max_samples)

        # Read the columns directly: iterrows() would coerce every row to a
        # single dtype and hand integer ids to the predictors as floats.
        user_ids = test_data['user_id'].tolist()
        movie_ids = test_data['movie_id'].tolist()
        actuals = np.asarray(test_data['rating'], dtype=float)

        predictors = {
            'user_based': self.predict_user_based,
            'item_based': self.predict_item_based,
            'svd': self.predict_svd,
        }
        results = {}

        for method, predict in predictors.items():
            print(f"   Testing {method}...")

            predictions = np.array(
                [predict(user_id, movie_id) for user_id, movie_id in zip(user_ids, movie_ids)],
                dtype=float,
            )

            # Calculate RMSE and MAE
            if actuals.size == 0:
                # Nothing to score; report NaN explicitly instead of taking
                # the mean of an empty array.
                rmse = mae = float('nan')
            else:
                errors = predictions - actuals
                rmse = np.sqrt(np.mean(errors ** 2))
                mae = np.mean(np.abs(errors))

            results[method] = {'RMSE': rmse, 'MAE': mae}
            print(f"   ✅ {method}: RMSE={rmse:.3f}, MAE={mae:.3f}")

        return results

# Build and test collaborative filtering
# Both inputs are looked up by name so the cell is skipped (rather than
# raising NameError) when either frame has not been created by an earlier cell.
if 'ratings_df' in globals() and globals().get('movies_df') is not None:
    print("\n🤖 Building Collaborative Filtering System...")
    
    collab_filter = SimpleCollaborativeFilter(ratings_df, movies_df)
    
    # Test different methods
    test_user = 1
    methods = ['user_based', 'item_based', 'svd']
    
    print(f"\n🎯 Testing Recommendations for User {test_user}:")
    
    for method in methods:
        print(f"\n📊 {method.replace('_', '-').title()} Recommendations:")
        recommendations = collab_filter.get_recommendations(test_user, method=method)
        
        # Each entry is a (title, predicted rating, movie id) tuple.
        for i, (movie_title, pred_rating, movie_id) in enumerate(recommendations, 1):
            print(f"   {i}. {movie_title} (Rating: {pred_rating:.2f})")
    
    # Evaluate models
    evaluation_results = collab_filter.evaluate_model()
    
    # Save the collaborative filtering model together with the inputs and
    # metrics needed to inspect it later; the hybrid recommender reads the
    # 'model' entry back from this file.
    collab_model_data = {
        'model': collab_filter,
        'user_movie_matrix': collab_filter.user_movie_matrix,
        'ratings_df': ratings_df,
        'evaluation_results': evaluation_results
    }
    
    with open('collaborative_model.pkl', 'wb') as f:
        pickle.dump(collab_model_data, f)
    
    print("\n💾 Saved collaborative filtering model to collaborative_model.pkl")


🤖 Building Collaborative Filtering System...
🔧 Building user-movie matrix...
✅ User-movie matrix: (100, 200)
🔧 Calculating user similarities...
🔧 Calculating item similarities...
🔧 Training SVD model...
✅ Collaborative filtering models built!

🎯 Testing Recommendations for User 1:

📊 User-Based Recommendations:
   1. The Longest Ride (Rating: 5.00)
   2. The Wolverine (Rating: 4.81)
   3. Autumn in New York (Rating: 4.60)
   4. The Change-Up (Rating: 4.42)
   5. Aliens vs Predator: Requiem (Rating: 4.38)

📊 Item-Based Recommendations:
   1. Dream House (Rating: 5.00)
   2. Town & Country (Rating: 4.65)
   3. Pay It Forward (Rating: 4.64)
   4. Sphere (Rating: 4.62)
   5. Rollerball (Rating: 4.60)

📊 Svd Recommendations:
   1. Abduction (Rating: 2.44)
   2. Mars Needs Moms (Rating: 2.28)
   3. The Change-Up (Rating: 2.15)
   4. The General's Daughter (Rating: 2.14)
   5. Ender's Game (Rating: 2.02)
📊 Evaluating collaborative filtering models...
   Testing user_based...
   ✅ user_based:

In [26]:
#Hybrid Recommender
class SimpleHybridRecommender:
    """Blend content-based and collaborative-filtering scores into one ranking.

    Content scores are rows of a precomputed similarity matrix; collaborative
    scores are predicted ratings rescaled from 1-5 to 0-1. The two are
    combined as a weighted sum keyed by movie title.

    NOTE(review): the two score families are not put on a common scale
    before blending, so whichever side produces larger raw values tends to
    dominate the final ranking -- confirm this is intended.
    """
    
    def __init__(self, content_weight=0.6, collaborative_weight=0.4):
        """Store the blend weights and load the pickled models from disk.

        Args:
            content_weight: Multiplier applied to content similarity scores.
            collaborative_weight: Multiplier applied to normalised
                collaborative scores.
        """
        self.content_weight = content_weight
        self.collaborative_weight = collaborative_weight
        
        # Load models
        self.load_models()
    
    def load_models(self):
        """Load the content and collaborative models from the working directory.

        Reads ``model.pkl``, ``similarity.pkl`` and ``collaborative_model.pkl``.
        Failures are reported on stdout rather than raised; any model that
        could not be loaded is left as ``None``.
        """
        # Defined up front so a partial load never leaves attributes missing.
        self.movies_df = None
        self.similarity_matrix = None
        self.collab_filter = None
        
        try:
            # NOTE(review): pickle.load can execute arbitrary code; only
            # load files produced by this notebook.
            # Content-based models
            with open('model.pkl', 'rb') as f:
                self.movies_df = pickle.load(f)
            with open('similarity.pkl', 'rb') as f:
                self.similarity_matrix = pickle.load(f)
            print("✅ Content-based models loaded")
            
            # Collaborative filtering model
            with open('collaborative_model.pkl', 'rb') as f:
                collab_data = pickle.load(f)
            self.collab_filter = collab_data['model']
            print("✅ Collaborative filtering model loaded")
            
        except Exception as e:
            print(f"❌ Error loading models: {e}")
    
    def get_content_recommendations(self, movie_title, n_recommendations=10):
        """Return the movies most similar to ``movie_title``.

        Args:
            movie_title: Exact title to look up in ``self.movies_df['title']``.
            n_recommendations: Number of neighbours to take.

        Returns:
            ``{title: similarity}`` in descending similarity order; empty
            when the title is unknown, the models are not loaded, or an
            error occurs (the error is printed).
        """
        try:
            if getattr(self, 'movies_df', None) is None or getattr(self, 'similarity_matrix', None) is None:
                print("Error in content recommendations: content models are not loaded")
                return {}
            
            titles = self.movies_df['title']
            
            # The similarity matrix is addressed by row position (results
            # are read back with .iloc), so locate the movie by position
            # too instead of by index label.
            matches = (titles == movie_title).to_numpy().nonzero()[0]
            if len(matches) == 0:
                return {}
            
            movie_pos = int(matches[0])
            sim_scores = sorted(
                enumerate(self.similarity_matrix[movie_pos]),
                key=lambda x: x[1],
                reverse=True,
            )
            
            # Drop the query movie explicitly: when another movie ties with
            # it, the query is not guaranteed to be first after sorting.
            neighbours = [(i, score) for i, score in sim_scores if i != movie_pos]
            
            content_scores = {}
            for i, score in neighbours[:n_recommendations]:
                content_scores[titles.iloc[i]] = score
            
            return content_scores
            
        except Exception as e:
            print(f"Error in content recommendations: {e}")
            return {}
    
    def get_collaborative_recommendations(self, user_id, method='user_based', n_recommendations=10):
        """Return collaborative-filtering scores for ``user_id``.

        Args:
            user_id: Passed through to the collaborative model.
            method: Prediction method name passed through to the model.
            n_recommendations: Number of recommendations to request.

        Returns:
            ``{title: score}`` where ``score = (rating - 1) / 4``; empty on
            error (the error is printed).
        """
        try:
            recommendations = self.collab_filter.get_recommendations(
                user_id, method=method, n_recommendations=n_recommendations
            )
            
            collab_scores = {}
            for movie_title, rating, movie_id in recommendations:
                # Normalize rating to 0-1 scale
                normalized_score = (rating - 1) / 4
                collab_scores[movie_title] = normalized_score
            
            return collab_scores
            
        except Exception as e:
            print(f"Error in collaborative recommendations: {e}")
            return {}
    
    def get_hybrid_recommendations(self, movie_title=None, user_id=None, n_recommendations=5):
        """Combine content and collaborative scores into one ranked list.

        Args:
            movie_title: Seed movie for the content side; skipped when falsy.
            user_id: User for the collaborative side; skipped only when
                ``None`` (so an id of ``0`` is honoured).
            n_recommendations: Number of results to return. Each side is
                asked for twice as many candidates.

        Returns:
            A list of ``(title, score)`` tuples, highest score first.
        """
        print("🔗 Generating hybrid recommendations...")
        
        hybrid_scores = {}
        
        # Get content-based scores
        if movie_title:
            content_scores = self.get_content_recommendations(movie_title, n_recommendations * 2)
            print(f"✅ Content-based: {len(content_scores)} movies")
            
            for movie, score in content_scores.items():
                hybrid_scores[movie] = self.content_weight * score
        
        # Get collaborative scores
        if user_id is not None:
            collab_scores = self.get_collaborative_recommendations(user_id, n_recommendations=n_recommendations * 2)
            print(f"✅ Collaborative: {len(collab_scores)} movies")
            
            # Titles found by both sides accumulate both weighted scores.
            for movie, score in collab_scores.items():
                hybrid_scores[movie] = hybrid_scores.get(movie, 0) + self.collaborative_weight * score
        
        # Sort and return top recommendations
        sorted_recommendations = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_recommendations[:n_recommendations]

# Exercise the hybrid recommender, but only if the collaborative model was
# built by an earlier cell.
if 'collab_filter' in locals():
    print("\n🚀 Testing Simple Hybrid Recommender...")

    hybrid = SimpleHybridRecommender(content_weight=0.6, collaborative_weight=0.4)

    # One scenario per input combination: both inputs, movie only, user only.
    test_cases = [
        {"movie": "Avatar", "user": 1, "desc": "Hybrid (movie + user)"},
        {"movie": "Avatar", "user": None, "desc": "Content-based only"},
        {"movie": None, "user": 1, "desc": "Collaborative only"}
    ]

    for test in test_cases:
        print(f"\n📝 {test['desc']}:")
        recommendations = hybrid.get_hybrid_recommendations(
            movie_title=test['movie'], user_id=test['user'], n_recommendations=5
        )

        # Ranked (title, blended score) pairs, numbered from 1.
        for i, (movie, score) in enumerate(recommendations, start=1):
            print(f"   {i}. {movie} (Score: {score:.3f})")

    # Persist the configured recommender for later reuse.
    with open('hybrid_model.pkl', 'wb') as f:
        pickle.dump(hybrid, f)

    print("\n💾 Saved hybrid model to hybrid_model.pkl")

# Closing banner, printed whether or not the test above ran.
print("\n".join([
    "\n" + "="*60,
    "🎉 SIMPLIFIED BONUS TASKS COMPLETE!",
    "✅ Collaborative Filtering (User-based, Item-based, SVD)",
    "✅ Hybrid Recommender System",
    "✅ No external dependencies required!",
    "="*60,
]))


🚀 Testing Simple Hybrid Recommender...
✅ Content-based models loaded
✅ Collaborative filtering model loaded

📝 Hybrid (movie + user):
🔗 Generating hybrid recommendations...
✅ Content-based: 10 movies
✅ Collaborative: 10 movies
   1. The Longest Ride (Score: 0.400)
   2. The Wolverine (Score: 0.381)
   3. Autumn in New York (Score: 0.360)
   4. The Change-Up (Score: 0.342)
   5. Aliens vs Predator: Requiem (Score: 0.338)

📝 Content-based only:
🔗 Generating hybrid recommendations...
✅ Content-based: 10 movies
   1. Mission to Mars (Score: 0.219)
   2. Alien³ (Score: 0.210)
   3. Treasure Planet (Score: 0.192)
   4. Planet of the Apes (Score: 0.191)
   5. Starship Troopers (Score: 0.185)

📝 Collaborative only:
🔗 Generating hybrid recommendations...
✅ Collaborative: 10 movies
   1. The Longest Ride (Score: 0.400)
   2. The Wolverine (Score: 0.381)
   3. Autumn in New York (Score: 0.360)
   4. The Change-Up (Score: 0.342)
   5. Aliens vs Predator: Requiem (Score: 0.338)

💾 Saved hybrid mod