<a href="https://colab.research.google.com/github/divyansh212/AI-MODELS-/blob/main/Content-Based%20Recommandation%20system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Content-Based Movie Recommendation System

import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer

# Step 1: Load and structure data
def load_and_process_data():
    """Load movie data and merge credits with movies"""
    movies = pd.read_csv("tmdb_5000_movies.csv.zip")
    credits = pd.read_csv("tmdb_5000_credits.csv.zip")
    movies = movies.merge(credits, on="title")
    movies = movies[["id", "genres", "keywords", "title", "overview", "cast", "crew"]]
    return movies

# Step 2: Data preprocessing functions
def convert_list_fields(obj):
    """Convert JSON string to list of names"""
    try:
        L = []
        for i in ast.literal_eval(obj):
            L.append(i["name"])
        return L
    except:
        return []

def convert_cast(obj):
    """Convert cast to list of top 3 actors"""
    try:
        L = []
        counter = 0
        for i in ast.literal_eval(obj):
            if counter == 3:
                break
            L.append(i["name"])
            counter += 1
        return L
    except:
        return []

def get_director(obj):
    """Extract director from crew"""
    try:
        L = []
        for i in ast.literal_eval(obj):
            if i["job"] == "Director":
                L.append(i["name"])
                break
        return L
    except:
        return []

def preprocess_movies(movies):
    """Clean and process movie data"""
    # Drop rows with missing values
    movies.dropna(inplace=True)

    # Convert JSON strings to lists
    movies["genres"] = movies["genres"].apply(convert_list_fields)
    movies["keywords"] = movies["keywords"].apply(convert_list_fields)
    movies["cast"] = movies["cast"].apply(convert_cast)
    movies["crew"] = movies["crew"].apply(get_director)

    # Split overview into words
    movies["overview"] = movies["overview"].apply(lambda x: x.split() if isinstance(x, str) else [])

    # Remove spaces from names
    movies["genres"] = movies["genres"].apply(lambda x: [i.replace(" ", "") for i in x])
    movies["keywords"] = movies["keywords"].apply(lambda x: [i.replace(" ", "") for i in x])
    movies["cast"] = movies["cast"].apply(lambda x: [i.replace(" ", "") for i in x])
    movies["crew"] = movies["crew"].apply(lambda x: [i.replace(" ", "") for i in x])

    # Create combined tags
    movies["Tags"] = (movies["overview"] + movies["genres"] +
                     movies["keywords"] + movies["cast"] + movies["crew"])

    # Create final dataset
    new_movies = movies[["id", "title", "Tags"]].copy()
    new_movies["Tags"] = new_movies["Tags"].apply(lambda x: " ".join(x))
    new_movies["Tags"] = new_movies["Tags"].apply(lambda x: x.lower())

    return new_movies

# Step 3: Text processing with stemming
def apply_stemming(df):
    """Apply Porter stemming to tags"""
    stemmer = PorterStemmer()

    def stem_text(text):
        words = []
        for word in text.split():
            words.append(stemmer.stem(word))
        return " ".join(words)

    df_copy = df.copy()
    df_copy["Tags"] = df_copy["Tags"].apply(stem_text)
    return df_copy

# Step 4: Create similarity matrix
def create_similarity_matrix(df):
    """Create cosine similarity matrix from tags"""
    cv = CountVectorizer(max_features=10000, stop_words="english")
    vectors = cv.fit_transform(df["Tags"]).toarray()
    similarity_matrix = cosine_similarity(vectors)
    return similarity_matrix, cv

# Step 5: Recommendation function
def recommend_movies(movie_title, df, similarity_matrix, num_recommendations=5):
    """
    Recommend movies based on similarity to input movie

    Args:
        movie_title (str): Title of the movie to base recommendations on
        df (DataFrame): Movie dataset with id, title, and Tags columns
        similarity_matrix (array): Cosine similarity matrix
        num_recommendations (int): Number of recommendations to return

    Returns:
        list: List of recommended movie titles
    """
    try:
        # Find the index of the movie
        movie_index = df[df["title"] == movie_title].index[0]

        # Get similarity scores for this movie
        distances = similarity_matrix[movie_index]

        # Sort movies by similarity (excluding the movie itself)
        movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:num_recommendations+1]

        # Get recommended movie titles
        recommendations = []
        for i in movie_list:
            recommendations.append(df.iloc[i[0]]["title"])

        return recommendations

    except IndexError:
        return f"Movie '{movie_title}' not found in database."
    except Exception as e:
        return f"Error occurred: {str(e)}"

# Main execution function
def build_recommendation_system():
    """Build complete recommendation system"""
    print("Loading and processing data...")

    # Load data (you'll need to have the CSV files available)
    try:
        movies = load_and_process_data()
        print(f"Loaded {len(movies)} movies")
    except FileNotFoundError:
        print("Error: Could not find the required CSV files")
        return None, None, None

    # Preprocess data
    processed_movies = preprocess_movies(movies)
    print("Data preprocessing completed")

    # Apply stemming
    stemmed_movies = apply_stemming(processed_movies)
    print("Text stemming completed")

    # Create similarity matrix
    similarity_matrix, vectorizer = create_similarity_matrix(stemmed_movies)
    print("Similarity matrix created")

    return stemmed_movies, similarity_matrix, vectorizer

# Example usage
def demo_recommendations():
    """Demonstrate the recommendation system"""
    df, similarity_matrix, vectorizer = build_recommendation_system()

    if df is not None:
        # Test with a popular movie
        test_movie = "Avatar"
        recommendations = recommend_movies(test_movie, df, similarity_matrix)

        print(f"\nMovies similar to '{test_movie}':")
        for i, movie in enumerate(recommendations, 1):
            print(f"{i}. {movie}")

# Additional utility functions
def search_movies(df, search_term):
    """Search for movies containing a specific term"""
    matches = df[df["title"].str.contains(search_term, case=False, na=False)]
    return matches["title"].tolist()

def get_movie_info(df, movie_title):
    """Get information about a specific movie"""
    movie_data = df[df["title"] == movie_title]
    if not movie_data.empty:
        return {
            "id": movie_data.iloc[0]["id"],
            "title": movie_data.iloc[0]["title"],
            "tags": movie_data.iloc[0]["Tags"][:200] + "..."  # First 200 chars
        }
    else:
        return None

# Run the demo
if __name__ == "__main__":
    demo_recommendations()

Loading and processing data...
Loaded 4809 movies
Data preprocessing completed
Text stemming completed
Similarity matrix created

Movies similar to 'Avatar':
1. Aliens vs Predator: Requiem
2. Aliens
3. Falcon Rising
4. Titan A.E.
5. Independence Day
