In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

In [10]:
class PersonalMovieRecommender:
    """Content-based movie recommender driven by a personal watch history.

    Intended call order: ``load_data`` -> ``preprocess_data`` -> any of
    ``get_recommendations`` / ``get_similar_movies`` / ``analyze_preferences``.

    Public methods do not raise: problems are reported with ``print`` and a
    sentinel is returned (``False``, ``None``, an empty DataFrame or an error
    string, depending on the method).
    """

    # Value of the 'Liked/Disliked' watch-history column that marks a liked movie.
    # Matching is case-insensitive and ignores surrounding whitespace.
    LIKED_LABEL = 'liked'

    # Columns moved to the front of returned frames, when they are present.
    IMPORTANT_COLS = ['Title', 'Year', 'Genres', 'Director', 'Rating', 'Similarity']

    def __init__(self):
        self.movies_df = None          # movie catalogue, set by load_data
        self.watch_history_df = None   # personal watch history, set by load_data
        self.movie_features = None     # sparse TF-IDF matrix, one row per movies_df row
        self.tfidf_vectorizer = None   # fitted vectorizer, set by preprocess_data
        self.valid_movie_ids = None    # set of IDs present in movies_df

    # ------------------------------------------------------------------ helpers

    def _liked_rows(self):
        """Return the watch-history rows labelled as liked.

        Returns ``None`` when the history has no 'Liked/Disliked' column, so the
        caller can tell "no label column" apart from "nothing liked".
        """
        history = self.watch_history_df
        if 'Liked/Disliked' not in history.columns:
            return None
        labels = history['Liked/Disliked'].astype(str).str.strip().str.lower()
        return history[labels == self.LIKED_LABEL.lower()]

    def _rank_candidates(self, similarities, eligible, top_n):
        """Pick the ``top_n`` eligible rows with the highest similarity.

        Args:
            similarities: 1-D array of scores, one per row of the feature matrix.
            eligible: boolean array of the same length; False rows are skipped.
            top_n: maximum number of rows to return (negative values act as 0).

        Returns:
            ``(positions, scores)`` - positional row numbers into ``movies_df``
            and their scores, ordered by descending score. Ties keep catalogue
            order because the sort is stable.
        """
        # Never address rows beyond either the score vector or the catalogue.
        limit = min(len(similarities), len(self.movies_df))
        candidates = np.flatnonzero(np.asarray(eligible, dtype=bool)[:limit])
        order = np.argsort(-similarities[candidates], kind='stable')
        keep = max(0, min(top_n, len(candidates)))
        chosen = candidates[order[:keep]]
        return chosen, similarities[chosen]

    def _front_load_columns(self, frame):
        """Return ``frame`` with the IMPORTANT_COLS that exist moved to the front."""
        front = [col for col in self.IMPORTANT_COLS if col in frame.columns]
        rest = [col for col in frame.columns if col not in self.IMPORTANT_COLS]
        return frame[front + rest]

    # --------------------------------------------------------------- public API

    def load_data(self, movies_path, watch_history_path):
        """Load the movie dataset and the personal watch history.

        Both files are read as ';'-separated CSV and must contain an 'ID'
        column. Watch-history rows whose ID is not in the movie dataset are
        dropped (with a warning).

        Args:
            movies_path: path of the movie catalogue CSV.
            watch_history_path: path of the watch-history CSV.

        Returns:
            True on success, False if anything went wrong (the error is printed).
        """
        try:
            # Load movies dataset
            self.movies_df = pd.read_csv(movies_path, sep=';')
            print(f"Loaded {len(self.movies_df)} movies with columns: {', '.join(self.movies_df.columns)}")

            # Load watch history
            self.watch_history_df = pd.read_csv(watch_history_path, sep=';')
            print(f"Loaded {len(self.watch_history_df)} watched movies with columns: {', '.join(self.watch_history_df.columns)}")

            # Create a set of valid movie IDs for quick lookup
            self.valid_movie_ids = set(self.movies_df['ID'].values)

            # Check for watch history entries not in the movie dataset
            known = self.watch_history_df['ID'].isin(self.valid_movie_ids)
            invalid_count = int((~known).sum())

            if invalid_count > 0:
                print(f"Warning: {invalid_count} entries in watch history don't match any movie IDs in the dataset.")
                # Keep only valid entries; copy so the result is an independent
                # frame rather than a slice of the one just read.
                self.watch_history_df = self.watch_history_df[known].copy()

            return True
        except Exception as e:
            print(f"Error loading data: {e}")
            return False

    def preprocess_data(self):
        """Build the TF-IDF feature matrix from the movie metadata.

        Fills missing 'Genres', 'Director' and 'Cast' values with '', converts
        'Year' to string, concatenates the four columns into a 'Features'
        column on ``movies_df`` and fits a ``TfidfVectorizer`` on it.

        Returns:
            True on success, False otherwise (the error is printed).
        """
        if self.movies_df is None:
            print("Error preprocessing data: no movie data loaded; call load_data() first.")
            return False

        try:
            # Handle missing values
            for col in ['Genres', 'Director', 'Cast']:
                if col in self.movies_df.columns:
                    # astype(str) keeps the concatenation below valid even if a
                    # column was parsed as a non-string dtype.
                    self.movies_df[col] = self.movies_df[col].fillna('').astype(str)
                else:
                    print(f"Warning: Column '{col}' not found in dataset. Using empty values.")
                    self.movies_df[col] = ''

            # Convert year to string if it exists
            if 'Year' in self.movies_df.columns:
                self.movies_df['Year'] = self.movies_df['Year'].astype(str)
            else:
                print(f"Warning: Column 'Year' not found in dataset. Using empty values.")
                self.movies_df['Year'] = ''

            # Create a combined feature string
            self.movies_df['Features'] = (
                self.movies_df['Genres'] + ' ' +
                self.movies_df['Director'] + ' ' +
                self.movies_df['Cast'] + ' ' +
                self.movies_df['Year']
            )

            # Create TF-IDF vectors for movie features
            self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
            self.movie_features = self.tfidf_vectorizer.fit_transform(self.movies_df['Features'])

            print(f"Feature extraction complete. Feature matrix shape: {self.movie_features.shape}")
            return True
        except Exception as e:
            print(f"Error preprocessing data: {e}")
            return False

    def build_user_profile(self):
        """Build a profile vector by averaging the features of liked movies.

        Uses the watch-history rows labelled as liked; falls back to every
        watched movie when the label column is missing or nothing is liked.

        Returns:
            A 1 x n_features sparse matrix, or None when no profile can be
            built (the reason is printed).
        """
        if self.watch_history_df is None or self.movies_df is None or self.movie_features is None:
            print("Cannot build user profile: call load_data() and preprocess_data() first.")
            return None

        try:
            # Get movies the user liked
            liked_movies = self._liked_rows()
            if liked_movies is None:
                print("Warning: 'Liked/Disliked' column not found in watch history. Using all watched movies.")
                liked_movies = self.watch_history_df

            if liked_movies.empty:
                print("No liked movies found in watch history.")
                # Fall back to all watched movies if no likes are specified
                liked_movies = self.watch_history_df
                if liked_movies.empty:
                    print("No watched movies found either.")
                    return None

            print(f"Building user profile based on {len(liked_movies)} movies")

            # Rows of movie_features are positional, so look up *positions* in
            # movies_df (not index labels, which may differ from positions).
            catalogue_ids = self.movies_df['ID'].to_numpy()
            movie_positions = []
            for movie_id in liked_movies['ID']:
                matches = np.flatnonzero(catalogue_ids == movie_id)
                if matches.size and matches[0] < self.movie_features.shape[0]:
                    movie_positions.append(int(matches[0]))

            if not movie_positions:
                print("None of the liked/watched movies are in the dataset.")
                return None

            print(f"Found {len(movie_positions)} valid movie indices")

            # Average the sparse rows; only the single mean row is densified.
            profile_vectors = self.movie_features[movie_positions]
            user_profile = sp.csr_matrix(profile_vectors.mean(axis=0))

            return user_profile
        except Exception as e:
            print(f"Error building user profile: {e}")
            return None

    def get_recommendations(self, top_n=10):
        """Recommend unwatched movies most similar to the user profile.

        Args:
            top_n: maximum number of recommendations to return.

        Returns:
            DataFrame of recommended movies with an added 'Similarity' column,
            ordered by descending similarity; empty DataFrame on failure.
        """
        try:
            # Build the user profile
            user_profile = self.build_user_profile()

            if user_profile is None:
                print("Could not build user profile")
                return pd.DataFrame()

            # Calculate similarity between user profile and all movies
            similarities = cosine_similarity(user_profile, self.movie_features).flatten()

            # Exclude everything already in the watch history (liked or not)
            watched_ids = set(self.watch_history_df['ID'].values)
            unwatched = ~self.movies_df['ID'].isin(watched_ids).to_numpy()

            if not unwatched.any():
                print("No unwatched movies found with similarity scores")
                return pd.DataFrame()

            top_positions, top_scores = self._rank_candidates(similarities, unwatched, top_n)

            if top_positions.size == 0:
                print("No movie indices found after filtering")
                return pd.DataFrame()

            # Return the recommended movies with all columns plus the score
            recommendations = self.movies_df.iloc[top_positions].copy()
            recommendations['Similarity'] = top_scores

            return self._front_load_columns(recommendations)
        except Exception as e:
            print(f"Error getting recommendations: {e}")
            return pd.DataFrame()

    def get_similar_movies(self, movie_id, top_n=10):
        """Return the movies most similar to the movie with ``movie_id``.

        Args:
            movie_id: value of the 'ID' column identifying the reference movie.
            top_n: maximum number of similar movies to return.

        Returns:
            DataFrame with an added 'Similarity' column, ordered by descending
            similarity and excluding the reference row itself; empty DataFrame
            if the ID is unknown or an error occurs.
        """
        try:
            # Positional row of the movie (first match if the ID is duplicated)
            matches = np.flatnonzero((self.movies_df['ID'] == movie_id).to_numpy())

            if matches.size == 0:
                print(f"Movie ID {movie_id} not found in dataset")
                return pd.DataFrame()

            movie_position = int(matches[0])

            # Similarity between this movie and every movie in the catalogue
            target_vector = self.movie_features[movie_position]
            similarities = cosine_similarity(target_vector, self.movie_features).flatten()

            # Every row is a candidate except the reference movie itself
            eligible = np.ones(len(similarities), dtype=bool)
            eligible[movie_position] = False

            top_positions, top_scores = self._rank_candidates(similarities, eligible, top_n)

            similar_movies = self.movies_df.iloc[top_positions].copy()
            similar_movies['Similarity'] = top_scores

            return self._front_load_columns(similar_movies)
        except Exception as e:
            print(f"Error finding similar movies: {e}")
            return pd.DataFrame()

    def analyze_preferences(self):
        """Summarise the user's preferences as a text report.

        The report lists the most frequent genres, directors and years among
        the movies the profile is built from (liked movies, or all watched
        movies when none are labelled as liked), followed by the highest
        weighted TF-IDF features of the user profile.

        Returns:
            The report string, or a message string describing the failure.
        """
        try:
            # Build the user profile
            user_profile = self.build_user_profile()

            if user_profile is None:
                return "Could not build user profile for analysis"

            # Get feature names from the vectorizer
            feature_names = self.tfidf_vectorizer.get_feature_names_out()

            # Convert sparse matrix to array and flatten
            user_weights = user_profile.toarray().flatten()

            # Create (feature, weight) pairs and sort by weight
            feature_weights = [(name, weight) for name, weight in zip(feature_names, user_weights) if weight > 0]
            feature_weights.sort(key=lambda x: x[1], reverse=True)

            # Get top features
            top_features = feature_weights[:50]

            # Analyse the same movies the profile was built from, so the counts
            # and the TF-IDF section of the report describe one consistent set.
            profile_rows = self._liked_rows()
            if profile_rows is None or profile_rows.empty:
                profile_rows = self.watch_history_df
            profile_ids = set(profile_rows['ID'])
            watched_movies = self.movies_df[self.movies_df['ID'].isin(profile_ids)]

            # Analyze genres
            # NOTE(review): values are split on whitespace, so multi-word genre
            # or director names are counted word by word - confirm the
            # delimiter used in the dataset.
            genre_count = {}
            if 'Genres' in watched_movies.columns:
                for genres in watched_movies['Genres'].dropna():
                    for genre in genres.split():
                        genre_count[genre] = genre_count.get(genre, 0) + 1

            # Analyze directors
            director_count = {}
            if 'Director' in watched_movies.columns:
                for directors in watched_movies['Director'].dropna():
                    for director in directors.split():
                        director_count[director] = director_count.get(director, 0) + 1

            # Analyze years
            year_count = {}
            if 'Year' in watched_movies.columns:
                for year in watched_movies['Year'].dropna():
                    year_count[str(year)] = year_count.get(str(year), 0) + 1

            # Create the analysis report
            report = "Your Movie Preference Analysis:\n\n"

            # Report on top genres
            if genre_count:
                report += "Top Genres:\n"
                for genre, count in sorted(genre_count.items(), key=lambda x: x[1], reverse=True)[:5]:
                    report += f"- {genre}: {count} movies\n"
                report += "\n"

            # Report on top directors
            if director_count:
                report += "Top Directors:\n"
                for director, count in sorted(director_count.items(), key=lambda x: x[1], reverse=True)[:5]:
                    report += f"- {director}: {count} movies\n"
                report += "\n"

            # Report on era preferences
            if year_count:
                report += "Preferred Years:\n"
                for year, count in sorted(year_count.items(), key=lambda x: x[1], reverse=True)[:5]:
                    report += f"- {year}: {count} movies\n"
                report += "\n"

            # Report on top features from TF-IDF
            report += "Most Important Features in Your Profile:\n"
            for feature, weight in top_features[:15]:
                report += f"- {feature} (weight: {weight:.4f})\n"

            return report
        except Exception as e:
            return f"Error analyzing preferences: {e}"

In [11]:
if __name__ == "__main__":
    # Number of recommendations to request and announce.
    TOP_N = 10

    # Initialize the recommender
    recommender = PersonalMovieRecommender()

    # Load data - update these paths to your actual files.
    # load_data / preprocess_data report failure through their return value
    # (they print the cause themselves), so stop early instead of letting the
    # later steps emit confusing follow-up errors.
    if not recommender.load_data('data/data.csv', 'data/user_data.csv'):
        print("\nData could not be loaded; skipping recommendations and analysis.")
    elif not recommender.preprocess_data():
        print("\nPreprocessing failed; skipping recommendations and analysis.")
    else:
        # Get recommendations
        recommendations = recommender.get_recommendations(top_n=TOP_N)

        if not recommendations.empty:
            print(f"\nTop {TOP_N} movie recommendations:")

            # Check which columns are available and print them
            display_cols = [col for col in ['Title', 'Year', 'Genres', 'Similarity'] if col in recommendations.columns]
            print(recommendations[display_cols])
        else:
            print("\nNo recommendations available.")

        # Analyze preferences
        analysis = recommender.analyze_preferences()
        print("\nMovie Preference Analysis:")
        print(analysis)

Loaded 283047 movies with columns: ID, Title, Year, Genres, Director, Cast, RunningTime, Rating, Votes
Loaded 24 watched movies with columns: ID, Title, Year, Genres, Director, Cast, RunningTime, Rating, Votes, Liked/Disliked
Feature extraction complete. Feature matrix shape: (283047, 5000)
Building user profile based on 22 movies
Found 22 valid movie indices

Top 10 movie recommendations:
                                               Title  Year  \
252611                                  All the Rage  2016   
179928                                       A Dream  2021   
166778                         The Dark Knight Rises  2012   
219962  The Wick: Dispatches from the Isle of Wonder  2013   
249987                                       Dunkirk  2017   
55733                                    Out on Bail  1989   
197506                                   All Is Lost  2013   
273093                                      The King  2019   
26478                                     Black S