In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Diego\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
ratings_path = "data/ml-latest-small/ratings.csv"
movies_path = "data/ml-latest-small/movies.csv"
tags_path = "data/ml-latest-small/tags.csv"

ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)
tags = pd.read_csv(tags_path)

In [None]:
class ImprovedContentRecommender:
    def __init__(self, movies_df, ratings_df, tags_df):
        self.movies_df = movies_df.copy()
        self.ratings_df = ratings_df.copy()
        self.tags_df = tags_df.copy()
        self.tags_df['tag'] = self.tags_df['tag'].astype(str)
        self.tfidf_matrix = None
        self.nn_model = None
        
    def preprocess_text(self, text):
        if pd.isna(text):
            return ""
        
        text = str(text)
        text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
        
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        words = [lemmatizer.lemmatize(word) for word in words]
        
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]
        
        return ' '.join(words)
    
    def create_feature_matrix(self):
        self.movies_df['genres'] = self.movies_df['genres'].str.replace('|', ' ')
        
        tag_counts = self.tags_df['tag'].value_counts()
        tag_weights = 1 / np.log1p(tag_counts)
        
        weighted_tags_df = pd.DataFrame({
            'tag': tag_counts.index,
            'weight': tag_weights.values
        })

        weighted_tags = self.tags_df.merge(
            weighted_tags_df,
            on='tag',
            how='left'
        )
        
        weighted_tags['weighted_tag'] = weighted_tags['tag'] + ' ' + \
            (weighted_tags['weight'] * 1).round(3).astype(str)
        
        tags_aggregated = weighted_tags.groupby('movieId')['weighted_tag'].apply(
            lambda x: ' '.join(x)
        ).reset_index()
        
        self.movies_df = self.movies_df.merge(
            tags_aggregated,
            on='movieId',
            how='left'
        )
        self.movies_df['weighted_tag'].fillna('', inplace=True)
        
        self.movies_df['year'] = self.movies_df['title'].str.extract(
            r'\((\d{4})\)'
        ).fillna('2000')
        self.movies_df['year'] = pd.to_numeric(self.movies_df['year'])
    
        rating_stats = self.ratings_df.groupby('movieId').agg({
            'rating': ['count', 'mean', 'std']
        }).reset_index()
        rating_stats.columns = ['movieId', 'rating_count', 'rating_mean', 'rating_std']
        
        C = rating_stats['rating_count'].mean()
        m = rating_stats['rating_mean'].mean()
        rating_stats['bayesian_rating'] = (
            (C * m + rating_stats['rating_count'] * rating_stats['rating_mean']) /
            (C + rating_stats['rating_count'])
        )
        
        self.movies_df = self.movies_df.merge(
            rating_stats,
            on='movieId',
            how='left'
        )
        
        rating_cols = ['rating_count', 'rating_mean', 'rating_std', 'bayesian_rating']
        self.movies_df[rating_cols] = self.movies_df[rating_cols].fillna(0)
        
        self.movies_df['content'] = (
            self.movies_df['genres'].fillna('').apply(self.preprocess_text) + ' ' +
            self.movies_df['weighted_tag'].fillna('').apply(self.preprocess_text) + ' ' +
            self.movies_df['year'].astype(str) + ' ' +
            (self.movies_df['bayesian_rating'] * 2).round(1).astype(str)
        )
        
        self.tfidf = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=5000,
            min_df=2,
            max_df=0.95,
            stop_words='english'
        )
        self.tfidf_matrix = self.tfidf.fit_transform(self.movies_df['content'])
        
        return self.tfidf_matrix
    
    def build_similarity_model(self):
        cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix).astype(np.float32)
        
        scaler = MinMaxScaler()
        popularity_penalty = scaler.fit_transform(
            self.movies_df[['rating_count']].values
        )
        popularity_weight = 0.2
        
        current_year = 2024
        years = self.movies_df['year'].values
        recency_bonus = scaler.fit_transform(
            (current_year - years).reshape(-1, 1)
        )
        recency_weight = 0.1
        
        self.similarity_matrix = (
            cosine_sim * (1 - popularity_weight - recency_weight) +
            (popularity_penalty * popularity_weight) +
            (recency_bonus * recency_weight)
        ).astype(np.float32)
        
        self.nn_model = NearestNeighbors(
            n_neighbors=20,
            metric='precomputed',
            algorithm='brute'
        )
        self.nn_model.fit(1 - self.similarity_matrix)
        
        return self.nn_model
    
    def get_recommendations(self, movie_title, n_recommendations=10):
        try:
            movie_idx = self.movies_df[
                self.movies_df['title'] == movie_title
            ].index[0]
        except (IndexError, KeyError):
            print(f"Movie '{movie_title}' not found in the database.")
            return pd.DataFrame()
        
        distances, indices = self.nn_model.kneighbors(
            1 - self.similarity_matrix[movie_idx].reshape(1, -1)
        )
        
        similarities = 1 - distances.flatten()
        
        final_indices = [indices.flatten()[0]]
        for _ in range(1, n_recommendations):
            avg_similarities = np.mean([
                self.similarity_matrix[idx] for idx in final_indices
            ], axis=0)
            
            candidates = indices.flatten()
            scores = similarities - 0.3 * avg_similarities[candidates]
            
            for candidate_idx in candidates[np.argsort(-scores)]:
                if candidate_idx not in final_indices:
                    final_indices.append(candidate_idx)
                    break
        
        recommendations = self.movies_df.iloc[final_indices][
            ['title', 'bayesian_rating', 'rating_count', 'genres']
        ].copy()
        
        recommendations['similarity_score'] = similarities[
            [list(indices.flatten()).index(idx) for idx in final_indices]
        ]
        
        return recommendations

    def fit(self):
        """Fit the complete model"""
        print("Creating feature matrix...")
        self.create_feature_matrix()
        print("Building similarity model...")
        self.build_similarity_model()
        return self

In [None]:
improved_recommender = ImprovedContentRecommender(movies, ratings, tags)
improved_recommender.fit()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.movies_df['weighted_tag'].fillna('', inplace=True)


Creating feature matrix...
Building similarity model...


<__main__.ImprovedContentRecommender at 0x148f263ea60>

In [None]:
recommendations = improved_recommender.get_recommendations("Interstellar (2014)")
recommendations["title"]

8376      Interstellar (2014)
8697    Doctor Strange (2016)
1945         Following (1998)
8347         Divergent (2014)
9392           Arrival (2016)
7372         Inception (2010)
8414     Transcendence (2014)
8252           Gravity (2013)
8990      The Revenant (2015)
7212            Avatar (2009)
Name: title, dtype: object

In [None]:
class RecommenderEvaluator:
    def __init__(self, ratings_df, movies_df, recommender):
        """Initialize evaluator with data and recommender model"""
        self.ratings_df = ratings_df.copy()
        self.movies_df = movies_df.copy()
        self.recommender = recommender
        self.metrics = {}
    def calculate_rating_metrics(self, test_df, k=10):
        print("Calculating rating metrics...")
        actual_ratings = []
        predicted_ratings = []
        
        test_users = test_df['userId'].unique()
        if len(test_users) > 100:
            np.random.seed(42)
            test_users = np.random.choice(test_users, 100, replace=False)
        
        for user_id in test_users:
            user_ratings = test_df[test_df['userId'] == user_id]
            for _, row in user_ratings.iterrows():
                movie_title = self.movies_df[
                    self.movies_df['movieId'] == row['movieId']
                ]['title'].iloc[0]
                
                recs = self.recommender.get_recommendations(movie_title, k)
                if not recs.empty:
                    predicted_rating = recs['bayesian_rating'].iloc[0]
                    actual_ratings.append(row['rating'])
                    predicted_ratings.append(predicted_rating)
        
        rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
        mae = mean_absolute_error(actual_ratings, predicted_ratings)
        
        self.metrics['rmse'] = rmse
        self.metrics['mae'] = mae
        
        return rmse, mae
    def calculate_coverage(self, sample_size=100):
        print("Calculating catalog coverage...")
        total_movies = len(self.movies_df)
        recommended_movies = set()
        
        sample_movies = self.movies_df['title'].sample(
            n=min(sample_size, len(self.movies_df)),
            random_state=42
        )
        
        for movie_title in sample_movies:
            recs = self.recommender.get_recommendations(movie_title)
            if not recs.empty:
                recommended_movies.update(recs['title'].values)
        
        coverage = len(recommended_movies) / total_movies
        self.metrics['coverage'] = coverage
        
        return coverage

    def calculate_diversity(self, sample_size=100):
        print("Calculating recommendation diversity...")
        diversity_scores = []
        
        sample_movies = self.movies_df['title'].sample(
            n=min(sample_size, len(self.movies_df)),
            random_state=42
        )
        
        for movie_title in sample_movies:
            recs = self.recommender.get_recommendations(movie_title)
            if not recs.empty:
                genres_list = recs['genres'].str.split().values
                unique_genres = set()
                for genres in genres_list:
                    unique_genres.update(genres)
                
                genre_diversity = len(unique_genres) / (len(recs) * recs['genres'].str.split().str.len().mean())
                diversity_scores.append(genre_diversity)
        
        diversity = np.mean(diversity_scores)
        self.metrics['diversity'] = diversity
        
        return diversity

    def calculate_novelty(self, sample_size=100):
        print("Calculating novelty...")
        novelty_scores = []
        
        item_popularity = self.ratings_df['movieId'].value_counts()
        total_ratings = len(self.ratings_df)
        item_popularity = item_popularity / total_ratings
        
        sample_movies = self.movies_df['title'].sample(
            n=min(sample_size, len(self.movies_df)),
            random_state=42
        )
        
        for movie_title in sample_movies:
            recs = self.recommender.get_recommendations(movie_title)
            if not recs.empty:
                rec_movies = self.movies_df[
                    self.movies_df['title'].isin(recs['title'])
                ]['movieId']
                nov_scores = [-np.log2(item_popularity.get(mid, 1/total_ratings)) 
                            for mid in rec_movies]
                novelty_scores.append(np.mean(nov_scores))
        
        novelty = np.mean(novelty_scores)
        self.metrics['novelty'] = novelty
        
        return novelty
    
    def train_test_split(self, test_size=0.2, random_state=42):
        return train_test_split(
            self.ratings_df,
            test_size=test_size,
            random_state=random_state,
            stratify=self.ratings_df['userId']
        )

    def calculate_ranking_metrics(self, test_df, k=10):
        print("Calculating ranking metrics...")
        precision_at_k = []
        recall_at_k = []
        mrr_scores = []
        ndcg_scores = []

        test_users = test_df['userId'].unique()[:100]

        for user_id in test_users:
            user_ratings = test_df[test_df['userId'] == user_id]
            relevant_movies = set(user_ratings[user_ratings['rating'] >= 4]['movieId'])

            if not relevant_movies:
                continue

            recommended_movies = []
            for _, row in user_ratings.iterrows():
                movie_title = self.movies_df[self.movies_df['movieId'] == row['movieId']]['title'].iloc[0]
                recs = self.recommender.get_recommendations(movie_title, k)

                if not recs.empty:
                    recommended_movies.extend(recs['title'].tolist())

            recommended_movie_ids = self.movies_df[self.movies_df['title'].isin(recommended_movies)]['movieId'].tolist()

            hits = len(set(recommended_movie_ids) & relevant_movies)

            precision = hits / k if k > 0 else 0
            recall = hits / len(relevant_movies) if relevant_movies else 0

            precision_at_k.append(precision)
            recall_at_k.append(recall)

            reciprocal_rank = 0.0
            for rank, movie_id in enumerate(recommended_movie_ids):
                if movie_id in relevant_movies:
                    reciprocal_rank = 1 / (rank + 1)
                    break
            mrr_scores.append(reciprocal_rank)

            dcg = 0.0
            idcg = sum([1 / np.log2(i + 2) for i in range(min(len(relevant_movies), k))])
            for rank, movie_id in enumerate(recommended_movie_ids[:k]):
                if movie_id in relevant_movies:
                    dcg += 1 / np.log2(rank + 2)
            ndcg = dcg / idcg if idcg > 0 else 0
            ndcg_scores.append(ndcg)

        average_precision = np.mean(precision_at_k) if precision_at_k else 0
        average_recall = np.mean(recall_at_k) if recall_at_k else 0
        average_mrr = np.mean(mrr_scores) if mrr_scores else 0
        average_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0

        self.metrics['precision@k'] = average_precision
        self.metrics['recall@k'] = average_recall
        self.metrics['mrr'] = average_mrr
        self.metrics['ndcg@k'] = average_ndcg

        return self.metrics

    def evaluate_all(self):
        print("Starting comprehensive evaluation...")

        train_df, test_df = self.train_test_split()

        rmse, mae = self.calculate_rating_metrics(test_df)
        coverage = self.calculate_coverage()
        diversity = self.calculate_diversity()
        novelty = self.calculate_novelty()

        ranking_metrics = self.calculate_ranking_metrics(test_df)

        print("\nEvaluation Results:")
        print(f"RMSE: {rmse:.3f}")
        print(f"MAE: {mae:.3f}")
        print(f"Catalog Coverage: {coverage:.3f}")
        print(f"Recommendation Diversity: {diversity:.3f}")
        print(f"Novelty Score: {novelty:.3f}")
        print(f"Precision@K: {ranking_metrics['precision@k']:.3f}")
        print(f"Recall@K: {ranking_metrics['recall@k']:.3f}")
        print(f"MRR: {ranking_metrics['mrr']:.3f}")
        print(f"NDCG@K: {ranking_metrics['ndcg@k']:.3f}")

        return self.metrics