In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
import time
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from multiprocessing.managers import SyncManager as Manager
from functools import partial
import warnings
warnings.filterwarnings('ignore')

class ProgressTracker:
    def __init__(self, total, desc):
        self.pbar = tqdm(total=total, desc=desc, position=0)
    
    def update(self):
        self.pbar.update(1)
    
    def close(self):
        self.pbar.close()
        
class MovieRecommender:
    def __init__(self):
        self.users_df = None
        self.movies_df = None
        self.ratings_df = None
        self.user_movie_matrix = None
        
    def load_data(self):
        """
        Load the MovieLens 1M dataset
        """
        print("\n=== Loading MovieLens 1M Dataset ===")
        
        print("Loading ratings data...")
        self.ratings_df = pd.read_csv('ratings.dat', 
                                    sep='::', 
                                    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
                                    engine='python')
        print(f"✓ Loaded {len(self.ratings_df)} ratings")
        
        print("Loading movies data...")
        self.movies_df = pd.read_csv('movies.dat',
                                   sep='::',
                                   names=['MovieID', 'Title', 'Genres'],
                                   engine='python',
                                   encoding='ISO-8859-1')
        print(f"✓ Loaded {len(self.movies_df)} movies")
        
        print("Loading users data...")
        self.users_df = pd.read_csv('users.dat',
                                  sep='::',
                                  names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
                                  engine='python')
        print(f"✓ Loaded {len(self.users_df)} users")
        
        print("Creating user-movie rating matrix...")
        self.user_movie_matrix = self.ratings_df.pivot(
            index='UserID',
            columns='MovieID',
            values='Rating'
        ).fillna(0)
        print(f"✓ Created matrix with shape: {self.user_movie_matrix.shape}")
        
        return self.ratings_df, self.movies_df, self.users_df
    
    def calculate_similarity(self, user_id, similarity_metric='cosine'):
        """Calculate similarity between target user and all other users"""
        user_vector = self.user_movie_matrix.loc[user_id].values.reshape(1, -1)
        other_users = self.user_movie_matrix.values
        
        if similarity_metric == 'cosine':
            return cosine_similarity(user_vector, other_users)[0]
        elif similarity_metric == 'pearson':
            return np.array([pearsonr(user_vector[0], other_user)[0] 
                           for other_user in other_users])
    
    def predict_ratings(self, user_id, k=5, similarity_metric='cosine'):
        """Predict ratings for all movies for a given user using kNN"""
        similarities = self.calculate_similarity(user_id, similarity_metric)
        user_index = self.user_movie_matrix.index.get_loc(user_id)
        similarities[user_index] = -np.inf
        neighbor_indices = np.argsort(similarities)[-k:]
        
        neighbor_ratings = self.user_movie_matrix.iloc[neighbor_indices]
        weights = similarities[neighbor_indices].reshape(-1, 1)
        weighted_ratings = neighbor_ratings * weights
        predicted_ratings = weighted_ratings.sum(axis=0) / np.abs(weights).sum()
        
        return predicted_ratings

    def evaluate_rmse(self, test_user_id, k=5, similarity_metric='cosine'):
        """Calculate RMSE for a test user"""
        actual_ratings = self.user_movie_matrix.loc[test_user_id]
        predicted_ratings = self.predict_ratings(test_user_id, k, similarity_metric)
        
        rated_movies = actual_ratings > 0
        rmse = np.sqrt(mean_squared_error(
            actual_ratings[rated_movies],
            predicted_ratings[rated_movies]
        ))
        
        return rmse

def evaluate_users_parallel(args):
    """Helper function for parallel processing of user evaluation"""
    test_users, recommender, k, similarity_metric = args
    rmse_values = []
    
    # Calculate RMSE for each user
    for user_id in test_users:
        rmse = recommender.evaluate_rmse(user_id, k, similarity_metric)
        rmse_values.append(rmse)
            
    return np.mean(rmse_values)

class ParallelMovieRecommender(MovieRecommender):
    def find_optimal_k(self, k_values=None, n_folds=5, similarity_metric='cosine', n_jobs=None):
        """
        Find optimal k using parallel k-fold cross-validation with progress bars
        """
        print(f"\n=== Finding Optimal k using {n_folds}-fold Cross-validation (Parallel) ===")
        print(f"Similarity metric: {similarity_metric}")
        
        if k_values is None:
            k_values = [5, 10, 20, 30, 40, 50, 75, 100]
        
        if n_jobs is None:
            n_jobs = cpu_count()
            
        print(f"Testing k values: {k_values}")
        print(f"Using {n_jobs} parallel processes")
        
        # Initialize KFold
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        user_ids = self.user_movie_matrix.index.values
        
        results = []
        # Create progress bars for k values
        k_pbar = tqdm(k_values, desc="Overall Progress", position=0)
        
        with Pool(processes=n_jobs) as pool:
            for k in k_pbar:
                start_time = time.time()
                k_pbar.set_description(f"Evaluating k={k}")
                fold_rmses = []
                
                # Create progress bar for folds
                fold_pbar = tqdm(total=n_folds, desc="Folds Progress", position=1, leave=False)
                
                for fold, (train_idx, test_idx) in enumerate(kf.split(user_ids), 1):
                    test_users = user_ids[test_idx]
                    
                    # Split test users into chunks
                    n_chunks = n_jobs
                    chunk_size = max(1, len(test_users) // n_chunks)
                    user_chunks = [test_users[i:i + chunk_size] for i in range(0, len(test_users), chunk_size)]
                    
                    # Create progress bar for users
                    user_pbar = tqdm(total=len(user_chunks), 
                                   desc="Users Progress", 
                                   position=2, 
                                   leave=False)
                    
                    # Prepare arguments for parallel processing
                    chunk_args = [
                        (chunk, self, k, similarity_metric)
                        for chunk in user_chunks
                    ]
                    
                    # Process chunks in parallel with progress tracking
                    chunk_rmses = []
                    for chunk_arg in chunk_args:
                        result = evaluate_users_parallel(chunk_arg)
                        chunk_rmses.append(result)
                        user_pbar.update(1)
                    
                    fold_rmse = np.mean(chunk_rmses)
                    fold_rmses.append(fold_rmse)
                    
                    # Update progress bars
                    fold_pbar.update(1)
                    user_pbar.close()
                
                avg_rmse = np.mean(fold_rmses)
                std_rmse = np.std(fold_rmses)
                
                elapsed_time = time.time() - start_time
                fold_pbar.close()
                
                # Update progress bar description with results
                k_pbar.set_postfix({
                    'RMSE': f'{avg_rmse:.4f}±{std_rmse:.4f}',
                    'Time': f'{elapsed_time:.2f}s'
                })
                
                results.append({
                    'k': k,
                    'avg_rmse': avg_rmse,
                    'std_rmse': std_rmse
                })
        
        return pd.DataFrame(results)

def evaluate_knn_parameters_parallel(n_jobs=None):
    """
    Evaluate different k values and similarity metrics in parallel with progress tracking
    """
    print("\n====================================")
    print("Starting Parallel KNN Parameter Evaluation")
    print("====================================")
    
    recommender = ParallelMovieRecommender()
    recommender.load_data()
    
    k_values = [5, 10, 20, 30, 40, 50, 75, 100]
    metrics = ['cosine', 'pearson']
    
    # Create progress bar for metrics
    metric_pbar = tqdm(metrics, desc="Metrics Progress", position=0)
    results = []
    
    for metric in metric_pbar:
        metric_pbar.set_description(f"Evaluating {metric} similarity")
        metric_results = recommender.find_optimal_k(
            k_values=k_values,
            n_folds=5,
            similarity_metric=metric,
            n_jobs=n_jobs
        )
        metric_results['metric'] = metric
        results.append(metric_results)
    
    return pd.concat(results, ignore_index=True)

if __name__ == "__main__":
    print("\nParallel Movie Recommendation System - Parameter Optimization")
    print("=" * 60)
    
    # Use 80% of available CPU cores by default
    n_jobs = max(1, int(cpu_count() * 0.8))
    print(f"\nUsing {n_jobs} parallel processes")
    
    start_time = time.time()
    
    # Create overall progress tracker
    with tqdm(total=1, desc="Overall Progress", position=0) as pbar:
        results = evaluate_knn_parameters_parallel(n_jobs=n_jobs)
        pbar.update(1)
    
    print("\n=== Final Results ===")
    print("\nDetailed Results:")
    print(results.to_string(index=False))
    
    # Find best parameters
    best_result = results.loc[results['avg_rmse'].idxmin()]
    print("\nBest Configuration Found:")
    print("-" * 30)
    print(f"k: {best_result['k']}")
    print(f"Similarity metric: {best_result['metric']}")
    print(f"RMSE: {best_result['avg_rmse']:.4f} ± {best_result['std_rmse']:.4f}")
    
    total_time = time.time() - start_time
    print(f"\nTotal execution time: {format_time(total_time)}")




Parallel Movie Recommendation System - Parameter Optimization

Using 6 parallel processes


Overall Progress:   0%|                                                                          | 0/1 [00:00<?, ?it/s]


Starting Parallel KNN Parameter Evaluation

=== Loading MovieLens 1M Dataset ===
Loading ratings data...
✓ Loaded 1000209 ratings
Loading movies data...
✓ Loaded 3883 movies
Loading users data...
✓ Loaded 6040 users
Creating user-movie rating matrix...
✓ Created matrix with shape: (6040, 3706)


Evaluating cosine similarity:   0%|                                                              | 0/2 [00:00<?, ?it/s]


=== Finding Optimal k using 5-fold Cross-validation (Parallel) ===
Similarity metric: cosine
Testing k values: [5, 10, 20, 30, 40, 50, 75, 100]
Using 6 parallel processes


Evaluating k=5:   0%|                                                                            | 0/8 [00:00<?, ?it/s]
Folds Progress:   0%|                                                                            | 0/5 [00:00<?, ?it/s][A

Users Progress:   0%|                                                                            | 0/7 [00:00<?, ?it/s][A[A

Users Progress:  14%|█████████▋                                                          | 1/7 [00:39<03:56, 39.49s/it][A[A

Users Progress:  29%|███████████████████▍                                                | 2/7 [01:18<03:15, 39.12s/it][A[A

Users Progress:  43%|█████████████████████████████▏                                      | 3/7 [01:56<02:35, 38.88s/it][A[A

Users Progress:  57%|██████████████████████████████████████▊                             | 4/7 [02:35<01:56, 38.87s/it][A[A

Users Progress:  71%|████████████████████████████████████████████████▌                   | 5/7 [03:14<01:17, 38.93s/it][


=== Finding Optimal k using 5-fold Cross-validation (Parallel) ===
Similarity metric: pearson
Testing k values: [5, 10, 20, 30, 40, 50, 75, 100]
Using 6 parallel processes


Evaluating k=5:   0%|                                                                            | 0/8 [00:00<?, ?it/s]
Folds Progress:   0%|                                                                            | 0/5 [00:00<?, ?it/s][A

Users Progress:   0%|                                                                            | 0/7 [00:00<?, ?it/s][A[A

Users Progress:  14%|█████████▋                                                          | 1/7 [01:04<06:25, 64.20s/it][A[A

Users Progress:  29%|███████████████████▍                                                | 2/7 [02:08<05:22, 64.52s/it][A[A

Users Progress:  43%|█████████████████████████████▏                                      | 3/7 [03:13<04:18, 64.60s/it][A[A

Users Progress:  57%|██████████████████████████████████████▊                             | 4/7 [04:18<03:14, 64.75s/it][A[A

Users Progress:  71%|████████████████████████████████████████████████▌                   | 5/7 [05:23<02:09, 64.86s/it][


=== Final Results ===

Detailed Results:
  k  avg_rmse  std_rmse  metric
  5  2.333323  0.020899  cosine
 10  2.280480  0.022589  cosine
 20  2.273451  0.024451  cosine
 30  2.277733  0.023065  cosine
 40  2.284900  0.022154  cosine
 50  2.291184  0.021338  cosine
 75  2.302243  0.020110  cosine
100  2.311641  0.018711  cosine
  5  2.423173  0.012596 pearson
 10  2.385874  0.014499 pearson
 20  2.385730  0.016490 pearson
 30  2.390709  0.017191 pearson
 40  2.394509  0.013667 pearson
 50  2.401580  0.014129 pearson
 75  2.414216  0.012635 pearson
100  2.426296  0.010834 pearson

Best Configuration Found:
------------------------------
k: 20
Similarity metric: cosine
RMSE: 2.2735 ± 0.0245


NameError: name 'format_time' is not defined

In [15]:
import matplotlib
  # Use non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# First, try to fix the environment
import matplotlib
matplotlib.rcParams['backend'] = 'Agg'

# Create DataFrame from your results
data = {
    'k': [5, 10, 20, 30, 40, 50, 75, 100, 5, 10, 20, 30, 40, 50, 75, 100],
    'avg_rmse': [2.333323, 2.280480, 2.273451, 2.277733, 2.284900, 2.291184, 2.302243, 2.311641,
                 2.423173, 2.385874, 2.385730, 2.390709, 2.394509, 2.401580, 2.414216, 2.426296],
    'std_rmse': [0.020899, 0.022589, 0.024451, 0.023065, 0.022154, 0.021338, 0.020110, 0.018711,
                 0.012596, 0.014499, 0.016490, 0.017191, 0.013667, 0.014129, 0.012635, 0.010834],
    'metric': ['cosine']*8 + ['pearson']*8
}

results_df = pd.DataFrame(data)

def save_all_plots():
    # Set style
    sns.set_style("whitegrid")
    colors = ['#2E86C1', '#E74C3C', '#2ECC71', '#F39C12']

    # 1. Line Plot with Error Bands
    plt.figure(figsize=(12, 6))
    for i, metric in enumerate(['cosine', 'pearson']):
        metric_data = results_df[results_df['metric'] == metric]
        plt.plot(metric_data['k'], metric_data['avg_rmse'], 
                marker='o', linewidth=2, label=f'{metric.capitalize()} Similarity',
                color=colors[i])
        
        plt.fill_between(metric_data['k'],
                        metric_data['avg_rmse'] - metric_data['std_rmse'],
                        metric_data['avg_rmse'] + metric_data['std_rmse'],
                        alpha=0.2, color=colors[i])

    plt.title('RMSE vs k for Different Similarity Metrics', fontsize=14, pad=20)
    plt.xlabel('Number of Neighbors (k)', fontsize=12)
    plt.ylabel('RMSE', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('knn_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 2. Heatmap
    plt.figure(figsize=(12, 4))
    pivot_data = results_df.pivot(index='metric', columns='k', values='avg_rmse')
    sns.heatmap(pivot_data, annot=True, fmt='.3f', cmap='YlOrRd_r',
               cbar_kws={'label': 'RMSE'})
    plt.title('RMSE Heatmap: Performance Across k Values and Metrics', 
             fontsize=14, pad=20)
    plt.xlabel('Number of Neighbors (k)', fontsize=12)
    plt.ylabel('Similarity Metric', fontsize=12)
    plt.tight_layout()
    plt.savefig('knn_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 3. Stability Analysis
    plt.figure(figsize=(12, 6))
    for i, metric in enumerate(['cosine', 'pearson']):
        metric_data = results_df[results_df['metric'] == metric]
        plt.plot(metric_data['k'], metric_data['std_rmse'], 
                marker='s', linewidth=2, label=f'{metric.capitalize()} Similarity',
                color=colors[i])

    plt.title('Model Stability Analysis: Standard Deviation Across k Values', 
             fontsize=14, pad=20)
    plt.xlabel('Number of Neighbors (k)', fontsize=12)
    plt.ylabel('Standard Deviation of RMSE', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('knn_stability.png', dpi=300, bbox_inches='tight')
    plt.close()

    # 4. Improvement Plot
    plt.figure(figsize=(12, 6))
    for i, metric in enumerate(['cosine', 'pearson']):
        metric_data = results_df[results_df['metric'] == metric]
        baseline_rmse = metric_data['avg_rmse'].max()
        improvement = ((baseline_rmse - metric_data['avg_rmse']) / baseline_rmse) * 100
        
        plt.plot(metric_data['k'], improvement, 
                marker='D', linewidth=2, label=f'{metric.capitalize()} Similarity',
                color=colors[i])

    plt.title('Performance Improvement Over Baseline', 
             fontsize=14, pad=20)
    plt.xlabel('Number of Neighbors (k)', fontsize=12)
    plt.ylabel('Improvement (%)', fontsize=12)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('knn_improvement.png', dpi=300, bbox_inches='tight')
    plt.close()

    # Create and save summary statistics
    summary_stats = pd.DataFrame({
        'Metric': ['Cosine', 'Pearson'],
        'Best RMSE': [
            f"{results_df[results_df['metric'] == 'cosine']['avg_rmse'].min():.4f}",
            f"{results_df[results_df['metric'] == 'pearson']['avg_rmse'].min():.4f}"
        ],
        'Optimal k': [
            results_df[results_df['metric'] == 'cosine']['avg_rmse'].idxmin(),
            results_df[results_df['metric'] == 'pearson']['avg_rmse'].idxmin()
        ],
        'Std Dev': [
            f"{results_df[results_df['metric'] == 'cosine']['std_rmse'].mean():.4f}",
            f"{results_df[results_df['metric'] == 'pearson']['std_rmse'].mean():.4f}"
        ]
    })
    
    summary_stats.to_csv('knn_summary_stats.csv', index=False)

    # Print analysis
    print("\nKNN Analysis Results")
    print("=" * 50)
    print("\nBest Overall Configuration:")
    best_idx = results_df['avg_rmse'].idxmin()
    print(f"• Similarity Metric: {results_df.loc[best_idx, 'metric']}")
    print(f"• k Value: {results_df.loc[best_idx, 'k']}")
    print(f"• RMSE: {results_df.loc[best_idx, 'avg_rmse']:.4f} ± {results_df.loc[best_idx, 'std_rmse']:.4f}")

    print("\nPerformance by Metric:")
    for metric in ['cosine', 'pearson']:
        metric_data = results_df[results_df['metric'] == metric]
        print(f"\n{metric.capitalize()} Similarity:")
        print(f"• Best k: {metric_data.loc[metric_data['avg_rmse'].idxmin(), 'k']}")
        print(f"• RMSE Range: {metric_data['avg_rmse'].min():.4f} - {metric_data['avg_rmse'].max():.4f}")
        print(f"• Average Stability (std): {metric_data['std_rmse'].mean():.4f}")

    print("\nFiles saved:")
    print("• knn_comparison.png - RMSE vs k comparison")
    print("• knn_heatmap.png - Performance heatmap")
    print("• knn_stability.png - Stability analysis")
    print("• knn_improvement.png - Improvement over baseline")
    print("• knn_summary_stats.csv - Summary statistics")

# Run the analysis and save all plots
save_all_plots()

ImportError: cannot import name '_png' from 'matplotlib' (C:\ProgramData\Anaconda3\envs\ct-env\lib\site-packages\matplotlib\__init__.py)

ImportError: cannot import name '_png' from 'matplotlib' (C:\ProgramData\Anaconda3\envs\ct-env\lib\site-packages\matplotlib\__init__.py)

<Figure size 864x432 with 1 Axes>

In [10]:
# !pip install seaborn
!pip install matplotlib

