In [None]:
import pandas as pd

## Data Processing 

In [None]:
# Ratings file: tab-separated, read with header=None and explicit column names
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Movies file: pipe-separated, latin-1 encoded.
# NOTE(review): only 21 column names are supplied. If u.item has more fields
# than that, pandas turns the leading surplus columns into the index, which the
# later `movies.index.get_level_values(0)` lookups appear to rely on. Confirm
# against the actual file layout before renaming anything here.
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=['item_id', 'title', *(f'genre_{i}' for i in range(19))],
)

# Users file: pipe-separated, read with header=None and explicit column names
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip_code'],
)

In [None]:
# Display the users table to inspect what was loaded
users

In [None]:
# Count ratings per user, then keep only the ratings of users with more than 200
user_interactions = ratings.groupby('user_id').size()
user_indexes = user_interactions.loc[user_interactions.gt(200)].index
ratings = ratings.loc[ratings['user_id'].isin(user_indexes)]
ratings

In [None]:
# Keep only movies whose id (first index level) still appears in the filtered
# ratings, then drop every column that contains a missing value.
movies = (
    movies
    .loc[movies.index.get_level_values(0).isin(ratings['item_id'])]
    .dropna(axis='columns')
)
movies

## Results

1- Add more (preferably more complex) recommendation strategies and models

2- Extend the analysis to include recommendation metrics that offer a different perspective (think about whether accuracy is really the most important thing for us)

In [None]:
from sklearn.model_selection import train_test_split
import random 

# Pick the user whose data is used for hyperparameter tuning. A dedicated,
# seeded generator keeps the choice identical across kernel restarts (matching
# the fixed random_state=42 used for the splits and the MLP) without touching
# the global `random` state.
test_user = random.Random(42).choice(ratings.user_id.unique())

def get_data_for_user(ratings, movies, user_id):
    """Build a train/test split from the movies rated by one user.

    The rows of ``ratings`` belonging to ``user_id`` are joined to the matching
    rows of ``movies`` (first index level of ``movies`` against ``item_id``).
    The ``item_id``, ``user_id`` and ``timestamp`` columns are dropped; the
    remaining non-``rating`` columns are the features and ``rating`` is the
    target.

    Returns:
        Whatever ``train_test_split`` returns for a 30% test share and
        ``random_state=42``; callers unpack it as
        ``X_train, X_test, y_train, y_test``.
    """
    user_ratings = ratings.loc[ratings['user_id'] == user_id]

    # Restrict the movie table to titles this user has rated
    rated_by_user = movies.index.get_level_values(0).isin(user_ratings['item_id'])
    user_movies = movies.loc[rated_by_user]

    # Join movie columns to the user's ratings on the movie id
    joined = user_movies.merge(
        user_ratings,
        left_on=user_movies.index.get_level_values(0),
        right_on='item_id',
    )
    dataset = joined.drop(columns=['item_id', 'user_id', 'timestamp'])

    features = dataset.drop(columns='rating')
    target = dataset['rating']
    return train_test_split(features, target, test_size=0.3, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Tune every candidate classifier on the data of the sampled user
X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, test_user)

# Each entry pairs an unfitted estimator with the hyperparameter grid to search
param_grid = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]},
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (100, 50), (150,), (150, 75)],
            'activation': ['relu'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.001, 0.01],
        },
    },
}

# Run a 5-fold, accuracy-scored grid search per model and keep its best estimator
best_models = {}
for model_name, model_info in param_grid.items():
    grid_search = GridSearchCV(
        estimator=model_info['model'],
        param_grid=model_info['params'],
        cv=5,
        scoring='accuracy',
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

best_models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from surprise import SVD, NMF, Dataset, Reader
from surprise.model_selection import GridSearchCV as SurpriseGridSearch, cross_validate
from xgboost import XGBClassifier

# Re-create the split for the sampled user (same fixed random_state as before)
X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, test_user)

# Search space per model. scikit-learn style entries hold an estimator
# instance; the Surprise entries (SVD, NMF) hold the algorithm class itself.
param_grid = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    },
    'RandomForest': {
        'model': RandomForestClassifier(),
        'params': {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]},
    },
    'DecisionTree': {
        'model': DecisionTreeClassifier(),
        'params': {'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=1000, random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (100, 50), (150,), (150, 75)],
            'activation': ['relu'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.001, 0.01],
        },
    },
    'SVD': {
        'model': SVD,
        'params': {
            'n_factors': [50, 100, 150],
            'n_epochs': [20, 50],
            'lr_all': [0.005, 0.01],
        },
    },
    'NMF': {
        'model': NMF,
        'params': {'n_factors': [10, 20], 'n_epochs': [20, 50]},
    },
    # NOTE(review): recent xgboost releases reportedly reject class labels that
    # do not start at 0. Confirm this classifier fits on the raw rating values.
    'XGBoost': {
        'model': XGBClassifier(),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.1, 0.01],
            'max_depth': [3, 5, 10],
        },
    },
}

# Perform grid search for Scikit-learn and Surprise models.
# Surprise cannot be fitted on a pandas DataFrame: wrap the (user, item, rating)
# columns, in that order, in a Surprise Dataset once up front. The rating scale
# is taken from the data itself.
surprise_data = Dataset.load_from_df(
    ratings[['user_id', 'item_id', 'rating']],
    Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max())),
)

best_models = {}
for model_name, model_info in param_grid.items():
    model = model_info['model']
    params = model_info['params']
    print(f"Evaluating {model_name}...")
    
    if model_name in ['SVD', 'NMF']:
        # Surprise's GridSearchCV takes the algorithm *class* as its first
        # argument and must be fitted on a Surprise Dataset.
        gs = SurpriseGridSearch(model, params, measures=['rmse'], cv=5)
        gs.fit(surprise_data)
        best_params = gs.best_params['rmse']
        best_score = gs.best_score['rmse']
        # Store an unfitted algorithm configured with the winning parameters
        best_models[model_name] = (model(**best_params), best_score)
        print(f"Best RMSE for {model_name}: {best_score}, Params: {best_params}")
    
    else:
        # Perform Grid Search for Scikit-learn compatible models
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_models[model_name] = (grid_search.best_estimator_, grid_search.best_score_)
        print(f"Best model for {model_name}: {grid_search.best_estimator_}, Score: {grid_search.best_score_}")

# Summarise the winning configuration of every model. Entries are
# (model, score) pairs; Surprise models report RMSE, the rest report accuracy.
print("\nBest Models and Scores:")
for model_name, model_info in best_models.items():
    print(
        f"{model_name}: Best RMSE = {model_info[1]}, Params = {model_info[0]}"
        if model_name in ('SVD', 'NMF')
        else f"{model_name}: Best Accuracy = {model_info[1]}, Best Estimator = {model_info[0]}"
    )

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def user_based_cf(ratings_matrix, target_user, top_k=5):
    """
    User-Based Collaborative Filtering.

    Scores every item by the similarity-weighted sum of the ratings given by
    the ``top_k`` users most similar (cosine similarity) to the target user,
    then returns the best-scoring items the target user has not rated yet.

    Parameters:
        ratings_matrix (array-like): User-Item rating matrix (users as rows,
            items as columns). Entries that are not > 0 count as "not rated".
        target_user (int): Row index of the target user for recommendations.
        top_k (int): Used both as the number of similar users to consider and
            as the maximum number of items to recommend.

    Returns:
        recommendations (ndarray): Indices of recommended items, best first.
            Holds fewer than ``top_k`` entries when the target user has fewer
            than ``top_k`` unrated items; already-rated items are never
            returned.
    """
    ratings_matrix = np.asarray(ratings_matrix)

    # Only the target user's row of the similarity matrix is needed
    target_user_similarities = cosine_similarity(
        ratings_matrix[[target_user]], ratings_matrix
    )[0]

    # Rank the *other* users by similarity. The target is removed explicitly
    # instead of assuming it sorts first, which ties or rounding could break.
    other_users = np.delete(np.arange(ratings_matrix.shape[0]), target_user)
    neighbour_order = np.argsort(-target_user_similarities[other_users], kind='stable')
    similar_users = other_users[neighbour_order][:top_k]

    # Similarity-weighted sum of the neighbours' ratings, one score per item
    scores = target_user_similarities[similar_users] @ ratings_matrix[similar_users]

    # Rank only the items the target user has not rated, so a rated item can
    # never be returned even when fewer than top_k candidates remain.
    unrated_items = np.flatnonzero(~(ratings_matrix[target_user] > 0))
    ranked = unrated_items[np.argsort(-scores[unrated_items], kind='stable')]
    return ranked[:top_k]

def item_based_cf(ratings_matrix, target_user, top_k=5):
    """
    Item-Based Collaborative Filtering.

    Scores every item by the sum, over the items the target user has rated, of
    ``rating * cosine_similarity(rated_item, item)``, then returns the
    best-scoring items the target user has not rated yet.

    Parameters:
        ratings_matrix (array-like): User-Item rating matrix (users as rows,
            items as columns). Entries that are not > 0 count as "not rated".
        target_user (int): Row index of the target user for recommendations.
        top_k (int): Maximum number of items to recommend.

    Returns:
        recommendations (ndarray): Indices of recommended items, best first.
            Holds fewer than ``top_k`` entries when the target user has fewer
            than ``top_k`` unrated items; already-rated items are never
            returned.
    """
    ratings_matrix = np.asarray(ratings_matrix)

    # Compute cosine similarity between items (columns of the rating matrix)
    item_similarity = cosine_similarity(ratings_matrix.T)

    # Ratings from the target user and which of them count as "rated"
    user_ratings = ratings_matrix[target_user]
    already_rated = user_ratings > 0

    # One vector-matrix product replaces the per-item Python loop: unrated
    # entries get weight 0 and therefore contribute nothing.
    weights = np.where(already_rated, user_ratings, 0)
    scores = weights @ item_similarity

    # Rank only the items the target user has not rated, so a rated item can
    # never be returned even when fewer than top_k candidates remain.
    unrated_items = np.flatnonzero(~already_rated)
    ranked = unrated_items[np.argsort(-scores[unrated_items], kind='stable')]
    return ranked[:top_k]

# Toy user-item matrix: one row per user, one column per item.
# The CF functions treat entries that are not > 0 as "not rated".
ratings_matrix = np.array(
    [
        [4, 0, 0, 5, 1],
        [5, 5, 4, 0, 0],
        [0, 0, 0, 3, 4],
        [3, 4, 0, 0, 2],
        [0, 3, 4, 5, 0],
    ]
)

# Row index of the user to recommend for
target_user = 0

# Run both strategies first, then report them in the same order as before
user_cf_recommendations = user_based_cf(ratings_matrix, target_user, top_k=3)
item_cf_recommendations = item_based_cf(ratings_matrix, target_user, top_k=3)

print(f"User-Based CF Recommendations for user {target_user}: {user_cf_recommendations}")
print(f"Item-Based CF Recommendations for user {target_user}: {item_cf_recommendations}")



In [None]:
# Evaluate every tuned model on each user's own train/test split.
#
# Depending on which tuning cell ran last, `best_models` holds either bare
# estimators or (estimator, score) tuples, so tuples are unpacked first. Models
# without a `score` method (presumably the Surprise algorithms, which are not
# fitted on X/y feature tables) are skipped rather than crashing the loop.
evaluable_models = {}
for model_name, model in best_models.items():
    estimator = model[0] if isinstance(model, tuple) else model
    if hasattr(estimator, 'score'):
        evaluable_models[model_name] = estimator
    else:
        print(f"Skipping {model_name}: no scikit-learn style score() method")

# One list of per-user test scores per evaluable model. Keeping only evaluable
# models means every list ends up with the same length.
results = {model_name: [] for model_name in evaluable_models}

for user in ratings.user_id.unique():
    X_train, X_test, y_train, y_test = get_data_for_user(ratings, movies, user)

    for model_name, model in evaluable_models.items():
        model.fit(X_train, y_train)
        # Score once per (user, model); the value is the estimator's default score
        results[model_name].append(model.score(X_test, y_test))


In [None]:
# Show the raw mapping of model name -> list of per-user test scores
results

In [None]:
# One column per model; report the mean score of each column
results_df = pd.DataFrame(data=results)
results_df.mean(axis=0)