In [44]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import scipy as sp
from scipy.special import logsumexp
from sklearn.feature_extraction.text import TfidfVectorizer #ContentFiltering?
from sklearn.metrics.pairwise import cosine_similarity 
from surprise import Reader, Dataset, SVD, SVDpp, NMF, SlopeOne, CoClustering
from scipy.sparse import csr_matrix


from surprise import KNNBasic
from surprise import KNNWithMeans

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from surprise import accuracy
from time import time

from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
import heapq


# Libraries used during sorting procedures.
import heapq # <-- Efficient sorting of large lists
import operator # <-- Convienient item retrieval during iteration 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Import the train and test sets from the competition's input directory.
# Later cells read userId / movieId / rating / timestamp from train_df and
# userId / movieId from test_df.
train_df=pd.read_csv("/kaggle/input/edsa-movie-recommender-challenge-2022/train.csv")
test_df=pd.read_csv("/kaggle/input/edsa-movie-recommender-challenge-2022/test.csv")

In [4]:
#Glance at train
train_df.head()

In [5]:
#Glance at test
test_df.head()

In [6]:
# Drop the timestamp column, which none of the models below use.
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
train_df = train_df.drop(columns=["timestamp"], errors="ignore")

In [7]:
# Reproducible 20,000-row random sample of the training ratings (fixed seed 2023);
# the plots and the hand-built collaborative-filtering matrices below use this sample.
sampled_df=train_df.sample(n=20000,random_state=2023)

In [47]:
# Bounds for predictions, i.e. the rating scale (0.5 to 5)
bounds = Reader(rating_scale=(0.5, 5))

# A sample of 20k data points used to validate performance before large-scale training
# on the full data set with the best-performing model: all rows are shuffled with a
# fixed seed, the first 20,000 are kept, and they are wrapped as a Surprise Dataset.
validation = Dataset.load_from_df(train_df[["userId", "movieId", "rating"]].sample(frac=1, random_state=2023)[:20000],bounds)

In [49]:
validation

In [8]:
# Count plot of the ratings in the 20k-row sample.
# `sns.factorplot` was renamed to `sns.catplot` and has since been removed, and
# current seaborn requires the x variable to be passed by keyword.
with sns.axes_style('white'):
    g = sns.catplot(x="rating", data=sampled_df, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")
# The mean is computed over the sample, not the full training set.
print (f'Average rating in dataset: {np.mean(sampled_df["rating"])}')

In [50]:
def train_test(algo, data, n_splits=5, random_state=None):
    """Cross-validate `algo` on `data` and return the RMSE of every fold.

    Parameters
    ----------
    algo : Surprise algorithm
        Object exposing `fit(trainset)` and `test(testset)`. It is refitted on
        each fold, so on return it is left fitted on the LAST fold's training
        split only, not on all of `data`.
    data : Surprise Dataset
        Ratings to split into folds.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to `KFold` so the fold assignment can be reproduced.
        None keeps the original unseeded behaviour.

    Returns
    -------
    list of float
        One RMSE per fold, in fold order. `accuracy.rmse` also prints each score.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state)
    list_rmse = []
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)

        list_rmse.append(accuracy.rmse(predictions))  # prints RMSE score
    return list_rmse



In [52]:
# SVD algorithm
svd = SVD()

# 5-fold cross-validation on the 20k validation sample (prints one RMSE per fold)
rsme_svd = train_test(svd, validation)

# train_test refits the model on every fold, so at this point `svd` has only seen
# the last fold's training split. Refit on every rating in the validation sample
# so the predictions made below are not based on a partial fit.
svd.fit(validation.build_full_trainset())

In [54]:
# Predict a rating for every (user, movie) pair of the test set with the fitted SVD model

# user ids and movie ids of the test set as parallel arrays
uid = test_df["userId"].to_numpy()
iid = test_df["movieId"].to_numpy()

# one single-element list per test row, holding that row's Surprise prediction
predictions = [[svd.predict(user, movie)] for user, movie in zip(uid, iid)]

# Kaggle submission format: key "<userId>_<movieId>" plus the estimated rating
Ids = [f"{pred.uid}_{pred.iid}" for (pred,) in predictions]
ratings = [pred.est for (pred,) in predictions]

In [55]:
# Submission table: one row per test pair, with its key and the raw SVD estimate
Out3_df = pd.DataFrame({'Id': Ids, 'rating': ratings})

# Write the submission to disk without the index column
Out3_df.to_csv('submission.csv', index=False)

# Preview the first rows
Out3_df.head()

In [56]:
# Snap every estimate to the rating scale: clamp to [0.5, 5.0], then round to the
# nearest 0.5. Done with vectorised pandas operations so this cell does not depend
# on the `myround` helper, which is only defined further down the notebook and is
# therefore unavailable when the notebook is run top to bottom.
Out3_df['rating'] = Out3_df['rating'].clip(lower=0.5, upper=5.0).mul(2).round().div(2)

In [58]:
# Output: write the current contents of Out3_df to a second file, submission3.csv,
# without the index column.
Out3_df.to_csv('submission3.csv', index=False)

# view submission file
Out3_df.head()

In [57]:
Out3_df.head()

In [9]:
# Utility matrix built from the 20k-row sample: one row per userId, one column per
# movieId, each cell the mean of the ratings recorded for that pair.
# NOTE: fill_value=0 stores pairs without any rating as 0, not NaN. Since the rating
# scale starts at 0.5, consumers of this matrix must read 0 as "unrated".
util_matrix = sampled_df.pivot_table(index=['userId'], 
                                       columns=['movieId'],
                                       values='rating',
                                    fill_value=0,
                                    aggfunc='mean') 

In [10]:
util_matrix 

In [11]:
# Movie x user view of the utility matrix
util_matrix_T = util_matrix.T
# Keep only the user columns that contain at least one non-zero entry
has_any_rating = (util_matrix_T != 0).any(axis=0)
util_matrix_T = util_matrix_T.loc[:, has_any_rating]
# Store the same values in scipy's compressed-sparse-row format
util_matrix_sparse = csr_matrix(util_matrix_T.values)

In [12]:
util_matrix_T

In [13]:
# User-user cosine similarity: the sparse matrix is movie x user, so transpose it
# to get one row per user before comparing rows
user_similarity = cosine_similarity(util_matrix_sparse.T)

# Label both axes with the user ids so similarities can be looked up by id
user_ids = util_matrix_T.columns
user_sim_df = pd.DataFrame(data=user_similarity, index=user_ids, columns=user_ids)

# Preview the first five rows of the similarity matrix
user_sim_df.iloc[:5]

In [14]:
# Item-item cosine similarity: rows of the sparse matrix are movies, so compare them directly
Item_similarity = cosine_similarity(util_matrix_sparse)

# Label both axes with the movie ids so similarities can be looked up by id
movie_ids = util_matrix.columns
Item_sim_df = pd.DataFrame(data=Item_similarity, index=movie_ids, columns=movie_ids)

# Preview the first five rows of the similarity matrix
Item_sim_df.iloc[:5]

In [15]:
def collab_generate_top_N_recommendations(userId, N=10, k=20):
    """Recommend up to N movies for `userId` from the favourites of similar users.

    Parameters
    ----------
    userId : user id to recommend for.
    N : int, default 10
        Maximum number of movie ids to return.
    k : int, default 20
        Number of most-similar users whose favourites are tallied.

    Returns
    -------
    list
        Movie ids ordered by how many of the k neighbours gave them their own
        highest rating. For a user missing from `user_sim_df` (cold start) the
        N movies with the highest mean rating in `train_df` are returned instead.
    """
    # Cold-start problem - no ratings given by the reference user.
    # With no further user data, fall back to the N movies with the highest
    # average rating in the full training set.
    if userId not in user_sim_df.columns:
        mean_rating = train_df.groupby('movieId')['rating'].mean()
        return mean_rating.sort_values(ascending=False).index[:N].to_list()

    # Rank only the reference user's similarity column (not the whole frame) and
    # drop the user by label: skipping "position 0" is wrong whenever another user
    # ties at similarity 1.0 and happens to sort ahead of the reference user.
    neighbour_sims = user_sim_df[userId].drop(labels=userId, errors='ignore')
    sim_users = neighbour_sims.sort_values(ascending=False).index[:k]

    most_common_favorites = {}  # <-- movie id -> number of neighbours who rated it highest
    for neighbour in sim_users:
        neighbour_ratings = util_matrix_T.loc[:, neighbour]
        # Highest rating this neighbour gave to any movie
        max_score = neighbour_ratings.max()
        # Every movie that received that highest rating counts as a favourite
        for item in neighbour_ratings.index[neighbour_ratings == max_score]:
            most_common_favorites[item] = most_common_favorites.get(item, 0) + 1

    # Sort by tally (stable, so ties keep first-seen order) and return the top-N ids
    sorted_list = sorted(most_common_favorites.items(), key=operator.itemgetter(1), reverse=True)[:N]
    return [item for item, _ in sorted_list]

In [16]:
collab_generate_top_N_recommendations(3)

In [17]:
sampled_df[sampled_df['userId'] == 3][:][['movieId','rating']].sort_values(by='rating', ascending=False)[:10]

In [18]:
#Function to round predictions to rating scale
def myround(x, base=0.5, low=0.5, high=5.0):
    """Clamp `x` to [low, high] and round it to the nearest multiple of `base`.

    Parameters
    ----------
    x : float
        Raw predicted rating.
    base : float, default 0.5
        Step of the rating scale.
    low, high : float, defaults 0.5 and 5.0
        Bounds of the rating scale; values outside are clamped to them.

    Returns
    -------
    float
        `low` if x < low, `high` if x > high, otherwise `base * round(x / base)`.
        NaN is returned unchanged instead of raising inside `round`.

    Notes
    -----
    Python's `round` rounds exact halves to the nearest even integer, so with
    the defaults 2.25 becomes 2.0 while 2.75 becomes 3.0.
    """
    # NaN compares False with everything and would crash `round`; pass it through.
    if np.isnan(x):
        return float('nan')
    if x < low:
        return low
    if x > high:
        return high
    return base * round(x / base)

In [19]:
def collab_generate_rating_estimate(movieId, userId, k=20, threshold=0.0):
    """Estimate the rating `userId` would give `movieId` (user-based filtering).

    The estimate is the similarity-weighted mean of the ratings that the k users
    most similar to `userId` gave to `movieId`, rounded to the rating scale.

    Parameters
    ----------
    movieId : movie id to predict for.
    userId : user id to predict for.
    k : int, default 20
        Number of most-similar users considered.
    threshold : float, default 0.0
        Neighbours with a similarity below this value are ignored.

    Returns
    -------
    float
        Predicted rating after `myround`. Fallbacks, in order: the movie's mean
        rating in `sampled_df` when no neighbour contributes or the user is
        unknown; the user's mean rating when the movie is unknown; the global
        mean of `train_df` when both are unknown.
    """
    movie_known = movieId in util_matrix.columns
    user_known = userId in user_sim_df.columns

    if movie_known and user_known:
        # Rank only the reference user's similarity column, once, and remove the
        # user by label so a tie at similarity 1.0 cannot keep them in the list.
        neighbour_sims = (user_sim_df[userId]
                          .drop(labels=userId, errors='ignore')
                          .sort_values(ascending=False)
                          .iloc[:k])
        # Ratings the neighbours gave to the reference movie
        neighbour_ratings = util_matrix.loc[neighbour_sims.index, movieId]
        # The utility matrix stores missing ratings as 0 (outside the 0.5-5 scale),
        # so 0 - like NaN - means "has not rated" and must not enter the average.
        usable = (neighbour_ratings.notna()
                  & neighbour_ratings.ne(0)
                  & neighbour_sims.ge(threshold))
        weights = neighbour_sims[usable]
        weight_total = weights.sum()
        if weight_total > 0:
            # Similarity-weighted mean of the neighbours' ratings
            predicted_rating = (neighbour_ratings[usable] * weights).sum() / weight_total
        else:
            # No usable neighbour: fall back to the movie's average rating
            predicted_rating = sampled_df.loc[sampled_df['movieId'] == movieId, 'rating'].mean()
    elif user_known:
        # Unknown movie: use the user's own average rating
        predicted_rating = sampled_df.loc[sampled_df['userId'] == userId, 'rating'].mean()
    elif movie_known:
        # Unknown user: use the movie's average rating
        predicted_rating = sampled_df.loc[sampled_df['movieId'] == movieId, 'rating'].mean()
    else:
        # Neither is known: use the global average of the full training set
        predicted_rating = train_df['rating'].mean()
    return myround(predicted_rating)

In [21]:
def collab_generate_rating_guestimate(movieId, userId, k=20, threshold=0.0):
    """Estimate the rating `userId` would give `movieId` (item-based filtering).

    The estimate is the similarity-weighted mean of the ratings `userId` gave to
    the k movies most similar to `movieId`, rounded to the rating scale.

    Parameters
    ----------
    movieId : movie id to predict for.
    userId : user id to predict for.
    k : int, default 20
        Number of most-similar movies considered.
    threshold : float, default 0.0
        Neighbour movies with a similarity below this value are ignored.

    Returns
    -------
    float
        Predicted rating after `myround`. Fallbacks, in order: the user's mean
        rating in `sampled_df` when no neighbour movie contributes or the movie
        is unknown; the movie's mean rating when the user is unknown; the
        global mean of `train_df` when both are unknown.
    """
    user_known = userId in util_matrix_T.columns
    movie_known = movieId in Item_sim_df.columns

    if user_known and movie_known:
        # Rank only the reference movie's similarity column, once, and remove the
        # movie by label so a tie at similarity 1.0 cannot keep it in the list.
        neighbour_sims = (Item_sim_df[movieId]
                          .drop(labels=movieId, errors='ignore')
                          .sort_values(ascending=False)
                          .iloc[:k])
        # Ratings the reference user gave to the neighbouring movies
        neighbour_ratings = util_matrix_T.loc[neighbour_sims.index, userId]
        # The utility matrix stores missing ratings as 0 (outside the 0.5-5 scale),
        # so 0 - like NaN - means "has not rated" and must not enter the average.
        usable = (neighbour_ratings.notna()
                  & neighbour_ratings.ne(0)
                  & neighbour_sims.ge(threshold))
        weights = neighbour_sims[usable]
        weight_total = weights.sum()
        if weight_total > 0:
            # Similarity-weighted mean of the user's ratings for the similar movies
            predicted_rating = (neighbour_ratings[usable] * weights).sum() / weight_total
        else:
            # No usable neighbour: fall back to the user's average rating
            predicted_rating = sampled_df.loc[sampled_df['userId'] == userId, 'rating'].mean()
    elif movie_known:
        # Unknown user: use the movie's average rating
        predicted_rating = sampled_df.loc[sampled_df['movieId'] == movieId, 'rating'].mean()
    elif user_known:
        # Unknown movie: use the user's own average rating
        predicted_rating = sampled_df.loc[sampled_df['userId'] == userId, 'rating'].mean()
    else:
        # Neither is known: use the global average of the full training set
        predicted_rating = train_df['rating'].mean()
    return myround(predicted_rating)

In [28]:
collab_generate_rating_estimate(3,3)

In [29]:
collab_generate_rating_guestimate(3,3)

In [None]:
# Spot-check one prediction against a rating taken from the full training set.
movie_id = 5418
# Rating user 2 actually gave this movie; `.values[0]` raises IndexError when
# train_df holds no row for this (user, movie) pair.
actual_rating = train_df[(train_df['userId'] == 2) & (train_df['movieId'] == movie_id)]['rating'].values[0]
# User-based collaborative-filtering estimate for the same pair
pred_rating = collab_generate_rating_estimate(movieId = movie_id, userId = 2)
print (f"movieId - {movie_id}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

In [23]:
# Reproducible 20-row sample of the test set (seed 2023) for trying both estimators
s_df=test_df.sample(n=20,random_state=2023)

In [24]:
s_df.head()

In [25]:
# User-based estimate for every sampled (userId, movieId) row, computed one row at a time
s_df['Userpred_rating'] = s_df.apply(lambda x: collab_generate_rating_estimate(x['movieId'], x['userId']), axis=1)

In [26]:
# Item-based estimate for every sampled (userId, movieId) row, computed one row at a time
s_df['Iterpred_rating'] = s_df.apply(lambda x: collab_generate_rating_guestimate(x['movieId'], x['userId']), axis=1)

In [27]:
s_df.head()

In [None]:
test_df.shape

In [30]:
# Item-based estimate for every row of the full test set, stored in a new 'rating' column.
# NOTE(review): this calls the estimator once per row via apply(axis=1); expect it to be
# slow on a large test set.
test_df['rating'] = test_df.apply(lambda x: collab_generate_rating_guestimate(x['movieId'], x['userId']), axis=1)

In [31]:
test_df.head()

In [32]:
def stringer(c, b):
    """Return the key "<c>_<b>" with both values truncated to integers first."""
    left, right = int(c), int(b)
    return f"{left}_{right}"

In [33]:
# Submission key "<userId>_<movieId>" for every test row. Built with vectorised
# column operations (cast to int, then to str, then concatenate) instead of a
# Python-level call per row; the resulting strings are the same.
test_df['Id'] = (test_df['userId'].astype(int).astype(str)
                 + '_'
                 + test_df['movieId'].astype(int).astype(str))

In [34]:
test_df.head()

In [35]:
# Submission frame: everything in test_df except the two raw id columns
Hope = test_df.drop(columns=["userId", "movieId"])

In [36]:
Hope.head()

In [37]:
# Column order for the submission file: key first, then the predicted rating
columnsTitles = ['Id', 'rating']

# reindex keeps exactly these columns in this order (a missing one would be added as NaN)
Hope = Hope.reindex(columns=columnsTitles)

In [38]:
Hope.head()

In [39]:
# Write the item-based collaborative-filtering submission to disk without the index column
Hope.to_csv("UnsupervisedSubmissionItemFilter.csv", index=False)