In [1]:
import pickle as rick
import numpy as np
import pandas as pd

In [2]:
# Read and prepare the pre-trained artefacts.
# NOTE(security): unpickling executes arbitrary code, so 'model.pkl' must come
# from a trusted source.
# The context manager closes the file handle once loading is done.
with open('model.pkl', 'rb') as model_file:
    model_pickle = rick.load(model_file)

# Regressor used by the content-based predictions (called through .predict)
content_model = model_pickle['model']
# Transformer applied to the user features before prediction (called through .transform)
imputer = model_pickle['imputer']
# Movie-to-movie similarity, indexed by positional movie index on both axes
similarity_matrix = model_pickle['cosine_similarity']
# Ratings table: one column per user id, one row per movie id, 0 = not rated
ratings_matrix = model_pickle['ratings_matrix']


# Prediction design

I have decided that the most suitable approach for predicting ratings is a hybrid of content-based and collaborative filtering.
Since we have many ratings, I can use item-to-item collaborative filtering, which is the method that works best in practice.
A content-based approach only makes sense for the users that have rated few movies.
The regression for my content-based model usually has an RMSE of about 1, which means that if the true rating is 4, the model will typically predict 3, 4 or 5.
I will use the content-based approach on the users with the lowest number of ratings.
Below is a table which displays the number of ratings per user at the lowest percentiles.
I will choose where to transition from the collaborative filtering model to the content-based model based on these numbers.

|Percentile |Number of ratings |
|----------- |------------------ |
|5 |21 |
|10 |25 |
|15 |29 |
|20 |34 |
|30 | 46 |

I am going to use the content-based prediction on the 15% of users with the fewest ratings, and collaborative filtering on the remaining 85%.

---

After testing the solution above, I decided to adjust my prediction implementation.
The two key reasons are that the collaborative filtering method had a lower RMSE than the content-based one, and that the run time was too long for the number of users and movies.
I therefore decided to implement a solution where the content-based approach is only used when the collaborative one cannot be used because it would divide by zero (the similarities of the most similar movies sum to 0).

In [3]:
# Helpers
def get_hybrid_split(percentile):
    """Rating-count threshold at a given percentile of users

    Counts, for every user column in ratings_matrix, how many movies have a
    non-zero rating and returns the requested percentile of those counts.

    Parameters:
    percentile (float) : Percentile (0-100) of the per-user rating counts

    Returns:
    float: Number of rated movies at that percentile
    """
    # One vectorised pass: a cell counts as "rated" when it is != 0
    rating_counts = (ratings_matrix != 0).sum(axis=0).values
    return np.percentile(rating_counts, percentile)

# Movie ids and positional row indexes are not the same thing, so two lookup
# dicts are kept for converting in either direction.
# position -> movie id, in the row order of the ratings matrix
to_id = dict(enumerate(ratings_matrix[0].index.tolist()))
# movie id -> position (inverse of to_id)
to_index = {movie_id: position for position, movie_id in to_id.items()}

# Attribute tables read by the content-based model
user_df = pd.read_csv('clean_data/bruker.csv')
movie_df = pd.read_csv('clean_data/film.csv')

In [4]:
# Collaborative recommender model


def collaborative_user_movie(user_id,movie_id,rows,movies_rated_i,n):
    """Collaborative filter prediction for a given user and movie

    The prediction is the similarity-weighted average of the user's ratings
    for the n rated movies that are most similar to movie_id.

    Parameters:
    user_id (int) : UserID (column label in ratings_matrix)
    movie_id (int) : MovieID to predict a rating for
    rows : Similarities restricted to the movies rated by the user; first axis is the positional movie index, second axis follows movies_rated_i
    movies_rated_i [int] : Positional indexes of the movies rated by the user
    n (int) : Number of most similar rated movies to use for the prediction, must be >= 1

    Returns:
    (int,float): movie_id and the predicted rating rounded to a whole number.
    When the similarities of the selected movies sum to 0, the result of
    content_based_user_movie is returned instead.

    Raises:
    ValueError: If n is smaller than 1
    """
    # A slice [-0:] would silently select *every* rated movie and a negative n
    # would drop the least similar ones instead, so reject both up front.
    if n < 1:
        raise ValueError("n must be at least 1, got " + str(n))

    similarities = rows[to_index[movie_id]]

    # Ascending sort on similarity: the n most similar movies end up at the tail
    ranked = sorted(zip(movies_rated_i, similarities), key=lambda pair: pair[1])
    most_similar_movies = ranked[-n:]

    user_ratings = ratings_matrix[user_id]
    weights = [similarity for _, similarity in most_similar_movies]
    ratings = [user_ratings[to_id[index]] for index, _ in most_similar_movies]

    dividend = sum(weight * rating for weight, rating in zip(weights, ratings))
    divisor = sum(weights)

    # The similarities of the most similar movies can sum to 0, which would
    # make the weighted average undefined; fall back to the content-based model.
    if divisor == 0:
        return content_based_user_movie(user_id,movie_id)

    return movie_id, np.around(dividend/divisor)


def collaborative_filtering(user_id,movies_rated,n):
    """Collaborative filter predictions for every movie a user has not rated

    Parameters:
    user_id (int) : UserID
    movies_rated : Ids of the movies rated by the given user
    n (int) : Number of most similar movies to use when predicting each unknown rating

    Returns:
    list of tuples: One collaborative_user_movie result per movie whose
    rating for this user is 0
    """
    user_ratings = ratings_matrix[user_id]
    unrated_movie_ids = user_ratings[user_ratings == 0].index.values

    # Keep only the similarity columns of the movies this user has rated
    rated_positions = [to_index[rated_id] for rated_id in movies_rated]
    rated_similarities = similarity_matrix[:, rated_positions]

    return [
        collaborative_user_movie(user_id, unrated_id, rated_similarities, rated_positions, n)
        for unrated_id in unrated_movie_ids
    ]




In [5]:
# Content-based

def content_based_user_movie(user_id, movie_id):
    """Content based prediction for a given user and movie

    Builds one feature row from the user id, the imputed user attributes and
    the movie attributes, and feeds it to content_model.

    Parameters:
    user_id (int) : UserID (matched against the 'BrukerID' column of user_df)
    movie_id (int) : MovieID (matched against the 'FilmID' column of movie_df)

    Returns:
    (int,float): movie_id and the predicted rating rounded to a whole number
    """
    user_row = user_df[user_df['BrukerID']==user_id]
    user_features = imputer.transform(user_row[['Kjonn', 'Alder', 'Jobb', 'Postkode']])

    movie_row = movie_df[movie_df['FilmID']==movie_id]
    movie_features = movie_row[["FilmID","Aar","Action","Adventure","Animation","Children's","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]]

    # Both selections are 2-D (one row each); flatten them into plain lists
    flat_user = [item for sublist in user_features for item in sublist]
    flat_movie = [item for sublist in movie_features.values for item in sublist]

    # Feature order: user id, user attributes, movie attributes
    X = pd.DataFrame([[user_id] + flat_user + flat_movie])

    # predict() returns an array holding the single prediction; unwrap it so
    # callers get a scalar, like the collaborative path returns, rather than
    # a length-1 array that later has to be coerced into a DataFrame cell.
    raw_prediction = content_model.predict(X)
    pred = float(np.around(np.ravel(raw_prediction)[0]))

    return movie_id,pred


def content_based(user_id):
    """Content-based predictions for every movie a user has not rated

    Parameters:
    user_id (int) : UserID

    Returns:
    list of tuples: One content_based_user_movie result per movie whose
    rating for this user is 0
    """
    user_ratings = ratings_matrix[user_id]
    unrated_movie_ids = user_ratings[user_ratings == 0].index.values

    return [
        content_based_user_movie(user_id, unrated_id)
        for unrated_id in unrated_movie_ids
    ]

In [6]:
# Main - Predictions

def predict_user(user_id,split, collaborative_n):
    """Predictions for all movies a given user has not rated

    Parameters:
    user_id (int) : UserID
    split : Rating-count threshold below which the content-based model was meant to take over (currently not used, see the note in the body)
    collaborative_n (int) : Number of most similar movies the collaborative approach takes into account

    Returns:
    list of tuples: [(movie_id, predicted rating)] for the movies not rated by the user
    """
    user_ratings = ratings_matrix[user_id]
    movies_rated = user_ratings[user_ratings != 0].index.values

    # Design note: users with len(movies_rated) <= split were originally sent
    # to content_based(user_id). That took far longer than expected, since the
    # content model must predict over 3000 values for every such user.
    # The content-based model is now only used where the collaborative
    # prediction cannot complete because of a division by zero.
    print(f"Predicting user : {user_id}")
    return collaborative_filtering(user_id, movies_rated, collaborative_n)


def predict(hybrid_split, n, stop_user_id=6040):
    """Predict the missing ratings of every user

    Parameters:
    hybrid_split : Percentile handed to get_hybrid_split; the resulting threshold is forwarded to predict_user
    n (int) : Number of most similar movies to take into account, forwarded to predict_user
    stop_user_id : Iteration over the users stops, without predicting, as soon as this user id is reached; None processes every user. Defaults to 6040, the value that used to be hard-coded.

    Returns:
    DataFrame: Copy of ratings_matrix with its index moved into a leading
    column, where every (movie, user) cell that predict_user returned a value
    for holds the predicted rating
    """
    predictions = ratings_matrix.copy().reset_index()

    split = get_hybrid_split(hybrid_split)

    # for every user, predict and put into dataframe
    for user_id in ratings_matrix.columns.values:

        if stop_user_id is not None and user_id == stop_user_id:
            break

        # Resolve the user's column by label; after reset_index the position
        # is only user_id + 1 when the user ids happen to be 0, 1, 2, ...
        user_column = predictions.columns.get_loc(user_id)

        for movie_id, pred in predict_user(user_id,split,n):
            predictions.iat[to_index[movie_id], user_column] = pred

    return predictions


In [7]:
# Full prediction pass: percentile 3 for the hybrid split, 2 most similar movies
result = predict(hybrid_split=3, n=2)

print("Done predicting values")


Predicting user : 0
Predicting user : 1
Predicting user : 2
Predicting user : 3
Predicting user : 4
Predicting user : 5
Predicting user : 6
Predicting user : 7
Predicting user : 8
Predicting user : 9
Predicting user : 10
Predicting user : 11
Predicting user : 12
Predicting user : 13
Predicting user : 14
Predicting user : 15
Predicting user : 16
Predicting user : 17
Predicting user : 18
Predicting user : 19
Predicting user : 20
Predicting user : 21
Predicting user : 22
Predicting user : 23
Predicting user : 24
Predicting user : 25
Predicting user : 26
Predicting user : 27
Predicting user : 28
Predicting user : 29
Predicting user : 30
Predicting user : 31
Predicting user : 32
Predicting user : 33
Predicting user : 34
Predicting user : 35
Predicting user : 36
Predicting user : 37
Predicting user : 38
Predicting user : 39
Predicting user : 40
Predicting user : 41
Predicting user : 42
Predicting user : 43
Predicting user : 44
Predicting user : 45
Predicting user : 46
Predicting user : 47
Pr

In [8]:
# Persist the prediction table as CSV, without the DataFrame's row index
result.to_csv(path_or_buf='predictions.csv', index=False)