In [158]:
import pickle as rick
import numpy as np
import pandas as pd

In [159]:
# read and prepare models
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
# Each file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read.
with open('content_model.pkl', 'rb') as content_model_file:
    content_model_pickle = rick.load(content_model_file)
content_model = content_model_pickle['model']
imputer = content_model_pickle['imputer']

with open('collaborative_model.pkl', 'rb') as collaborative_model_file:
    collabrative_model_pickle = rick.load(collaborative_model_file)
# The stored cosine similarities are wrapped in a DataFrame so they can be
# sliced positionally with .iloc further down.
similarity_matrix = pd.DataFrame(collabrative_model_pickle['cosine_similarity'])
ratings_matrix = collabrative_model_pickle['ratings_matrix']


# Prediction design

I have decided that the most suitable approach for predicting ratings is a hybrid of content-based and collaborative filtering.
Since we have many ratings, I can use item-to-item collaborative filtering, which is the method that works best in practice.
Only for the users that have rated few movies a content based approach would make sense.
The regression for my content-based model usually has an RMSE of about 1, which means that if the true rating is 4, the model predicts either 4 or, with some chance, 3 or 5.
I will use the content-based approach on the users with the lowest number of ratings.
Below is a table which displays the number of ratings for a user at the lowest percentiles.
I will choose where to transition from the collaborative filtering model to the content-based model based on these numbers.

|Percentile |Number of ratings |
|----------- |------------------ |
|5 |21 |
|10 |25 |
|15 |29 |
|20 |34 |
|30 | 46 |

I am going to use the content-based prediction for the 15% of users with the fewest ratings, and collaborative filtering for the remaining 85%.

In [160]:
# Positional indexes and movie ids are not equal, so two lookup dicts convert
# between them. The ids are read from the row labels of ratings_matrix itself,
# so the mapping does not depend on a user column labelled 0 being present.
to_id = dict(enumerate(ratings_matrix.index.tolist()))   # positional index -> movie id
to_index = {movie_id: position for position, movie_id in to_id.items()}   # movie id -> positional index



In [161]:
# Collaborative recommender model


def collabrative_user_movie(user_id,movie_id,rows,N):
    """Collaborative filter prediction for a given user and movie

    The prediction is the similarity-weighted average of the user's ratings
    for the N candidate movies most similar to the target movie.

    Parameters:
    user_id (int) : UserID, a column label of ratings_matrix
    movie_id (int) : Id of the movie to predict, a key of to_index
    rows (DataFrame) : Similarity rows of the candidate movies; row labels are
        positional movie indexes (keys of to_id), columns are addressed by
        positional movie index
    N (int) : Number of most similar movies to use for the prediction

    Returns:
    (int, number): (movie_id, weighted average rounded with np.around).
    When the similarities of the selected movies sum to 0, the result of
    content_based_user_movie(user_id, movie_id) is returned instead.
    """
    # Similarity of every candidate movie to the target, strongest first.
    target_column = rows.iloc[:, to_index[movie_id]]
    ranked = target_column.sort_values(ascending=False)
    top_weights = ranked.values[:N]
    top_positions = ranked.index.values[:N]

    # The user's own ratings for those N movies, in the same order.
    user_ratings = ratings_matrix[user_id]
    top_ratings = [user_ratings[to_id[position]] for position in top_positions]

    weighted_sum = sum(weight * rating for weight, rating in zip(top_weights, top_ratings))
    weight_total = sum(top_weights)

    # A zero weight total leaves the weighted average undefined, so the
    # content-based prediction is used for this movie instead.
    if weight_total == 0:
        return content_based_user_movie(user_id, movie_id)

    return (movie_id, np.around(weighted_sum / weight_total))


def collaborative_filtering(user_id,movies_rated,N):
    """Collaborative filter prediction for a given user

    Parameters:
    user_id (int) : UserID, a column label of ratings_matrix
    movies_rated ([int]) : Ids of the movies the user has rated (keys of to_index)
    N (int) : Number of most similar movies to use for each prediction

    Returns:
    list: One collabrative_user_movie result per movie whose rating for this
    user is 0 in ratings_matrix
    """
    user_ratings = ratings_matrix[user_id]
    # A rating of 0 marks a movie the user has not rated.
    unrated_ids = user_ratings[user_ratings == 0].index.values

    # Only the similarity rows of the movies the user rated are candidates.
    rated_positions = [to_index[rated_id] for rated_id in movies_rated]
    candidate_rows = similarity_matrix.iloc[rated_positions, :]

    return [
        collabrative_user_movie(user_id, unrated_id, candidate_rows, N)
        for unrated_id in unrated_ids
    ]




In [166]:
def content_based_user_movie(user_id, movie_id):
    """Content-based prediction for a given user and movie

    Currently a placeholder: user_id is not used and every call yields the
    constant rating 9.

    Parameters:
    user_id (int) : UserID (unused)
    movie_id (int) : Id of the movie to predict

    Returns:
    (int, int): (movie_id, 9)
    """
    placeholder_rating = 9
    return movie_id, placeholder_rating


def content_based(user_id):
    """Content-based prediction for a given user

    Parameters:
    user_id (int) : UserID, a column label of ratings_matrix

    Returns:
    list: One content_based_user_movie result per movie whose rating for this
    user is 0 in ratings_matrix
    """
    user_ratings = ratings_matrix[user_id]
    # A rating of 0 marks a movie the user has not rated.
    unrated_ids = user_ratings[user_ratings == 0].index.values

    return [content_based_user_movie(user_id, unrated_id) for unrated_id in unrated_ids]



In [163]:
# Helpers
def get_hybrid_split(percentile):
    """Number of ratings per user at a given percentile

    Counts the non-zero entries in every column of ratings_matrix (one column
    per user) and returns the requested percentile of those counts.

    Parameters:
    percentile (number) : Percentile to compute, passed on to np.percentile

    Returns:
    number: The rating count at that percentile, as computed by np.percentile
    """
    # Non-zero entries are ratings; summing the boolean mask down each column
    # gives the rating count per user without a Python-level loop.
    ratings_per_user = (ratings_matrix != 0).sum(axis=0).values
    return np.percentile(ratings_per_user, percentile)




In [164]:
# Main - Predictions

def predict_user(user_id,split, collaborative_N):
    """Prediction for a given user

    Parameters:
    user_id (int) : UserID, a column label of ratings_matrix
    split (number) : Users with at most this many rated movies get the
        content-based prediction, all others get collaborative filtering
    collaborative_N (int) : If the collaborative approach is used, the number
        of most similar movies to take into account

    Returns: predictions for the movies not rated by the user
    list: the result of content_based or collaborative_filtering

    """
    user_ratings = ratings_matrix[user_id]
    # A non-zero entry is a rating the user has given.
    movies_rated = user_ratings[user_ratings != 0].index.values

    # If user is in the X% lowest number of movies rated we will use a content-based approach
    if len(movies_rated) <= split:
        print("Content-based prediction " + str(user_id))
        # content_based takes only the user id; passing movies_rated as a
        # second argument raises TypeError.
        return content_based(user_id)

    print("Collabrative filtering prediction for user " + str(user_id))
    return collaborative_filtering(user_id,movies_rated,collaborative_N)


def predict(hybrid_split, N, user_ids=(0, 1)):
    """Predict ratings for a set of users and print the filled-in rating table

    Parameters:
    hybrid_split (number) : Percentile of ratings-per-user passed to
        get_hybrid_split; it decides which users get the content-based prediction
    N (int) : Number of most similar movies used by collaborative filtering
    user_ids (iterable of int) : Column labels of ratings_matrix to predict
        for. Defaults to users 0 and 1; pass ratings_matrix.columns.values to
        predict for every user

    Returns:
    None. The table is printed, with the row labels of ratings_matrix turned
    into a leading column by reset_index.

    NOTE(review): the final message mentions predictions.csv, but this
    function only prints the table and does not write a file.
    """
    # Work on a copy so ratings_matrix itself keeps its 0 entries.
    predictions = ratings_matrix.copy()

    split = get_hybrid_split(hybrid_split)

    # for every requested user, predict and put into dataframe
    for user_id in user_ids:
        for movie_id, pred in predict_user(user_id, split, N):
            # Label-based write: does not assume that a user id equals the
            # position of that user's column.
            predictions.at[movie_id, user_id] = pred

    predictions = predictions.reset_index()

    print(predictions)

    print("Done predicting values to predictions.csv")

In [165]:
# Split at the 15th percentile of ratings per user; use the 2 most similar movies per prediction.
predict(hybrid_split=15, N=2)

Collabrative filtering prediction for user 0
Collabrative filtering prediction for user 1
BrukerID  FilmID    0    1    2    3    4    5    6    7    8  ...  6031  \
0              0  3.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
1              1  4.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
2              2  4.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3              3  3.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
4              5  3.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
...          ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
3689        3948  3.0  3.0  0.0  0.0  4.0  0.0  0.0  0.0  0.0  ...   0.0   
3690        3949  2.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3691        3950  3.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   
3692        3951  1.0  4.0  0.0  0.0  4.0  0.0  4.0  0.0  0.0  ...   0.0   
3693        3952  2.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0  