# Hybrid Recommender System

In [1]:
import pandas as pd
import numpy as np

# Exercise 1

In this exercise, we are going to try out different methods, that can be used to combine rankings from multiple models.

Below you can see a toy dataframe with the scores of how likely a user will like 5 different items estimated with 2 different models:
- Model 1: rating predictions from a collaborative filtering model
- Model 2: cosine similarities from a content-based model

In [36]:
d = {'item_id': ['I1', 'I2', 'I3', 'I4', 'I5'],
     'model1_score': [1.2, 2.8, 3.0, 4.5, 5.0],
     'model2_score': [0.8, 0.5, 0.2, 0.9, 0.4]}
df = pd.DataFrame(data=d)

## 1.1

Rank the 5 items according to the scores from model 1 and 2 respectively (higher score is better in both models).

In [41]:
df['model1_rank'] = df['model1_score'].rank(method='first', ascending=False).astype(int)
df[['item_id','model1_rank']].sort_values(by="model1_rank", ascending=True)


Unnamed: 0,item_id,model1_rank
4,I5,1
3,I4,2
2,I3,3
1,I2,4
0,I1,5


In [42]:
df['model2_rank'] = df['model2_score'].rank(method='first', ascending=False).astype(int)
df[['item_id','model2_rank']].sort_values(by="model2_rank", ascending=True)


Unnamed: 0,item_id,model2_rank
3,I4,1
0,I1,2
1,I2,3
4,I5,4
2,I3,5


## 1.2

Normalize the scores from the 2 models (by subtracting the mean and dividing with the standard deviation) and compute a combined rank using the **Weighted Sum** method with $\alpha=\beta=1$. Round the results to 3 decimal points.

In [67]:
alpha, beta = 1, 1

df['model1_score_norm'] = (df['model1_score'] - df['model1_score'].mean())/df['model1_score'].std()
df['model2_score_norm'] = (df['model2_score'] - df['model2_score'].mean())/df['model2_score'].std()

df['weighted_sum'] = alpha * df['model1_score_norm'] + beta * df['model2_score_norm']

df['combined_score_rank'] = df['weighted_sum'].rank(method='min', ascending=False).astype(int)

df[['item_id', 'weighted_sum', 'combined_score_rank']].sort_values(by="combined_score_rank", ascending=True)
#<YOUR CODE HERE>

Unnamed: 0,item_id,weighted_sum,combined_score_rank
3,I4,1.976625,1
4,I5,0.572962,2
1,I2,-0.540125,3
0,I1,-0.560767,4
2,I3,-1.448695,5


## 1.3

Merge the ranking from the 2 models using the **Borda Fuse** method.

In [70]:
df['borda_fuse_points'] = (df[['model1_rank', 'model2_rank']] - 5).abs().sum(axis=1)

In [71]:
df

Unnamed: 0,item_id,model1_score,model2_score,model1_rank,model2_rank,model1_score_norm,model2_score_norm,weighted_sum,combined_score_rank,borda_fuse_points,borda_fuse_rank
0,I1,1.2,0.8,5,2,-1.393819,0.833052,-0.560767,4,3,3
1,I2,2.8,0.5,4,3,-0.331862,-0.208263,-0.540125,3,3,3
2,I3,3.0,0.2,3,5,-0.199117,-1.249578,-1.448695,5,2,5
3,I4,4.5,0.9,2,1,0.796468,1.180157,1.976625,1,7,1
4,I5,5.0,0.4,1,4,1.12833,-0.555368,0.572962,2,5,2


In [None]:
# "min" instead of "first" is used here, because if two items have the same points, they should have the same rank
df['borda_fuse_rank'] = df['borda_fuse_points'].rank(method='min', ascending=False).astype(int)

In [73]:
df[['item_id','borda_fuse_rank']].sort_values(by="borda_fuse_rank", ascending=True)

Unnamed: 0,item_id,borda_fuse_rank
3,I4,1
4,I5,2
0,I1,3
1,I2,3
2,I3,5


## 1.4

Merge the ranking from the 2 models using the **Reciprocal Rank Fusion** method with $k=0$.

In [87]:
df['reciprocal_rank_fusion_score'] = df[['model1_rank','model2_rank']].apply(lambda x: 1 / x['model1_rank'] + 1/ x['model2_rank'], axis=1)

In [88]:
df['reciprocal_rank_fusion_rank'] =  df['reciprocal_rank_fusion_score'].rank(method='min', ascending=False).astype(int)
df[['item_id','reciprocal_rank_fusion_rank']].sort_values(by="reciprocal_rank_fusion_rank", ascending=True)

Unnamed: 0,item_id,reciprocal_rank_fusion_rank
3,I4,1
4,I5,2
0,I1,3
1,I2,4
2,I3,5


# Exercise 2

In this exercise, we are going to predict the rating of a single user-item pair using a hybrid method, where we use the user profiles from a content-based method as input to a collaborative filtering (neighborhood-based) method.

Download and load the provided dataframe containing content-based user profiles of the user with reviewerID `A25C2M3QF9G7OQ` and all users that have rated the item with asin `B00EYZY6LQ`.

In [89]:
user_profiles = pd.read_pickle("user_profiles.pkl")

## 2.1 

Compute the cosine similarities between user `A25C2M3QF9G7OQ` and the other users based on their user profiles. 
What are the similarities and what are the ratings given by these users on item `B00EYZY6LQ`?

In [103]:
from sklearn.metrics.pairwise import cosine_similarity

# Load data generated in Session 1 or the provided data splits (see Absalon, W7 Lab)
df_train = pd.read_pickle("train_dataframe.pkl")
df_test = pd.read_pickle("test_dataframe.pkl")

user_item_matrix = df_train.pivot_table(index='reviewerID', columns='asin', values='overall')
user_item_matrix = user_item_matrix.fillna(0)
input_users = user_item_matrix[user_item_matrix['B00EYZY6LQ']>0]

In [118]:
input_users.index.tolist()

['A1F7YU6O5RU432',
 'A1R1BFJCMWX0Y3',
 'A1UQBFCERIP7VJ',
 'A22CW0ZHY3NJH8',
 'A2LW5AL0KQ9P1M',
 'A2PD27UKAD3Q00',
 'A2WW57XX2UVLM6',
 'A2ZY49IDE6TY5I',
 'A39WWMBA0299ZF',
 'A3M6TSEV71537G',
 'A3S3R88HA0HZG3',
 'A914TQVHI872U',
 'AOEUN9718KVRD']

In [91]:
user_item_matrix

asin,B0000530HU,B00006L9LC,B00021DJ32,B0002JHI1I,B0006O10P4,B0009RF9DW,B000FI4S1E,B000FOI48G,B000FTYALG,B000GLRREU,...,B00N2WQ2IW,B00NT0AR7E,B00RZYW4RG,B00W259T7G,B016V8YWBC,B019809F9Y,B019FWRG3C,B019V2KYZS,B01BNEYGQU,B01E7UKR38
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A105A034ZG9EHO,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10JB7YPWZGRF4,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10M2MLE2R0L6K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
A10P0NAKKRYKTZ,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10ZJZNO4DAVB,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZCOSCQG73JZ1,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZD3ON9ZMEGL6,0.0,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZFYUPGEE6KLW,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZJMUP77WBQZQ,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
user_profiles

Unnamed: 0,action,advanced,aerosol,amp,artisanal,bamboo,bar,bath,blossoms,blown,...,urban,us,vary,volume,wash,water,waterpik,whip,williams,wintergreen
A25C2M3QF9G7OQ,0.0,0.0,0.0,0.166233,0.281132,0.0,0.227028,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.016359,1.016359,0.0,0.0,0.193798
A1F7YU6O5RU432,0.234015,0.0,0.365106,0.138372,0.0,0.0,0.0,0.0,0.0,0.365106,...,0.0,0.365106,0.0,0.0,0.172441,0.0,0.0,0.0,0.0,0.387595
A1R1BFJCMWX0Y3,0.0,0.0,0.27383,0.124675,0.210849,0.306746,0.170271,0.247713,0.0,0.27383,...,0.280623,0.27383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1UQBFCERIP7VJ,0.0,0.0,0.202837,0.0,0.0,0.0,0.170017,0.0,0.0,0.202837,...,0.070712,0.202837,0.185138,0.198231,0.0,0.0,0.0,0.0,0.0,0.215331
A22CW0ZHY3NJH8,0.195012,0.0,0.182553,0.11531,0.0,0.272663,0.0,0.220189,0.0,0.182553,...,0.249443,0.182553,0.0,0.0,0.143701,0.0,0.0,0.0,0.0,0.0
A2LW5AL0KQ9P1M,0.0,0.0,0.243404,0.21325,0.234276,0.204497,0.18919,0.165142,0.0,0.243404,...,0.187082,0.243404,0.0,0.115609,0.0,0.0,0.0,0.126371,0.0,0.0
A2PD27UKAD3Q00,0.0,0.0,0.60851,0.0,0.0,0.0,0.0,0.0,0.623548,0.60851,...,0.530342,0.60851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2WW57XX2UVLM6,0.0,0.0,0.486808,0.224169,0.0,0.0,0.0,0.0,0.0,0.486808,...,0.0,0.486808,0.0,0.346828,0.0,0.0,0.0,0.379114,0.0,0.0
A2ZY49IDE6TY5I,0.0,0.0,0.292085,0.166233,0.281132,0.0,0.227028,0.0,0.0,0.292085,...,0.0,0.292085,0.0,0.0,0.0,1.016359,1.016359,0.0,0.0,0.0
A39WWMBA0299ZF,0.0,0.0,0.60851,0.0,0.0,0.0,0.0,0.0,0.623548,0.60851,...,0.212137,0.60851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
user_profiles.loc['A25C2M3QF9G7OQ']

action         0.000000
advanced       0.000000
aerosol        0.000000
amp            0.166233
artisanal      0.281132
                 ...   
water          1.016359
waterpik       1.016359
whip           0.000000
williams       0.000000
wintergreen    0.193798
Name: A25C2M3QF9G7OQ, Length: 121, dtype: float64

In [121]:
user_similarities = cosine_similarity([user_profiles.loc['A25C2M3QF9G7OQ']], user_profiles[user_profiles.index.isin(input_users.index.tolist())])

In [123]:
user_similarities.round(3)

array([[0.085, 0.187, 0.098, 0.263, 0.225, 0.076, 0.058, 0.805, 0.081,
        0.028, 0.041, 0.202, 0.164]])

In [None]:
input_users[input_users.index != 'A25C2M3QF9G7OQ']['B00EYZY6LQ']

reviewerID
A1F7YU6O5RU432    5.0
A1R1BFJCMWX0Y3    3.0
A1UQBFCERIP7VJ    5.0
A22CW0ZHY3NJH8    3.0
A2LW5AL0KQ9P1M    4.0
A2PD27UKAD3Q00    5.0
A2WW57XX2UVLM6    4.0
A2ZY49IDE6TY5I    4.0
A39WWMBA0299ZF    5.0
A3M6TSEV71537G    5.0
A3S3R88HA0HZG3    4.0
A914TQVHI872U     5.0
AOEUN9718KVRD     3.0
Name: B00EYZY6LQ, dtype: float64

In [133]:
user_similarities

array([[0.08514993, 0.18691177, 0.09779072, 0.2630178 , 0.22541571,
        0.07576673, 0.05774509, 0.80531953, 0.08064641, 0.02819989,
        0.04140884, 0.20230512, 0.16422012]])

In [143]:
result = input_users[['B00EYZY6LQ']].copy()
result['similarity_score'] = user_similarities[0].round(3)

In [144]:
result

asin,B00EYZY6LQ,similarity_score
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A1F7YU6O5RU432,5.0,0.085
A1R1BFJCMWX0Y3,3.0,0.187
A1UQBFCERIP7VJ,5.0,0.098
A22CW0ZHY3NJH8,3.0,0.263
A2LW5AL0KQ9P1M,4.0,0.225
A2PD27UKAD3Q00,5.0,0.076
A2WW57XX2UVLM6,4.0,0.058
A2ZY49IDE6TY5I,4.0,0.805
A39WWMBA0299ZF,5.0,0.081
A3M6TSEV71537G,5.0,0.028


## 2.2

Predict the rating for user `A25C2M3QF9G7OQ` on item `B00EYZY6LQ` based on the ratings from the $3$ most similar users, using a weighted (by similarity) average. What is the prediction (round it to 2 decimal points)?

In [154]:
result.sort_values(by="similarity_score",ascending=False)[:3]

asin,B00EYZY6LQ,similarity_score
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1
A2ZY49IDE6TY5I,4.0,0.805
A22CW0ZHY3NJH8,3.0,0.263
A2LW5AL0KQ9P1M,4.0,0.225


In [None]:
k = 3

top_k_similar_users = result.sort_values(by="similarity_score",ascending=False)[:3]
prediction_hybrid = (top_k_similar_users["B00EYZY6LQ"] * top_k_similar_users['similarity_score']).sum() / top_k_similar_users['similarity_score'].sum()
print(f'Predicted rating: {round(prediction_hybrid,2)}')

Predicted rating: 3.8
