In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [2]:
# Toy ratings keyed user -> item -> rating. Items missing from a user's
# mapping become NaN once the mapping is turned into an item x user frame.
df_dict = {
    'User1': dict(Item1=4, Item2=3, Item3=5, Item4=4),
    'User2': dict(Item1=5, Item2=3),
    'User3': dict(Item1=4, Item2=3, Item3=3, Item4=4),
    'User4': dict(Item1=2, Item2=1),
    'User5': dict(Item1=4, Item2=2),
}
df_user_item = pd.DataFrame(df_dict)
df_user_item

Unnamed: 0,User1,User2,User3,User4,User5
Item1,4,5.0,4,2.0,4.0
Item2,3,3.0,3,1.0,2.0
Item3,5,,3,,
Item4,4,,4,,


## Cosine Similarity

<img src="https://www.geeksforgeeks.org/wp-content/ql-cache/quicklatex.com-6a635084614b8873ccdb323986d9a7aa_l3.svg" alt="Cosine similarity formula" width="400" height="400" />

In [3]:
import math
from numpy.linalg import norm

def get_sim(u1, u2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Parameters
    ----------
    u1, u2 : array-like
        One-dimensional vectors of the same length.

    Returns
    -------
    float
        ``dot(u1, u2) / (||u1|| * ||u2||)``. When that denominator is zero
        (either vector is empty or all zeros) the similarity is undefined
        and ``0.0`` is returned instead of NaN.
    """
    num = np.dot(u1, u2)
    den = norm(u1) * norm(u2)
    if den == 0:
        # Dividing here would produce NaN plus a RuntimeWarning, and a NaN
        # similarity contaminates every weighted sum it later takes part in.
        return 0.0
    return num / den

def similarity(df, user1, user2):
    """Cosine similarity between two users over the items both have rated.

    Parameters
    ----------
    df : pandas.DataFrame
        Rating matrix with one column per user; missing ratings are NaN.
    user1, user2 : hashable
        Column labels of the two users to compare.

    Returns
    -------
    float
        Result of ``get_sim`` applied to the two rating vectors restricted
        to the rows where neither user has a missing value.
    """
    ratings1, ratings2 = df[user1], df[user2]
    # One boolean mask replaces the element-wise np.append loop, which
    # re-allocated and copied both arrays on every iteration.
    both_rated = ratings1.notna() & ratings2.notna()
    u1 = ratings1[both_rated].to_numpy(dtype=float)
    u2 = ratings2[both_rated].to_numpy(dtype=float)
    return get_sim(u1, u2)

## Part 1: Finding similar users

### Considering only common rated items

In [4]:
# Raw ratings, NaNs left in place: similarity() itself drops the items that
# either user of a pair has not rated.
df = df_user_item
users = list(df_user_item.columns)
user_user_sim_matrix1 = pd.DataFrame(index=df_user_item.columns, columns=df_user_item.columns)

# Fill every (row user, column user) cell, including the diagonal.
for user1, user2 in ((row, col) for row in users for col in users):
    user_user_sim_matrix1.at[user1, user2] = similarity(df, user1, user2)

user_user_sim_matrix1

Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.994692,0.974835,0.98387,0.98387
User2,0.994692,1.0,0.994692,0.997054,0.997054
User3,0.974835,0.994692,1.0,0.98387,0.98387
User4,0.98387,0.997054,0.98387,1.0,1.0
User5,0.98387,0.997054,0.98387,1.0,1.0


User2 and User3 should be closer to each other than User2 and User1 are, but the method above finds both pairs equally similar.

### Filling NaNs with 0

In [5]:
# Missing ratings are replaced by 0 before comparing, so every user vector
# covers all items.
df = df_user_item.fillna(0)
user_user_sim_matrix2 = pd.DataFrame(index=df_user_item.columns, columns=df_user_item.columns)

# Fill every (row user, column user) cell, including the diagonal.
for user1, user2 in ((row, col) for row in users for col in users):
    user_user_sim_matrix2.at[user1, user2] = similarity(df, user1, user2)

user_user_sim_matrix2

Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.61219,0.974835,0.60553,0.60553
User2,0.61219,1.0,0.703353,0.997054,0.997054
User3,0.974835,0.703353,1.0,0.695701,0.695701
User4,0.60553,0.997054,0.695701,1.0,1.0
User5,0.60553,0.997054,0.695701,1.0,1.0


This finds User2 and User3 more similar than User2 and User1, which seems more in line with the ratings.


### Filling NaNs with mean and subtracting mean user rating

In [6]:
# Centre each user's ratings on that user's own mean, then fill the
# remaining NaNs with the column mean of the centred values.
df = df_user_item.sub(df_user_item.mean(), axis='columns')
df = df.fillna(df.mean())

user_user_sim_matrix3 = pd.DataFrame(index=df_user_item.columns, columns=df_user_item.columns)

# Fill every (row user, column user) cell, including the diagonal.
for user1, user2 in ((row, col) for row in users for col in users):
    user_user_sim_matrix3.at[user1, user2] = similarity(df, user1, user2)

user_user_sim_matrix3

Unnamed: 0,User1,User2,User3,User4,User5
User1,1.0,0.5,0.0,0.5,0.5
User2,0.5,1.0,0.707107,1.0,1.0
User3,0.0,0.707107,1.0,0.707107,0.707107
User4,0.5,1.0,0.707107,1.0,1.0
User5,0.5,1.0,0.707107,1.0,1.0


Although Users 1 and 3 are similar based on their ratings, (4,3,5,4) and (4,3,3,4), their similarity under the above method is 0.

<b>Reference:</b> https://www.sciencedirect.com/science/article/pii/S0950705113003560

## Part 2: Recommending items based on similar users

## Predicting missing rating

<img src="https://www.geeksforgeeks.org/wp-content/ql-cache/quicklatex.com-a81ab02a7ef2d17d21abce7d4015a964_l3.svg" alt="Formula for predicting a missing rating" width="400" height="400" />

<b>Reference:</b> https://www.geeksforgeeks.org/user-based-collaborative-filtering/?ref=rp

In [7]:
def get_recommendations(unknown_user, user_item=None, sim_matrix=None):
    """Predict ratings for the items ``unknown_user`` has not rated yet.

    Each prediction is the user's own mean rating plus a similarity-weighted
    average of the other users' mean-centred ratings for that item.

    Parameters
    ----------
    unknown_user : hashable
        Column label of the user to produce recommendations for.
    user_item : pandas.DataFrame, optional
        Item x user rating matrix with NaN marking missing ratings.
        Defaults to the notebook-level ``df_user_item``.
    sim_matrix : pandas.DataFrame, optional
        User x user similarity matrix. Defaults to the notebook-level
        ``user_user_sim_matrix2``.

    Returns
    -------
    dict or int
        ``{item: predicted_rating}`` ordered from the highest to the lowest
        prediction, or the integer ``0`` when the user has no unrated items
        (kept as-is because existing callers compare the result to ``0``).
    """
    # Fall back to the notebook globals so existing one-argument calls work.
    if user_item is None:
        user_item = df_user_item
    if sim_matrix is None:
        sim_matrix = user_user_sim_matrix2

    mean_r_u = user_item[unknown_user].mean()

    unrated_items = user_item.index[user_item[unknown_user].isna()]
    if len(unrated_items) == 0:
        return 0

    users = user_item.columns.tolist()
    users.remove(unknown_user)

    # A neighbour's mean rating does not depend on the item, so compute it
    # once instead of once per (item, neighbour) pair.
    neighbour_means = {user: user_item[user].mean() for user in users}

    ratings = {}
    for item in unrated_items:
        num, den = 0, 0
        for user in users:
            sim = sim_matrix.at[unknown_user, user]
            r_ip = user_item.at[item, user] - neighbour_means[user]
            if pd.notnull(r_ip):
                num += sim * r_ip
            # NOTE(review): the weight is added even when this neighbour has
            # no rating for the item, which shrinks the adjustment toward 0.
            # Confirm this is intended rather than summing only over the
            # neighbours who rated the item.
            den += abs(sim)

        if den == 0:
            # No neighbour carries any weight: fall back to the user's own
            # mean instead of dividing by zero.
            ratings[item] = mean_r_u
        else:
            ratings[item] = mean_r_u + (num / den)

    ratings = dict(sorted(ratings.items(), key=lambda item: item[1], reverse=True))
    return ratings


In [8]:
unknown_user = 'User2'
ratings = get_recommendations(unknown_user)

print(f'Recommendations for {unknown_user}')
# get_recommendations returns the integer 0 when the user has no unrated items.
if ratings != 0:
    for i, r in ratings.items():
        print(f'{i} --> {r}')
else:
    print(f'{unknown_user} has watched all movies')

Recommendations for User2
Item4 --> 4.106257870972776
Item3 --> 4.078713340085062
