In [43]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import jaccard_score

In [44]:
users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['Item A', 'Item B', 'Item C', 'Item D', 'Item E']

# Binary interaction table: one row per user, one column per item.
# 1 = the user interacted with the item, 0 = no interaction.
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 0],
]

df = pd.DataFrame(datasets, columns=items, index=users)


def jaccard_similarity_matrix(interactions):
    """Return the pairwise Jaccard similarity between the columns of a 0/1 table.

    Parameters
    ----------
    interactions : pd.DataFrame
        Table of 0/1 flags. Every column is treated as the set of row labels
        whose entry is non-zero (values are coerced with ``astype(bool)``).

    Returns
    -------
    pd.DataFrame
        Square float frame indexed and labelled by ``interactions.columns``.
        Entry (a, b) is ``|A & B| / |A | B|``. Pairs whose union is empty
        score 0.0 instead of dividing by zero.
    """
    flags = interactions.to_numpy().astype(bool).astype(np.int64)
    # (i, j) counts the rows in which columns i and j are both set.
    intersection = flags.T @ flags
    set_sizes = flags.sum(axis=0)
    # Inclusion-exclusion: |A or B| = |A| + |B| - |A and B|.
    union = set_sizes[:, None] + set_sizes[None, :] - intersection
    similarity = np.divide(
        intersection,
        union,
        out=np.zeros(intersection.shape, dtype=float),
        where=union > 0,  # leave 0.0 where both columns are empty
    )
    return pd.DataFrame(
        similarity,
        index=interactions.columns,
        columns=interactions.columns,
    )


# Calculate Jaccard similarity for items (items are the columns of df)
item_jaccard_similarity_matrix = jaccard_similarity_matrix(df)

print("Item Jaccard Similarity Matrix:")
print(item_jaccard_similarity_matrix)

# Calculate Jaccard similarity for users (transpose so users become the columns)
user_jaccard_similarity_matrix = jaccard_similarity_matrix(df.T)

print("\nUser Jaccard Similarity Matrix:")
print(user_jaccard_similarity_matrix)

Item Jaccard Similarity Matrix:
        Item A    Item B  Item C    Item D    Item E
Item A    1.00  0.200000    0.75  0.400000  0.200000
Item B    0.20  1.000000    0.25  0.250000  0.333333
Item C    0.75  0.250000    1.00  0.200000  0.000000
Item D    0.40  0.250000    0.20  1.000000  0.666667
Item E    0.20  0.333333    0.00  0.666667  1.000000

User Jaccard Similarity Matrix:
          User1  User2     User3  User4     User5
User1  1.000000   0.50  0.666667    0.2  0.500000
User2  0.500000   1.00  0.250000    0.5  0.200000
User3  0.666667   0.25  1.000000    0.0  0.666667
User4  0.200000   0.50  0.000000    1.0  0.200000
User5  0.500000   0.20  0.666667    0.2  1.000000


In [45]:
# For every user, find the most similar *other* users (the user itself is excluded).
TOP_N = 2  # number of neighbours kept per user

topN_users = {}
for user in user_jaccard_similarity_matrix.index:
    # Similarities of `user` to everyone else; drop the self-similarity entry.
    others = user_jaccard_similarity_matrix.loc[user].drop(user)
    # nlargest returns the entries in descending order and, on ties, keeps the
    # one that appears first, so the selection is deterministic.
    topN_users[user] = list(others.nlargest(TOP_N).index)

In [46]:
# Show the neighbours selected for each user.
topN_users

{'User1': ['User3', 'User2'],
 'User2': ['User1', 'User4'],
 'User3': ['User1', 'User5'],
 'User4': ['User2', 'User1'],
 'User5': ['User3', 'User1']}

In [35]:
# Display the user-item table (users as rows, items as columns).
df

Unnamed: 0,Item A,Item B,Item C,Item D,Item E
User1,1,0,1,1,0
User2,1,0,0,1,1
User3,1,0,1,0,0
User4,0,1,0,1,1
User5,1,1,1,0,0


In [40]:
# Inspect a single user's row: a Series indexed by item label.
df.loc['User3']

Item A    1
Item B    0
Item C    1
Item D    0
Item E    0
Name: User3, dtype: int64

In [47]:
def _nonzero_items(row):
    """Return the set of labels in ``row`` whose value is neither 0 nor missing."""
    return set(row[row.ne(0) & row.notna()].index)


# Recommend to each user every item held by their similar users,
# minus the items the user already has.
rs_results = {}
for user, sim_users in topN_users.items():
    neighbour_items = set().union(
        *(_nonzero_items(df.loc[peer]) for peer in sim_users)
    )
    rs_results[user] = neighbour_items - _nonzero_items(df.loc[user])

rs_results

{'User1': {'Item E'},
 'User2': {'Item B', 'Item C'},
 'User3': {'Item B', 'Item D'},
 'User4': {'Item A', 'Item C'},
 'User5': {'Item D'}}

In [50]:
# Ratings case: explicit numeric scores instead of 0/1 interactions.
users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['Item A', 'Item B', 'Item C', 'Item D', 'Item E']

# One row per user, one column per item; None marks a rating that is missing
# (User1 has no score for Item E).
datasets = [
    [5, 3, 4, 4, None],
    [3, 1, 2, 3, 3],
    [4, 3, 4, 3, 5],
    [3, 3, 1, 5, 4],
    [1, 5, 5, 2, 1],
]

df = pd.DataFrame(data=datasets, index=users, columns=items)

# DataFrame.corr() correlates columns pairwise, so items are compared directly
# and users are compared after transposing the table.
item_similar = df.corr()
user_similar = df.transpose().corr()

for heading, matrix in (
    ("user corrlation", user_similar),
    ("item corrlation", item_similar),
):
    print(heading)
    print(matrix.round(4))

user corrlation
        User1   User2   User3   User4   User5
User1  1.0000  0.8528  0.7071  0.0000 -0.7921
User2  0.8528  1.0000  0.4677  0.4900 -0.9001
User3  0.7071  0.4677  1.0000 -0.1612 -0.4666
User4  0.0000  0.4900 -0.1612  1.0000 -0.6415
User5 -0.7921 -0.9001 -0.4666 -0.6415  1.0000
item corrlation
        Item A  Item B  Item C  Item D  Item E
Item A  1.0000 -0.4767 -0.1231  0.5322  0.9695
Item B -0.4767  1.0000  0.6455 -0.3101 -0.4781
Item C -0.1231  0.6455  1.0000 -0.7206 -0.4276
Item D  0.5322 -0.3101 -0.7206  1.0000  0.5817
Item E  0.9695 -0.4781 -0.4276  0.5817  1.0000


In [51]:
# Display the ratings table; the rating entered as None appears as a missing value.
df

Unnamed: 0,Item A,Item B,Item C,Item D,Item E
User1,5,3,4,4,
User2,3,1,2,3,3.0
User3,4,3,4,3,5.0
User4,3,3,1,5,4.0
User5,1,5,5,2,1.0
