In [108]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import pairwise_distances

In [109]:
metric = 'jaccard'
target_user_id = 1
target_user_id -= 1 # Convert to zero-based user Ids
knn = 3 # Number of nearest neighbors

# Loading datasets

In [110]:
ratings = np.loadtxt(
  open('ratings.csv', 'rb'),
  delimiter=',',
  skiprows=0,
  dtype=[('user_id', np.uint32), ('item_id', np.uint32), ('score', np.uint8)]
)

items = np.loadtxt(
  open('items.csv', 'rb'),
  delimiter=',',
  skiprows=0,
  dtype=[('item_id', np.uint32), ('name', str, 100)]
)

users = np.loadtxt(
  open('users.csv', 'rb'),
  delimiter=',',
  skiprows=0,
  dtype=[('user_id', np.uint32), ('name', np.str_, 100)]
)

In [111]:
pd.DataFrame(items).head(5)

Unnamed: 0,item_id,name
0,1,Diseño de logotipo (DG)
1,2,Busco un sitio web para un juego (IT)
2,3,Website para Venta de Cursos (IT)
3,4,Zoho Crm / Zoho Creator (IT)
4,5,Blog de Contenidos (IT)


In [112]:
pd.DataFrame(users).head(5)

Unnamed: 0,user_id,name
0,1,Jack Roy (IT)
1,2,Colorado Wood (DG)
2,3,Hamish Mccormick (DG)
3,4,Graham Jacobs (IT)
4,5,Caesar Mcgowan (DG)


In [113]:
pd.DataFrame(ratings).head(5)

Unnamed: 0,user_id,item_id,score
0,1,2,1
1,1,3,2
2,1,4,2
3,1,5,1
4,2,1,2


In [114]:
print 'There are %d ratings.' % len(ratings)
print 'There are %d users.' % len(users)
print 'There are %d items.' % len(items)
print 'Fetching recommendations for user "%s" (ID: %d)' % (users[target_user_id]['name'], users[target_user_id]['user_id'])

There are 35 ratings.
There are 10 users.
There are 10 items.
Fetching recommendations for user "Jack Roy (IT)" (ID: 1)


In [115]:
print 'Fetching recommendations for:'
pd.DataFrame([[users[target_user_id]['name']]], index=[users[target_user_id]['user_id']], columns=['Name'])

Fetching recommendations for:


Unnamed: 0,Name
1,Jack Roy (IT)


# Converting sparse data

In [116]:
# Create sparse matrix from ratings
sparse_ratings = csr_matrix(
    (ratings['score'], (ratings['user_id'] - 1, ratings['item_id'] - 1)),
    shape=(max(ratings['user_id']), max(ratings['item_id'])),
    dtype=np.uint32
)

print pd.DataFrame(sparse_ratings.todense(), index=np.unique(ratings['user_id']), columns=np.unique(ratings['item_id']))

    1   2   3   4   5   6   7   8   9   10
1    0   1   2   2   1   0   0   0   0   0
2    2   0   0   0   0   1   2   1   0   2
3    0   0   0   0   0   1   2   1   2   2
4    0   0   2   1   0   0   0   0   0   0
5    2   0   0   0   0   2   2   0   0   1
6    0   0   0   0   0   2   1   0   2   1
7    1   0   0   0   0   2   0   0   0   0
8    0   2   1   2   2   0   0   0   0   0
9    0   0   2   0   1   0   0   0   0   0
10   0   0   1   2   1   0   0   0   0   0


# Users similarity matrix

In [117]:
# Build distance matrix
distances = 1 - pairwise_distances(sparse_ratings.todense(), metric=metric)

# Print matrix
pd.options.display.float_format = '{:,.4f}'.format
pd.DataFrame(distances, index=np.unique(ratings['user_id']), columns=np.unique(ratings['user_id']))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
1,1.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.5,0.5
2,0.0,1.0,0.6667,0.0,0.4,0.0,0.0,0.0,0.0,0.0
3,0.0,0.6667,1.0,0.0,0.1667,0.2,0.0,0.0,0.0,0.0
4,0.25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.3333,0.0
5,0.0,0.4,0.1667,0.0,1.0,0.4,0.25,0.0,0.0,0.0
6,0.0,0.0,0.2,0.0,0.4,1.0,0.2,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.25,0.2,1.0,0.0,0.0,0.0
8,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.5
9,0.5,0.0,0.0,0.3333,0.0,0.0,0.0,0.0,1.0,0.3333
10,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.3333,1.0


# Get nearest neighbors

In [118]:
nearest_neighbors = np.argsort(distances[target_user_id])[::-1][:knn]
nearest_neighbors = nearest_neighbors[nearest_neighbors != target_user_id]

similarity_sum = 0
for user_id in nearest_neighbors:
  similarity_sum += distances[target_user_id, user_id]

print 'Nearest neighbors are:'

pd.DataFrame(
    [(users[user_id]['name'], users[user_id]['user_id'], distances[target_user_id, user_id]) for user_id in nearest_neighbors],
    columns=['name', 'user_id', 'distance']
)

Nearest neighbors are:


Unnamed: 0,name,user_id,distance
0,Callum Acosta (IT),10,0.5
1,Kenyon Mooney (IT),9,0.5


# Get recomendations

In [119]:
# Get items the nearest neighbors liked and the user didn't
ratings_subset = ratings[
    np.in1d(
        ratings['user_id'],
        nearest_neighbors + 1 # correct zero-based
    )
]

temp = np.zeros((ratings_subset.shape[0], 4)) # user_id, item_id, score, ranking
temp[:, 0] = ratings_subset['user_id']
temp[:, 1] = ratings_subset['item_id']
temp[:, 2] = ratings_subset['score']

for user_id in nearest_neighbors:
  # Multiply action weight * user similary
  temp[temp[:, 0] == user_id + 1, 3] += distances[target_user_id, user_id] * temp[temp[:, 0] == user_id + 1, 2]

unique_items = np.unique(temp[:, 1].astype(np.uint32))
scores = np.zeros((unique_items.size, 3))
scores[:, 0] = np.array(unique_items)

for item in unique_items:
    # (sum match similarity / similarity sum) * time decay
    scores[scores[:, 0] == item, 1] = (
        temp[temp[:, 1] == item][:, 3].sum() / similarity_sum
    )

sorted_scores = scores[np.argsort(scores[:, 1])[::-1]]
sorted_scores[:, 2] = np.arange(sorted_scores[:, 0].size) + 1

pd.options.display.float_format = '{:,.5f}'.format
pd.DataFrame(
    [(items[row[0] - 1]['name'], row[1]) for index, row in enumerate(sorted_scores)],
    index=sorted_scores[:, 2].astype(int),
    columns=['Name', 'Score']
)

Unnamed: 0,Name,Score
1,Website para Venta de Cursos (IT),1.5
2,Blog de Contenidos (IT),1.0
3,Zoho Crm / Zoho Creator (IT),1.0
