In [131]:
import pickle
import pandas as pd
import scipy
import numpy as np
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [130]:
import keywords


In [160]:
# Restore the cached TF-IDF artefacts: the document-term matrix, its feature
# names, and the item ids whose positions are later used as row indices into
# the matrix.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('tfidf.pkl', mode='rb') as tfidf_file:
    tfidf_matrix, tfidf_feature_names, full_item_ids = pickle.load(tfidf_file)

In [161]:
# Restore the cached interaction data: a utility matrix and the interactions
# table (columns personId / contentId / rating, as displayed further down).
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('interactions.pkl', mode='rb') as interactions_file:
    utility_matrix, interactions_work = pickle.load(interactions_file)

In [170]:
# Distinct content ids that occur in the interactions table; these are the
# candidate items for recommendation.
items_subset = list(interactions_work['contentId'].unique())
len(items_subset)

129

In [199]:
# USER was otherwise only assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this cell failed with NameError. Define it before its
# first use; the value is the same one the later cell assigns.
USER = -9016528795238256703
# Show every interaction recorded for that user.
interactions_work[interactions_work['personId'] == USER]

Unnamed: 0,personId,contentId,rating
0,-9016528795238256703,-6136272094613269629,2.0
1,-9016528795238256703,-2711301039947937868,2.0
2,-9016528795238256703,-1590585250246572231,2.0
3,-9016528795238256703,1933229167501870037,2.0
4,-9016528795238256703,5191381587333696286,2.0
5,-9016528795238256703,6200172800690402606,2.0


## Build item and user profile vectors

In [127]:
def get_items_profile(full_ids, items_ids, tfidf_matrix):
    """Stack the TF-IDF rows of the requested items into one sparse matrix.

    Parameters
    ----------
    full_ids : sequence
        Item ids; the position of an id is used as its row index in
        ``tfidf_matrix``. If an id occurs more than once, its first position
        is used (the same rule as ``list.index``).
    items_ids : iterable
        Ids of the items whose rows are wanted, in the desired output order.
    tfidf_matrix : matrix supporting row slicing
        Matrix with one row per entry of ``full_ids``.

    Returns
    -------
    sparse matrix
        One row per entry of ``items_ids``; a zero-row slice of
        ``tfidf_matrix`` when ``items_ids`` is empty.

    Raises
    ------
    ValueError
        If an id in ``items_ids`` does not occur in ``full_ids``.
    """
    # Build the id -> row lookup in one pass instead of running two linear
    # ``full_ids.index`` scans for every requested item.
    row_by_id = {}
    for row, item_id in enumerate(full_ids):
        row_by_id.setdefault(item_id, row)  # keep the first occurrence

    item_profiles_list = []
    for item_id in items_ids:
        if item_id not in row_by_id:
            # Same exception type that ``list.index`` raised before.
            raise ValueError('{!r} is not in list'.format(item_id))
        row = row_by_id[item_id]
        item_profiles_list.append(tfidf_matrix[row:row + 1])

    if not item_profiles_list:
        # scipy.sparse.vstack cannot stack an empty list; hand back an empty
        # slice that still carries the right number of columns.
        return tfidf_matrix[0:0]
    return scipy.sparse.vstack(item_profiles_list)

def build_user_profile(user_id, interactions, full_item_list, tfidf_matrix):
    """Build a user's profile as the rating-weighted mean of item TF-IDF rows.

    Parameters
    ----------
    user_id
        Value to look up in the ``personId`` column of ``interactions``.
    interactions : pandas.DataFrame
        Table with ``personId``, ``contentId`` and ``rating`` columns.
    full_item_list : sequence
        Item ids whose positions are used as row indices into ``tfidf_matrix``.
    tfidf_matrix : sparse matrix
        Matrix with one row per entry of ``full_item_list``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, n_features)``, normalised with scikit-learn's
        ``normalize`` (default norm).

    Raises
    ------
    KeyError
        If ``user_id`` has no rows in ``interactions``.
    ValueError
        If the user's ratings sum to zero, or an interacted item is missing
        from ``full_item_list``.
    """
    # Import locally: a bare ``import sklearn`` does not guarantee that the
    # ``preprocessing`` submodule has been loaded.
    from sklearn.preprocessing import normalize

    # A boolean mask always yields a DataFrame. ``set_index(...).loc[user_id]``
    # collapses to a Series when the user has exactly one interaction, which
    # made the column lookups below return scalars.
    interactions_person_df = interactions[interactions['personId'] == user_id]
    if interactions_person_df.empty:
        # Same exception type that ``.loc`` raised for an unknown user.
        raise KeyError(user_id)

    items_interacted_profiles = get_items_profile(
        full_item_list, interactions_person_df['contentId'], tfidf_matrix)
    user_item_ratings = np.asarray(interactions_person_df['rating'], dtype=float).reshape(-1, 1)

    rating_total = user_item_ratings.sum()
    if rating_total == 0:
        # Previously this divided by zero and only failed later, inside
        # normalize, with a less helpful message.
        raise ValueError(
            'ratings of user {!r} sum to zero; cannot compute a weighted average'.format(user_id))

    weighted_sum = items_interacted_profiles.multiply(user_item_ratings).sum(axis=0)
    # Summing a sparse matrix yields np.matrix, which recent scikit-learn
    # releases reject; convert to a plain 2-D ndarray with a single row.
    weighted_avg = np.asarray(weighted_sum).reshape(1, -1) / rating_total
    user_profile_normalized = normalize(weighted_avg)
    return user_profile_normalized

In [128]:
# Example item and user ids. ITEM is not referenced in the cells shown here.
ITEM = -4110354420726924665
USER = -9016528795238256703
profile = build_user_profile(
    user_id=USER,
    interactions=interactions_work,
    full_item_list=full_item_ids,
    tfidf_matrix=tfidf_matrix,
)

# The ten largest profile weights ...
topn = sorted(profile[0, :], reverse=True)[:10]
# ... and the feature names at the positions of the ten largest weights
# (argsort on the negated weights puts the largest first).
index_topn = np.argsort(-profile[0, :])[:10]
important_words = [tfidf_feature_names[word_idx] for word_idx in index_topn]

## Similarities

In [243]:
# Items the user has already interacted with are excluded from the candidates.
items_to_ignore = interactions_work[interactions_work['personId'] == USER].contentId.unique()
# ``x not in <ndarray>`` scans the whole array for every candidate; testing
# membership against a set keeps the filter linear in the number of items.
ignored_ids = set(items_to_ignore)
items_to_compare = [x for x in items_subset if x not in ignored_ids]
subset_items_profiles = get_items_profile(full_item_ids, items_to_compare, tfidf_matrix)
# Cosine similarity between the user's profile and every candidate item.
cosine_similarities = cosine_similarity(profile, subset_items_profiles)
# Label each similarity column with the candidate's content id.
results = pd.DataFrame(cosine_similarities, columns=items_to_compare)
results

Unnamed: 0,-4374331682165863764,809601605585939618,-8773118241761372618,-7315032288233856709,-2176468683077766369,-4102297002729307038,2709926970543371965,6157037646878010131,-6142462826726347616,-4760639635023250284,...,6652210819857467321,-2778760500673113802,1003778007373126185,-4110354420726924665,5619251370090681244,84318068629167514,3703141283586666995,-7101541512657907485,3067875254349597654,7767869406844505704
0,0.193578,0.049124,0.108123,0.074888,0.22369,0.13895,0.001444,0.01243,0.28966,0.054628,...,0.177694,0.120092,0.051659,0.330539,0.058624,0.033673,0.061973,0.159983,0.220213,0.074679


## Get top-N recommendations

In [235]:
# Number of recommendations to keep.
topn = 10
# Sorting the negated scores ascending puts the highest similarities first.
similar_indices = np.argsort(-cosine_similarities[0, :])[:topn]
# Pair each selected candidate's content id with its similarity score.
similar_items = [
    (items_to_compare[position], cosine_similarities[0, position])
    for position in similar_indices
]
similar_items

[(-7926018713416777892, 0.3567839157763085),
 (-4110354420726924665, 0.3305389560443456),
 (-6195775145989617417, 0.3119655478027027),
 (3353902017498793780, 0.30974111893261985),
 (2255603060224026824, 0.3008502925389378),
 (2900029983899283273, 0.29484672782905363),
 (-6142462826726347616, 0.2896595392637854),
 (8471417198703153120, 0.28893926818464977),
 (6023609667389715259, 0.2808048345687099),
 (-9171475473795142532, 0.2722205561089389)]