In [1]:
import numpy as np
from lightfm import LightFM
from lightfm.datasets import fetch_stackexchange, fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score

from scipy.spatial.distance import cdist



In [2]:
import seaborn as sns

In [3]:
# Load the StackExchange "crossvalidated" dataset, using 'data' as the
# local data directory. Item features are built from tags only
# (tag_features=True); per-item indicator features are switched off.
data = fetch_stackexchange(
    dataset='crossvalidated',
    data_home='data',
    indicator_features=False,
    tag_features=True,
)

In [4]:
# Show the shape of the item-feature matrix next to the shape of its label
# vector, so the two can be compared at a glance.
tuple(data[key].shape for key in ('item_features', 'item_feature_labels'))

((72360, 1246), (1246,))

In [5]:
# Pull the interaction matrices and the item-feature matrix out of the
# dataset dict under short names.
train = data['train']
test = data['test']
features = data['item_features']

In [6]:
# LightFM?

In [7]:
# Build (but do not yet fit) a 20-component LightFM model using the WARP
# loss. Both alpha penalties are set to 0 and the seed is fixed.
model = LightFM(
    no_components=20,
    loss='warp',
    user_alpha=0,
    item_alpha=0,
    random_state=42,
)

In [8]:
# Fit the model on the training interactions for 20 epochs, passing the
# item-feature matrix; the %time magic reports how long the call takes.
%time model.fit(train, epochs=20, item_features=features)

CPU times: user 1.06 s, sys: 5.1 ms, total: 1.06 s
Wall time: 1.06 s


<lightfm.lightfm.LightFM at 0x14adb5880>

In [9]:
def get_similar(query='regression', n=10, labels=None, embeddings=None):
    """Return the ``n`` labels whose embeddings are most similar to ``query``.

    Similarity is the cosine similarity between rows of the embedding matrix.
    The query label takes part in the ranking, so it normally comes back as
    the first element.

    Parameters
    ----------
    query : str
        Label to look up; must be present in ``labels``.
    n : int
        Maximum number of labels to return.
    labels : array-like of str, optional
        One label per embedding row. Defaults to the notebook-level
        ``data['item_feature_labels']``.
    embeddings : array-like, shape (len(labels), d), optional
        Row ``i`` is the embedding of ``labels[i]``. Defaults to the
        notebook-level ``model.item_embeddings``.

    Returns
    -------
    numpy.ndarray
        Up to ``n`` labels, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``query`` does not occur in ``labels``.
    """
    # Fall back to the notebook globals only when the caller gave nothing.
    if labels is None:
        labels = data['item_feature_labels']
    if embeddings is None:
        embeddings = model.item_embeddings
    labels = np.asarray(labels)
    embeddings = np.asarray(embeddings, dtype=float)

    matches = np.flatnonzero(labels == query)
    if matches.size == 0:
        # Same exception type as the old bare [0][0] lookup, but readable.
        raise IndexError('query %r not found among the labels' % (query,))
    row_id = matches[0]

    # Scale every row to unit length; rows with zero norm have no direction,
    # so they stay at zero instead of triggering a division by zero.
    norms = np.linalg.norm(embeddings, axis=1)
    has_direction = norms > 0
    unit_rows = np.zeros_like(embeddings)
    unit_rows[has_direction] = (
        embeddings[has_direction] / norms[has_direction, np.newaxis]
    )

    # For unit vectors the dot product is the cosine similarity.
    similarity = unit_rows @ unit_rows[row_id]
    # Undefined similarities are pushed to the very end of the ranking.
    similarity[~has_direction] = -np.inf

    most_similar = np.argsort(-similarity, kind='stable')[:n]
    return labels[most_similar]

In [10]:
# List the labels that sit closest to 'pca' in embedding space.
get_similar(query='pca')

array(['pca', 'discriminant-analysis', 'steins-phenomenon',
       'dimensionality-reduction', 'svd', 'pls', 'manova',
       'canonical-correlation', 'quotation', 'suppressor'], dtype='<U50')

In [11]:
# Mean precision@k on the held-out interactions (library default k). The
# training interactions are passed in alongside the test set.
test_precision = precision_at_k(
    model,
    test,
    train_interactions=train,
    item_features=features,
).mean()
print('Precision: test %.2f.' % (test_precision))

# Mean AUC on the same held-out interactions, with the same arguments.
test_auc = auc_score(
    model,
    test,
    train_interactions=train,
    item_features=features,
).mean()
print('AUC: test %.2f.' % (test_auc))

Precision: test 0.01.
AUC: test 0.72.


In [12]:
# def sample_recommendation(model, data, user_ids):
#     n_users, n_items = data['train'].shape
#     for user_id in user_ids:
#         known_positives_indx = data['train'].tocsr()[user_id].indices       

#         scores = model.predict(user_id, np.arange(n_items))
#         top_items_indx = np.argsort(-scores)
        
#         print("User %s" % user_id)
#         print("     Known positives:")
        
#         for x in known_positives_indx[:3]:
#             print("       {}: {}" .format(x, data['item_feature_labels'][x]))

#         print("     Recommended:")
        
#         for x in top_items_indx[:3]:
#             print("        {}: {}" .format(x, data['item_feature_labels'][x]))


In [13]:
# sample_recommendation(model, data, [33])

In [14]:
# import numpy as np
# # from scipy.stats import pearsonr, linregress
# # from sklearn.decomposition import PCA
# from scipy.spatial.distance import cdist

In [15]:
# n_iterations = 10
# correlations = np.zeros((50,50,n_iterations))*np.nan

In [16]:
# data['train'] = data['train'].tocsr()
# data['item_features'] = data['item_features'].tocsr()

In [17]:
# n_users, n_items = data['train'].shape
# n_users, n_items

In [18]:
# for i in range(n_iterations):
#     this_correlations = []
#     this_pvalues = []
    
#     half1_idx = np.random.choice(n_items, n_items//2, replace=False)
#     half2_idx = np.setdiff1d(np.arange(n_items), half1_idx)
#     # half1_idx.shape, half2_idx.shape
#     assert n_items == (len(half1_idx) + len(half2_idx))
#     # data['train'].shape, data['item_features'].shape
    
#     # half 1
#     half1_train = data['train'][:, half1_idx]
#     half1_features= data['item_features'][half1_idx, :]
    
#     half1_model = LightFM(no_components=50, loss='warp', user_alpha=0, item_alpha=0, random_state=42)
#     half1_model.fit(half1_train, epochs=30, item_features=half1_features)
#     half1_item_embeddings = half1_model.item_embeddings
    
#     # half1_model = PCA(n_components=100)
#     # half1_item_embeddings = half1_model.fit_transform(half1_train.todense())

#     # half 2
#     half2_train = data['train'][:, half2_idx]
#     half2_features= data['item_features'][half2_idx, :]
    
#     half2_model = LightFM(no_components=50, loss='warp', user_alpha=0, item_alpha=0, random_state=42)
#     half2_model.fit(half2_train, epochs=30, item_features=half2_features)
#     half2_item_embeddings = half2_model.item_embeddings
    
#     # half2_model = PCA(n_components=100)
#     # half2_item_embeddings = half2_model.fit_transform(half2_train.todense())
    
    
#     correlations[:,:,i] = 1-cdist(half1_item_embeddings, half2_item_embeddings, metric='correlation')
    
# #     for c in range(100):
# #         # half1_model.item_embeddings.shape, half2_model.item_embeddings.shape
# #         pr = pearsonr(half1_item_embeddings[:,c], half2_item_embeddings[:,c])
# #         this_correlations.append(pr.statistic)
# #         this_pvalues.append(pr.pvalue)
    
# #     correlations.append(this_correlations)
# #     pvalues.append(this_pvalues)    
    

In [19]:
# sns.heatmap(correlations.mean(axis=2))

In [20]:
# for c,p in zip(np.array(correlations).mean(axis=0), np.array(pvalues).mean(axis=0)):
#     print(c,'\t',p)

In [21]:
# correlations = 1-cdist(half1_item_embeddings, half2_item_embeddings, metric='correlation')

In [22]:
# from sklearn.cluster import k_means

In [23]:
# cluster = k_means(correlations, n_clusters=10)

In [24]:
# sns.heatmap(correlations[np.argsort(cluster[1]),:])