In [7]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import ShuffleSplit, KFold
import matplotlib.pyplot as plt
from scipy import sparse
from itertools import product
from time import time
import seaborn
%run Helpers.ipynb

In [9]:
# Experiment grid for the user nearest-neighbour search.
# NOTE(review): U, Uhat, X and Utest are not defined in this notebook's visible
# cells; presumably they come from `%run Helpers.ipynb` - confirm.
n_components = [10, 20, 40, 60]   # SVD dimensionalities to try
data = [U, Uhat]                  # plotted later as 'Original' and 'TF-IDF'
num_user = X.shape[0]
metrics = ['cosine', 'minkowski']
#also should iterate over number of neighbors used

# Result arrays are indexed [data, metric, n_components]. Derive the shape from
# the grid lists so that editing a list cannot leave the arrays mis-sized.
grid_shape = (len(data), len(metrics), len(n_components))
scores = np.zeros(grid_shape)
times = np.zeros(grid_shape)

# Last column of Utest, densified and cast to int; used below to pick the rows
# of the embedded matrix that act as query points.
user_test = Utest[:,-1].toarray().flatten().astype('int')
# Row positions 0..Utest.shape[0]-1, passed to user_score alongside each
# query's neighbour list.
user_test_idx = np.arange(0, Utest.shape[0])

In [14]:
# Grid search over (input matrix i, SVD size k, distance metric j).
# For each combination: embed the matrix with TruncatedSVD, fit a brute-force
# NearestNeighbors index on the embedding, time the 5-neighbour query for the
# test users, and store the mean user_score in scores[i, j, k].
#
# The embedding depends only on (i, k), so it is fitted once and shared by all
# metrics. TruncatedSVD's default solver is randomized; a fixed random_state
# makes the run reproducible and guarantees that both metrics are compared on
# the same embedding.
for i in range(len(data)):
    for k in range(len(n_components)):
        #make pca
        pca_data = TruncatedSVD(n_components=n_components[k], random_state=0).fit_transform(data[i])

        for j in range(len(metrics)):
            #make and train neighbors
            nn  = NearestNeighbors(metric=metrics[j], algorithm='brute').fit(pca_data)

            # Only the query is timed, not the SVD or the index construction.
            # NOTE(review): the query rows are taken from the fitted matrix
            # itself, so each query's own row is among the candidates -
            # confirm user_score expects that.
            start = time()
            neighbors = nn.kneighbors(pca_data[user_test], n_neighbors=5)[1]
            times[i,j,k] = time() - start

            #check if we got the recipe they liked
            temp_score = [user_score(u, n) for u, n in zip(user_test_idx, neighbors)]
            scores[i,j,k] = np.mean(temp_score)
            print(i,j,k, scores[i,j,k])

# Persist the full grids so the plotting cells can be rerun without recomputing.
np.save("results/scores_userNN.npy", scores)
np.save("results/times_userNN.npy", times)

0 0 0 20.0
0 0 1 20.0
0 0 2 20.0
0 0 3 20.0
0 1 0 20.0
0 1 1 20.0
0 1 2 20.0
0 1 3 20.0


In [12]:
# Scratch check: a single run on U with 60 SVD components and NearestNeighbors
# left at its default settings.
#make pca
pca_data = TruncatedSVD(n_components=60).fit_transform(U)

#make and train neighbors
nn  = NearestNeighbors().fit(pca_data)
start = time()
neighbors = nn.kneighbors(pca_data[user_test], 5)[1]
# Keep the timing in its own variable. Writing to times[i,j,k] relied on loop
# indices left over from the grid-search cell: it fails on a fresh kernel and
# otherwise overwrites one entry of the results grid.
single_run_time = time() - start

In [13]:
# Shape of the neighbour-index array (element [1] of the kneighbors result):
# one row per query point and one column per requested neighbour (5 here).
neighbors.shape

(13639, 5)

In [None]:
#check if we got the recipe they liked
# Score the `neighbors` array currently in memory. A fresh list and a local
# result are used on purpose: appending to the grid search's `temp_score` would
# mix two runs into one mean, and writing scores[i,j,k] would overwrite a grid
# entry through stale loop indices.
single_run_scores = [user_score(u, n) for u, n in zip(user_test_idx, neighbors)]
single_run_score = np.mean(single_run_scores)
print(single_run_score)

In [None]:
# Heatmaps of the mean user_score for every (metric, n_components) pair, one
# panel per input matrix. Loads the grid saved by this notebook's search cell;
# the previous version read and overwrote the recipe notebook's files.
scores = np.load("results/scores_userNN.npy")

# Shared colour limits taken from the data so the two panels are comparable
# (the old fixed 10.6-12.4 range did not cover this notebook's scores).
score_min, score_max = scores.min(), scores.max()
palette = seaborn.diverging_palette(20, 220, n=200)

fig,ax = plt.subplots(1, 2, figsize=(12,3))
ax = ax.reshape(-1)
for i in range(len(data)):
    seaborn.heatmap(scores[i], vmin=score_min, vmax=score_max, xticklabels=n_components, yticklabels=metrics, cmap=palette, ax=ax[i])
    ax[i].set_ylabel('Metrics')
    ax[i].set_xlabel('Number of PCA Components')

# NOTE(review): user_score is defined outside this notebook; refine the title
# once its exact meaning is confirmed.
fig.suptitle('Average User Score')
ax[0].set_title('Original')
ax[1].set_title('TF-IDF')
plt.savefig("figs/userNN_scores.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Heatmaps of the neighbour-query time for every (metric, n_components) pair,
# one panel per input matrix. Loads the grid saved by this notebook's search
# cell; the previous version read and overwrote the recipe notebook's files.
times = np.load("results/times_userNN.npy")

# Shared colour limits taken from the data so the two panels are comparable
# (the old fixed 32-42 range was specific to another experiment).
time_min, time_max = times.min(), times.max()
palette = seaborn.diverging_palette(20, 220, n=200)

fig,ax = plt.subplots(1, 2, figsize=(12,3))
ax = ax.reshape(-1)
for i in range(len(data)):
    seaborn.heatmap(times[i], vmin=time_min, vmax=time_max, xticklabels=n_components, yticklabels=metrics, cmap=palette, ax=ax[i])
    ax[i].set_ylabel('Metrics')
    ax[i].set_xlabel('Number of PCA Components')

# The query set is pca_data[user_test], so the number of timed points is
# len(user_test). Each cell holds one wall-clock measurement in seconds.
fig.suptitle(f'Time (s) to find Neighbors for {len(user_test)} pts')
ax[0].set_title('Original')
ax[1].set_title('TF-IDF')
plt.savefig("figs/userNN_times.pdf", bbox_inches="tight")
plt.show()