In [1]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
import seaborn as sns
import pickle
from collections import Counter

from sklearn.metrics import make_scorer
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA, KernelPCA, LatentDirichletAllocation
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator

from hypopt import GridSearch
from sklearn.pipeline import Pipeline

In [2]:
# Load everything.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only point these paths at files produced by this project's own cleaning step.
num_ing = 8023  # presumably the number of distinct ingredients -- TODO confirm

# Use context managers so the file handles are closed right after loading
# instead of being left open until garbage collection.
with open('data-cleaned/recipe_decoder.pkl', 'rb') as f:
    recipe_decoder = pickle.load(f)
with open('data-cleaned/ingredient_decoder.pkl', 'rb') as f:
    ingr_decoder = pickle.load(f)
with open('data-cleaned/tag_decoder.pkl', 'rb') as f:
    tag_decoder = pickle.load(f)

# Sparse matrices saved with scipy.sparse.save_npz.
X     = sparse.load_npz("data-cleaned/recipes.npz")           # recipe rows (indexed by recipe id in recipe_score)
Xhat  = sparse.load_npz("data-cleaned/recipes_tfidf.npz")     # presumably the TF-IDF weighted recipes (per file name)
U     = sparse.load_npz("data-cleaned/user_train.npz")        # training user rows; nonzero columns are read as recipe ids
Uhat  = sparse.load_npz("data-cleaned/user_train_tfidf.npz")  # presumably the TF-IDF weighted user matrix (per file name)
Utest = sparse.load_npz("data-cleaned/user_test.npz")         # held-out user rows; last column is read as the user index
def intersection(lst1, lst2):
    """Return the elements that appear in both ``lst1`` and ``lst2``.

    Both inputs are converted to sets first, so duplicates are dropped and
    the order of the returned list is not guaranteed.
    """
    first, second = set(lst1), set(lst2)
    shared = first & second
    return list(shared)

I'll split the whole process into 4 steps:

  1. Dimension Reduction
  2. Get nearest options
  3. Choose 5 best from that set
  4. Scoring

Some components will cover both steps (3) and (4). We'll use an sklearn `Pipeline` to chain the steps together. Our own methods need a little tweaking to fit into it, but we can make them work.

In [42]:
def recipe_score(i, js):
    """Score how similar recipe ``i`` is to each recipe in ``js``.

    For every ``j`` in ``js`` the score is:

    * the number of stored entries equal to 2 in ``X[i, :-1] + X[j, :-1]``
      (i.e. columns set in both rows, assuming the entries are 0/1 --
      TODO confirm), plus
    * 1 if the last column of both rows holds the same value (presumably a
      category/tag id -- confirm), else 0.

    Parameters
    ----------
    i : int
        Row index into the module-level matrix ``X``.
    js : array-like of int
        Row indices into ``X`` to compare against ``i``.

    Returns
    -------
    numpy.ndarray
        Array shaped (and typed) like ``js`` holding one score per entry.
    """
    similarity = np.zeros_like(js)
    # walk through every recommended recipe
    for pos, other in enumerate(js):
        summed = X[i, :-1] + X[other, :-1]
        overlap = np.count_nonzero(summed.data == 2)
        same_last = int(X[i, -1] == X[other, -1])
        similarity[pos] = overlap + same_last
    return similarity

def recommend_scoring(y_true, y_pred):
    """Score recommendations against the recipes each user actually liked.

    For every row, each recommended recipe gets the best ``recipe_score``
    it achieves against any of that row's liked recipes; the row's result is
    the mean of those best scores.

    Parameters
    ----------
    y_true : iterable of iterables of int
        Liked recipe ids per row; entries equal to ``-1`` are skipped.
    y_pred : array-like of int
        Recommended recipe ids, one row per entry of ``y_true``.

    Returns
    -------
    numpy.ndarray
        One mean score per row (the result of ``mean(axis=1)``).
    """
    best = np.zeros_like(y_pred)
    # walk through every row of liked / recommended recipes
    for row, (liked, recommended) in enumerate(zip(y_true, y_pred)):
        # keep, per recommendation, the score against its closest liked recipe
        for recipe in liked:
            if recipe != -1:
                best[row] = np.maximum(best[row], recipe_score(recipe, recommended))

    return best.mean(axis=1)

def _mean_recommend_score(y_true, y_pred):
    """Reduce the per-row scores from ``recommend_scoring`` to one number.

    ``recommend_scoring`` returns one value per row, but a scikit-learn
    scorer has to yield a single number, so the rows are averaged here.
    """
    return float(np.mean(recommend_scoring(y_true, y_pred)))

# Higher is better: more shared ingredients between liked and recommended recipes.
scorer = make_scorer(_mean_recommend_score, greater_is_better=True)

In [40]:
class kNN(BaseEstimator):
    """Recommend recipes from the likes of a user's nearest neighbours.

    The estimator is fitted on a (possibly dimension-reduced) user matrix.
    ``predict`` looks up each queried user's nearest fitted rows and
    recommends the recipes most common among those rows in the
    module-level matrix ``U``.

    NOTE(review): the neighbour indices returned by the fitted model are
    used to index ``U`` directly, so the rows passed to ``fit`` must line up
    with the rows of ``U`` -- confirm against the caller.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbouring users to draw recipes from (excluding the
        queried user itself).
    metric : str, default='minkowski'
        Distance metric forwarded to ``NearestNeighbors``.
    algorithm : str, default='brute'
        Search algorithm forwarded to ``NearestNeighbors``.
    """

    # number of recipes recommended per user
    N_RECOMMEND = 5

    def __init__(self, n_neighbors=5, metric='minkowski', algorithm='brute'):
        # Store the hyper-parameters exactly as given: sklearn's
        # get_params / set_params / clone require __init__ not to modify
        # them, otherwise cloning fails and grid-search values set through
        # set_params never reach the underlying model.
        self.n_neighbors = n_neighbors
        self.metric      = metric
        self.algorithm   = algorithm

    def fit(self, X, y=None):
        """Fit the nearest-neighbour index on ``X``; ``y`` is ignored."""
        # Build the model here so it always reflects the current parameters.
        # One extra neighbour is requested because predict() drops the first
        # hit, which is expected to be the queried user itself.
        self.estimator = NearestNeighbors(n_neighbors=self.n_neighbors + 1,
                                          metric=self.metric,
                                          algorithm=self.algorithm)
        self.estimator.fit(X)
        return self

    def predict(self, X):
        """Return ``N_RECOMMEND`` recommended recipe ids for each row of ``X``.

        If the neighbours share fewer than ``N_RECOMMEND`` distinct recipes,
        the recommendations found are repeated to fill the row.

        Raises
        ------
        ValueError
            If a user's neighbours have no recipes at all.
        """
        #find closest users (first column dropped: assumed to be the user itself)
        idxs = self.estimator.kneighbors(X)[1][:,1:]

        rec_recipes = np.zeros((X.shape[0], self.N_RECOMMEND), dtype='int')
        #iterate through each user's closest users
        for i, idx in enumerate(idxs):
            #find their liked recipes
            # assumes every nonzero column of U is a recipe id -- TODO confirm
            close_recipes = U[idx].nonzero()[1]
            #and find the most common ones to recommend
            top = [recipe for recipe, _ in
                   Counter(close_recipes).most_common(self.N_RECOMMEND)]
            if not top:
                raise ValueError(
                    "Neighbours of row {} have no recipes to recommend".format(i))
            # np.resize repeats the list when it is shorter than N_RECOMMEND,
            # avoiding a shape-mismatch error on assignment.
            rec_recipes[i] = np.resize(top, self.N_RECOMMEND)

        return rec_recipes

    def score(self, X, y):
        """Mean recommendation score of ``predict(X)`` against liked recipes ``y``.

        Returns a single float (higher is better).
        """
        # recommend_scoring expects (y_true, y_pred) and returns one value
        # per row; average them into a single number.
        return float(np.mean(recommend_scoring(y, self.predict(X))))

In [29]:
# The last column of Utest is read as each test row's user index
# (used later to pick the matching rows of U).
user_test = Utest[:,-1].toarray().flatten().astype('int')

# y[i] holds the held-out recipe ids of test row i, padded with -1 when the
# row has only one recipe.
# Drop the user-index column *before* looking for nonzeros: trimming the last
# nonzero entry afterwards would discard a real recipe whenever the user
# index is 0 (and therefore not a nonzero entry).
held_out = Utest[:, :-1]
y = np.full((Utest.shape[0], 2), -1, dtype='int')
for i in range(len(y)):
    recipes = held_out[i].nonzero()[1]
    if not 1 <= len(recipes) <= 2:
        raise ValueError(
            "Test row {} has {} recipes; expected 1 or 2.".format(i, len(recipes)))
    y[i, :len(recipes)] = recipes

# Placeholder targets for fitting: one row per row of U, all zeros.
y_blank = np.zeros((U.shape[0], 2))

In [44]:
# Reduce the user matrix with truncated SVD (step is named "pca"), then
# recommend from the nearest users in the reduced space.
# random_state pins the randomized SVD solver so reruns give the same result.
pipe = Pipeline([("pca", TruncatedSVD(n_components=10, random_state=0)),
                ("knn", kNN(n_neighbors=5))])

# Hyper-parameter grid: 2 x 2 = 4 combinations.
pipe_params = {"pca__n_components": [10,20],
                "knn__n_neighbors": [5, 10]}

# Fit on all of U with placeholder targets; the last two arguments are
# presumably the validation inputs/targets used to compare the
# combinations (per hypopt's fit signature -- confirm).
opt = GridSearch(model=pipe, param_grid=pipe_params)
opt.fit(U, y_blank, U[user_test], y)



  0%|          | 0/4 [00:00<?, ?it/s][A[A