In [2]:
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse

from sklearn.base import BaseEstimator, clone
from sklearn.cluster import KMeans, SpectralClustering, DBSCAN
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA, KernelPCA, LatentDirichletAllocation
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

from searchgrid import set_grid, make_grid_search, make_pipeline

In [3]:
# Load every cleaned artefact the rest of the notebook relies on.
num_ing = 8023  # NOTE(review): presumably the number of ingredient columns — confirm against the cleaning step

# Decoders are unpickled inside `with` blocks so the file handles are closed
# deterministically instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('data-cleaned/recipe_decoder.pkl', 'rb') as fh:
    recipe_decoder = pickle.load(fh)
with open('data-cleaned/ingredient_decoder.pkl', 'rb') as fh:
    ingr_decoder = pickle.load(fh)
with open('data-cleaned/tag_decoder.pkl', 'rb') as fh:
    tag_decoder = pickle.load(fh)

# Sparse matrices; the "*_tfidf" files are presumably TF-IDF weighted variants (per file name).
X     = sparse.load_npz("data-cleaned/recipes.npz")
Xhat  = sparse.load_npz("data-cleaned/recipes_tfidf.npz")
U     = sparse.load_npz("data-cleaned/user_train.npz")
Uhat  = sparse.load_npz("data-cleaned/user_train_tfidf.npz")
Utest = sparse.load_npz("data-cleaned/user_test.npz")
def intersection(lst1, lst2):
    """Return the distinct elements found in both iterables as a list.

    Duplicates are collapsed and the order of the result is unspecified,
    because the computation goes through sets.
    """
    shared = set(lst1).intersection(set(lst2))
    return list(shared)

## Make y data for scoring

In [4]:
# The last column of Utest stores, for every test row, the row index of that
# user inside U (it is used below as U[user_test]).
user_test = Utest[:,-1].toarray().flatten().astype('int')

# y[i] holds the one or two held-out recipe ids of test row i; -1 means "no recipe".
y = np.full((Utest.shape[0], 2), -1, dtype='int')
for i in range(len(y)):
    # Drop the index column *before* asking for nonzeros. Trimming the last
    # nonzero entry afterwards silently discards a real recipe whenever the
    # stored user index is 0 (a zero is not reported by nonzero()).
    recipes = Utest[i, :-1].nonzero()[1]
    if not 1 <= len(recipes) <= 2:
        raise ValueError(f"Expected 1 or 2 held-out recipes for test row {i}, found {len(recipes)}")
    # Slice assignment covers both the 1- and 2-recipe case and avoids the
    # deprecated conversion of a 1-element array to a scalar.
    y[i, :len(recipes)] = recipes

# Placeholder labels for the training rows. They must be integers: a float
# block would turn y_tog into a float array, and recipe ids taken from it are
# later used as row indices into X.
y_blank = np.zeros((U.shape[0], 2), dtype='int')

# Stack the test users' training rows on top of the full training matrix so a
# single (X, y) pair can be handed to the grid search.
U_tog = sparse.vstack([U[user_test], U])
y_tog = np.concatenate([y, y_blank])

test_fold = np.concatenate([
    # Fold 0: the rows that are scored (the test users).
    np.full(len(user_test), 0, dtype=np.int8),
    # -1: never placed in a test fold, i.e. always part of the training set.
    np.full(U.shape[0], -1, dtype=np.int8)
])
cv = PredefinedSplit(test_fold)

I'll split the whole process into 4 steps:

  1. Dimension Reduction
  2. Get nearest options
  3. Choose 5 best from that set
  4. Scoring

Some of these will cover both (3) and (4). We'll use sklearn Pipeline to get things done. Our methods will just take a little tweaking, but we can make them work.

## Scoring Functions and Recommender Class - we'll inherit from it as we go

In [5]:
def recipe_score(i, js):
    """Score each recommended recipe in `js` against the liked recipe `i`.

    For every recommendation the score is the number of columns (all but the
    last one of the module-level matrix X) where the two rows sum to exactly 2,
    plus 1 when the last-column values of both rows are equal.
    Entries equal to -1 (missing recommendation) keep a score of 0.

    NOTE(review): counting sums equal to 2 as "shared" presumably relies on
    X[:, :-1] being a 0/1 matrix — confirm.

    Returns an array with the same shape and dtype as `js`.
    """
    scores = np.zeros_like(js)
    for slot, rec in enumerate(js):
        # -1 marks a missing recommendation; leave its score at 0
        if rec != -1:
            summed = X[i, :-1] + X[rec, :-1]
            n_shared = np.count_nonzero(summed.data == 2)
            same_last_col = int(X[i, -1] == X[rec, -1])
            scores[slot] = n_shared + same_last_col
    return scores

def recommend_scoring(y_true, y_pred):
    """Average, per user, the best score each recommendation achieves.

    For every row, each recommendation in `y_pred` is scored (via
    `recipe_score`) against every liked recipe in `y_true`, and the
    element-wise maximum over the liked recipes is kept. Liked entries equal
    to -1 are ignored. The return value is the row-wise mean of those maxima.
    """
    best = np.zeros_like(y_pred)
    for row, pair in enumerate(zip(y_true, y_pred)):
        liked_recipes, recommended = pair
        # skip the -1 placeholders used when a user has fewer liked recipes
        for liked in (l for l in liked_recipes if l != -1):
            candidate = recipe_score(liked, recommended)
            best[row] = np.maximum(best[row], candidate)

    return best.mean(axis=1)

class Recommender(BaseEstimator):
    """Common base for the recommenders below; subclasses provide `predict`."""

    def score(self, X, y):
        """Return the mean of `recommend_scoring` over all rows of `X`."""
        recommendations = self.predict(X)
        per_user = recommend_scoring(y, recommendations)
        return per_user.mean()

## Make all Recommender Classes Here - Easy to add more

In [6]:
class kNNRecommender(Recommender):
    """Recommend the recipes most common among a user's nearest neighbours.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbouring users whose recipes are pooled.
    metric, algorithm :
        Passed straight to `sklearn.neighbors.NearestNeighbors`.

    Neighbour indices are looked up in the module-level matrix `U`, so the
    rows passed to `fit` must be in the same order as the rows of `U`.
    """
    def __init__(self, n_neighbors=5, metric='minkowski', algorithm='brute'):
        # Only store hyper-parameters here. Grid search changes them through
        # set_params(), which does not re-run __init__, so a NearestNeighbors
        # built here would silently keep the old values.
        self.n_neighbors = n_neighbors
        self.metric      = metric
        self.algorithm   = algorithm

    def fit(self, X, y=None):
        """Index the training users; returns self."""
        # add one neighbour because predict() drops each query's closest hit (itself)
        self.estimator = NearestNeighbors(n_neighbors=self.n_neighbors+1,
                                          metric=self.metric,
                                          algorithm=self.algorithm)
        self.estimator.fit(X)
        return self

    def predict(self, X):
        """Return an (n_users, 5) int array of recipe ids, padded with -1."""
        #find closest users, skipping the first column (the closest hit)
        idxs = self.estimator.kneighbors(X)[1][:,1:]

        # start from -1 everywhere so short recommendation lists are already padded
        rec_recipes = np.full((X.shape[0], 5), -1, dtype='int')
        #iterate through each user's closest users
        for i, idx in enumerate(idxs):
            #find their liked recipes
            close_recipes = U[idx].nonzero()[1]
            #and keep the (at most) 5 most common ones
            top = [recipe for recipe, _ in Counter(close_recipes).most_common(5)]
            rec_recipes[i, :len(top)] = top

        return rec_recipes

In [7]:
class ClusterRecommender(Recommender):
    """Recommend the recipes most common inside a user's cluster.

    Parameters
    ----------
    clusterer : scikit-learn style estimator
        Must offer `fit_predict` and `predict`. It is treated as a template:
        `fit` works on a clone (stored as `clusterer_`), so the object passed
        in — including the shared default instance — is never modified.
    n_clusters : int
        Requested number of clusters. It is applied to the clone's
        `n_clusters` parameter or, if it has none, to `n_components`.

    Cluster members are looked up in the module-level matrix `U`, so the rows
    passed to `fit` must be in the same order as the rows of `U`.
    """
    def __init__(self, clusterer=KMeans(), n_clusters=10):
        # Only store hyper-parameters: set_params() during grid search does
        # not re-run __init__, so configuration has to happen in fit().
        self.n_clusters  = n_clusters
        self.clusterer   = clusterer

    def fit(self, X, y=None):
        """Cluster the training users; returns self."""
        fitted = clone(self.clusterer)
        available = fitted.get_params()
        # n_clusters is checked first: some estimators expose both names and
        # use n_components for something else.
        if 'n_clusters' in available:
            fitted.set_params(n_clusters=self.n_clusters)
        elif 'n_components' in available:
            fitted.set_params(n_components=self.n_clusters)
        self.clusterer_ = fitted
        self.labels = self.clusterer_.fit_predict(X)
        return self

    def predict(self, X):
        """Return an (n_users, 5) int array of recipe ids, padded with -1."""
        #find each user's cluster
        labels = self.clusterer_.predict(X)

        # start from -1 everywhere so short recommendation lists are already padded
        rec_recipes = np.full((X.shape[0], 5), -1, dtype='int')
        # every user in a cluster gets the same answer, so compute it once per label
        top_by_label = {}
        for i, label in enumerate(labels):
            if label not in top_by_label:
                #find all training users in the cluster
                members = np.flatnonzero(self.labels == label)
                #find their liked recipes
                close_recipes = U[members].nonzero()[1]
                #and keep the (at most) 5 most common ones
                top_by_label[label] = [recipe for recipe, _ in Counter(close_recipes).most_common(5)]
            top = top_by_label[label]
            rec_recipes[i, :len(top)] = top

        return rec_recipes

In [8]:
class ClusterNNRecommender(Recommender):
    """Recommend from the nearest neighbours found inside a user's cluster.

    Parameters
    ----------
    clusterer : scikit-learn style estimator
        Must offer `fit_predict` and `predict`. `fit` works on a clone (stored
        as `clusterer_`), so the object passed in is never modified.
    n_clusters : int
        Applied to the clone's `n_clusters` parameter or, if it has none, to
        `n_components`.
    n_neighbors : int
        Number of neighbours pooled inside the cluster.

    Rows are looked up in the module-level matrix `U`, so the rows passed to
    `fit` must be in the same order as the rows of `U`.
    """
    def __init__(self, clusterer=KMeans(), n_clusters=10, n_neighbors=5):
        # Only store hyper-parameters: set_params() during grid search does
        # not re-run __init__, so estimators are built in fit()/predict().
        self.n_clusters  = n_clusters
        self.n_neighbors = n_neighbors
        self.clusterer   = clusterer

    def fit(self, X, y=None):
        """Cluster the training users and keep their features; returns self."""
        self.X = X
        fitted = clone(self.clusterer)
        available = fitted.get_params()
        # n_clusters is checked first: some estimators expose both names and
        # use n_components for something else.
        if 'n_clusters' in available:
            fitted.set_params(n_clusters=self.n_clusters)
        elif 'n_components' in available:
            fitted.set_params(n_components=self.n_clusters)
        self.clusterer_ = fitted
        self.labels = self.clusterer_.fit_predict(X)
        return self

    def predict(self, X):
        """Return an (n_users, 5) int array of recipe ids, padded with -1."""
        #find each user's cluster
        labels = self.clusterer_.predict(X)

        # start from -1 everywhere so short recommendation lists are already padded
        rec_recipes = np.full((X.shape[0], 5), -1, dtype='int')
        # label -> (member rows, fitted neighbour index or None); built once
        # per cluster instead of refitting for every user
        per_cluster = {}
        for i, (label, x) in enumerate(zip(labels, X)):
            if label not in per_cluster:
                #find all training users in the cluster
                cluster = np.flatnonzero(self.labels == label)
                #a neighbour search only makes sense if the cluster is big enough
                if len(cluster) > self.n_neighbors+1:
                    knn = NearestNeighbors(n_neighbors=self.n_neighbors+1).fit(self.X[cluster])
                else:
                    knn = None
                per_cluster[label] = (cluster, knn)
            cluster, knn = per_cluster[label]

            if knn is not None:
                # kneighbors returns positions *within the cluster subset*;
                # map them back to rows of the training data before indexing U.
                local = knn.kneighbors(x.reshape(1,-1))[1][0,1:]
                idx = cluster[local]
            else:
                idx = cluster

            #find their liked recipes
            close_recipes = U[idx].nonzero()[1]
            #and keep the (at most) 5 most common ones
            top = [recipe for recipe, _ in Counter(close_recipes).most_common(5)]
            rec_recipes[i, :len(top)] = top

        return rec_recipes

In [9]:
class NNRadiusRecommender(Recommender):
    """Recommend the recipes most common among users within a fixed radius.

    Parameters
    ----------
    radius : float
        Neighbourhood radius passed to `sklearn.neighbors.NearestNeighbors`.
    metric, algorithm :
        Passed straight to `NearestNeighbors`.

    Neighbour indices are looked up in the module-level matrix `U`, so the
    rows passed to `fit` must be in the same order as the rows of `U`.
    """
    def __init__(self, radius=1, metric='minkowski', algorithm='brute'):
        # Only store hyper-parameters here. Grid search changes them through
        # set_params(), which does not re-run __init__, so a NearestNeighbors
        # built here would silently keep the old values.
        self.radius    = radius
        self.metric    = metric
        self.algorithm = algorithm

    def fit(self, X, y=None):
        """Index the training users; returns self."""
        self.estimator = NearestNeighbors(radius=self.radius,
                                          metric=self.metric,
                                          algorithm=self.algorithm)
        self.estimator.fit(X)
        return self

    def predict(self, X):
        """Return an (n_users, 5) int array of recipe ids, padded with -1."""
        #find every user inside the radius
        distances, idxs = self.estimator.radius_neighbors(X)

        # start from -1 everywhere so short recommendation lists are already padded
        rec_recipes = np.full((X.shape[0], 5), -1, dtype='int')
        #iterate through each user's closest users
        for i, (distance, idx) in enumerate(zip(distances, idxs)):
            #find their liked recipes; zero-distance hits are dropped to exclude
            #the user itself (this also drops any other user at distance 0)
            close_recipes = U[idx[distance!=0]].nonzero()[1]
            #and keep the (at most) 5 most common ones
            top = [recipe for recipe, _ in Counter(close_recipes).most_common(5)]
            rec_recipes[i, :len(top)] = top

        return rec_recipes

In [None]:
# Smoke test: reduce to 10 components, then recommend from 5 clusters.
# The pipeline is fitted on all of U and scored on the first ten test users.
# NOTE(review): ClusterRecommender.predict calls clusterer.predict, which
# scikit-learn's SpectralClustering does not appear to provide — confirm this
# cell runs, or swap in a clusterer that has predict (e.g. KMeans).
pipe = Pipeline(steps=[
    ("dr", TruncatedSVD(n_components=10)),
    ("rdr", ClusterRecommender(n_clusters=5, clusterer=SpectralClustering())),
])

pipe.fit(U)
pipe.score(U[user_test][:10], y[:10])

In [None]:
# Grid search over (dimension reduction) x (recommender) combinations.
pipe = Pipeline([("dr", TruncatedSVD()),
                ("rdr", ClusterRecommender())])

# --- dimension-reduction candidates ---
n_components = [20, 40, 60, 80]
tsvd = set_grid(TruncatedSVD(), n_components=n_components)
kpca = set_grid(KernelPCA(), n_components=n_components)  # defined but not yet part of the search below
nmf = set_grid(NMF(), n_components=n_components)
spca = set_grid(SparsePCA(), n_components=n_components)

# --- recommender candidates ---
n_clusters = [10, 50, 100, 150]
n_neighbors = [2, 10, 50, 100]
metrics = ['minkowski', 'cosine']
knn = set_grid(kNNRecommender(), n_neighbors=n_neighbors, metric=metrics)
# NOTE(review): ClusterRecommender.predict needs clusterer.predict; SpectralClustering
# does not appear to provide it, so those candidates will likely fail at scoring time.
cluster = set_grid(ClusterRecommender(), clusterer=[KMeans(), GaussianMixture(), SpectralClustering()], n_clusters=n_clusters)

# TODO: the grids below were left without values, which made the whole cell a
# SyntaxError. They are parked here until values are chosen. When filling them in:
#   * the kNN-style keyword is `metric`, not `metrics`;
#   * eps / min_samples belong to DBSCAN, so they must be addressed as
#     clusterer__eps / clusterer__min_samples;
#   * give the ClusterNN grids their own names so they do not overwrite
#     `cluster` / `dbscan`.
# nnra       = set_grid(NNRadiusRecommender(), radius=..., metric=metrics)
# dbscan     = set_grid(ClusterRecommender(), clusterer=[DBSCAN()], clusterer__eps=..., clusterer__min_samples=...)
# cluster_nn = set_grid(ClusterNNRecommender(), clusterer=[KMeans(), GaussianMixture(), SpectralClustering()], n_clusters=n_clusters, n_neighbors=...)
# dbscan_nn  = set_grid(ClusterNNRecommender(), clusterer=[DBSCAN()], clusterer__eps=..., clusterer__min_samples=..., n_neighbors=...)

pipenew = set_grid(pipe, dr=[tsvd, nmf, spca], rdr=[knn, cluster])
opt  = make_grid_search(pipenew, cv=cv, verbose=3, n_jobs=3)
opt.fit(U_tog, y_tog)
# write through a context manager so the results file is flushed and closed
with open("results.pkl", 'wb') as fh:
    pickle.dump(opt.cv_results_, fh)
print(opt.cv_results_)

In [5]:
# Create one empty results table (reduction method x recommender) and write it
# to each results file.
# NOTE: to_pickle writes unconditionally, so re-running this cell replaces
# whatever is currently stored at these paths.
columns = ["kNN", "NNBall", "KMeans", "GMM", "MinCut", "KMeansNN", "GMMNN", "MinCutNN"]
rows = ["PCA", "KPCA", "NMF", "LDA"]
d = pd.DataFrame(data=tuple(), index=rows, columns=columns)
for results_path in (
    "results/user_U_sum.pkl",
    "results/user_U_fr.pkl",
    "results/user_U_rf.pkl",
    "results/user_Uhat_sum.pkl",
    "results/user_Uhat_fr.pkl",
    "results/user_Uhat_rf.pkl",
):
    d.to_pickle(results_path)

In [31]:
# Show the stored entry under "kNN" / "PCA" in the saved results.
# NOTE(review): "results/user_U.pkl" is not one of the files written by the
# initialisation cell (user_U_sum / _fr / _rf, user_Uhat_*) — confirm the path.
pd.read_pickle("results/user_U.pkl")["kNN"]["PCA"]

(8.572754600777182,
 1.857438564300537,
 47.26626968383789,
 {'dr__n_components': 20, 'rdr__n_neighbors': 2})

In [None]:
# NOTE(review): a bare `_` echoes the previous cell's output in IPython; it
# depends on kernel history and will not reproduce on Restart & Run All.
_