In [104]:
import heapq
import pickle
from collections import Counter
from itertools import product
from time import time

import matplotlib.pyplot as plt
import numpy as np
import seaborn
from scipy import sparse
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline

In [6]:
# Load every cleaned artefact the notebook works with.
# Paths are relative to the notebook's working directory.
num_ing = 8023  # NOTE(review): presumably the number of distinct ingredients - confirm against the cleaning step

# Decoders are pickled objects. pickle.load can execute arbitrary code, so only
# load files produced by this project's own cleaning step.
# Each file is opened in a context manager so the handle is closed right away
# instead of being left for the garbage collector.
with open('data-cleaned/recipe_decoder.pkl', 'rb') as fh:
    recipe_decoder = pickle.load(fh)
with open('data-cleaned/ingredient_decoder.pkl', 'rb') as fh:
    ingr_decoder = pickle.load(fh)
with open('data-cleaned/tag_decoder.pkl', 'rb') as fh:
    tag_decoder = pickle.load(fh)

# Sparse matrices; per the file names the *hat variants are tf-idf weighted
# versions of the same data (TODO confirm against the cleaning notebook).
X     = sparse.load_npz("data-cleaned/recipes.npz")
Xhat  = sparse.load_npz("data-cleaned/recipes_tfidf.npz")
U     = sparse.load_npz("data-cleaned/user_train.npz")
Uhat  = sparse.load_npz("data-cleaned/user_train_tfidf.npz")
Utest = sparse.load_npz("data-cleaned/user_test.npz")
def intersection(lst1, lst2):
    """Return the distinct elements present in both iterables, as a list.

    Duplicates are collapsed and the order of the result is unspecified
    (it is whatever order the intermediate set iterates in).
    """
    first = set(lst1)
    second = set(lst2)
    shared = first & second
    return list(shared)

In [None]:
class userCluster(Recommender):
    """Recommend recipes by clustering users and pooling each cluster's likes.

    The training users are grouped by the chosen clustering estimator. A user
    passed to ``predict`` is assigned to a cluster and receives the recipes
    that occur most often among the training users of that cluster.

    NOTE(review): the ``Recommender`` base class is not defined in the visible
    part of the notebook - confirm it is defined/imported before this cell.
    """

    # number of recipes recommended per user (row width of predict()'s output)
    N_REC = 5

    def __init__(self, clusterer='kmeans', **kwargs):
        """Build the underlying clustering estimator.

        Parameters
        ----------
        clusterer : {'kmeans', 'gmm', 'mincut'}
            Which estimator to use: KMeans, GaussianMixture or
            SpectralClustering respectively.
        **kwargs
            Forwarded unchanged to the estimator's constructor.

        Raises
        ------
        ValueError
            If ``clusterer`` is not one of the supported names. Without this
            check ``self.clusterer`` was silently left unset and the failure
            only surfaced later, inside ``fit``.
        """
        if clusterer == 'kmeans':
            self.clusterer = KMeans(**kwargs)
        elif clusterer == 'gmm':
            self.clusterer = GaussianMixture(**kwargs)
        elif clusterer == 'mincut':
            # SpectralClustering provides fit_predict() but no predict(), so
            # this option can be fitted but cannot label unseen rows.
            self.clusterer = SpectralClustering(**kwargs)
        else:
            raise ValueError(
                "unknown clusterer %r; expected 'kmeans', 'gmm' or 'mincut'" % (clusterer,)
            )

    def fit(self, X, y=None):
        """Cluster the rows of ``X`` and remember each row's cluster label.

        ``y`` is accepted but not used. Returns ``self``.
        """
        self.labels = self.clusterer.fit_predict(X)
        return self

    def predict(self, X):
        """Recommend ``N_REC`` recipes for every row (user) of ``X``.

        Reads the module-level rating matrix ``U``; row ``i`` of ``U`` is
        assumed to be the same user as row ``i`` of the matrix given to
        ``fit`` - TODO confirm.

        Returns
        -------
        numpy.ndarray of int, shape (X.shape[0], N_REC)
            Column indices of ``U`` (recipes), most common first. Rows with
            fewer than ``N_REC`` candidates are padded with ``-1``.

        Raises
        ------
        AttributeError
            If the underlying estimator cannot label new rows (no
            ``predict`` method, e.g. the 'mincut' option).
        """
        if not hasattr(self.clusterer, 'predict'):
            raise AttributeError(
                "%s has no predict(); it cannot assign new users to clusters"
                % type(self.clusterer).__name__
            )

        #find each user's cluster
        labels = self.clusterer.predict(X)

        n_rec = self.N_REC
        #start from the padding value; real recommendations overwrite it
        rec_recipes = np.full((X.shape[0], n_rec), -1, dtype='int')

        #every user of a cluster gets the same answer, so compute it once per label
        top_by_label = {}
        for i, label in enumerate(labels):
            if label not in top_by_label:
                #find all training users in the cluster
                members = np.flatnonzero(self.labels == label)
                #find their liked recipes
                close_recipes = U[members].nonzero()[1]
                #and keep the most common ones to recommend
                top_by_label[label] = [
                    recipe for recipe, _ in Counter(close_recipes).most_common(n_rec)
                ]
            top = top_by_label[label]
            #sometimes there are fewer than n_rec recommendations - make do with the -1 padding
            if top:
                rec_recipes[i, :len(top)] = top

        return rec_recipes

In [118]:
# number of recipes each recommend_* function below returns (shorter results are padded with -1)
REC = 5

def recommend_freq_rating(user_idx):
    """Recommend the most frequently rated recipes, ties broken by mean rating.

    Parameters
    ----------
    user_idx : int or array-like of int
        Row index (or indices) into the module-level rating matrix ``U``.
        NOTE(review): presumably the rows of a user's nearest neighbours -
        confirm against the caller.

    Returns
    -------
    numpy.ndarray
        ``REC`` recipe (column) indices ordered by (number of ratings, mean
        rating, recipe index), all descending. Padded with ``-1`` when fewer
        than ``REC`` distinct recipes were rated.
    """
    #slice once and reuse it for both the coordinates and the values
    rated = U[user_idx]
    rows, recipes = rated.nonzero()
    ratings = np.array(rated[rows, recipes]).flatten()

    #recipe -> [times rated, summed rating]; one dict so the two can never fall out of step
    stats = dict()
    for recipe, rating in zip(recipes, ratings):
        entry = stats.setdefault(recipe, [0, 0])
        entry[0] += 1
        entry[1] += rating

    #tuple order determines the ranking; nlargest already returns its result sorted descending
    ranked = heapq.nlargest(
        REC,
        ((count, total / count, recipe) for recipe, (count, total) in stats.items()),
    )

    #pad with -1 when there are not enough candidates
    top = [recipe for _, _, recipe in ranked]
    return np.array(top + [-1] * (REC - len(top)))

def recommend_rating_freq(user_idx):
    """Recommend the best rated recipes, ties broken by how often they were rated.

    Parameters
    ----------
    user_idx : int or array-like of int
        Row index (or indices) into the module-level rating matrix ``U``.
        NOTE(review): presumably the rows of a user's nearest neighbours -
        confirm against the caller.

    Returns
    -------
    numpy.ndarray
        ``REC`` recipe (column) indices ordered by (mean rating, number of
        ratings, recipe index), all descending. Padded with ``-1`` when fewer
        than ``REC`` distinct recipes were rated.
    """
    #slice once and reuse it for both the coordinates and the values
    rated = U[user_idx]
    rows, recipes = rated.nonzero()
    ratings = np.array(rated[rows, recipes]).flatten()

    #recipe -> [times rated, summed rating]; one dict so the two can never fall out of step
    stats = dict()
    for recipe, rating in zip(recipes, ratings):
        entry = stats.setdefault(recipe, [0, 0])
        entry[0] += 1
        entry[1] += rating

    #tuple order determines the ranking; nlargest already returns its result sorted descending
    ranked = heapq.nlargest(
        REC,
        ((total / count, count, recipe) for recipe, (count, total) in stats.items()),
    )

    #pad with -1 when there are not enough candidates
    top = [recipe for _, _, recipe in ranked]
    return np.array(top + [-1] * (REC - len(top)))

def recommend_sum(user_idx):
    """Recommend the recipes with the largest summed rating.

    Parameters
    ----------
    user_idx : int or array-like of int
        Row index (or indices) into the module-level rating matrix ``U``.
        NOTE(review): presumably the rows of a user's nearest neighbours -
        confirm against the caller.

    Returns
    -------
    numpy.ndarray
        ``REC`` recipe (column) indices ordered by (summed rating, recipe
        index), both descending. Padded with ``-1`` when fewer than ``REC``
        distinct recipes were rated.
    """
    #slice once and reuse it for both the coordinates and the values
    rated = U[user_idx]
    rows, recipes = rated.nonzero()
    ratings = np.array(rated[rows, recipes]).flatten()

    #sum the ratings each recipe received
    rating_total = dict()
    for recipe, rating in zip(recipes, ratings):
        rating_total[recipe] = rating_total.get(recipe, 0) + rating

    #tuple order determines the ranking; nlargest already returns its result sorted descending
    ranked = heapq.nlargest(
        REC,
        ((total, recipe) for recipe, total in rating_total.items()),
    )

    #pad with -1 when there are not enough candidates
    top = [recipe for _, recipe in ranked]
    return np.array(top + [-1] * (REC - len(top)))