In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import DataLoader
from collections import defaultdict
from UserMeanEstimator import UserMeanEstimator
from RecipeMeanEstimator import RecipeMeanEstimator
from LogisticEstimator import LogisticEstimator
from PretrainedEstimator import PretrainedEstimator

In [2]:
# ratings = pd.read_csv("data/all_recipe_clean.csv")
with open("data/all_recipes.pkl", "rb") as infile:
    recipes = pickle.load(infile) 
ratings = pd.read_csv("data/all_ratings.csv")
dataLoader = DataLoader.DataLoader(ratings, recipes)
user_holdout, recipe_holdout, holdout = dataLoader.get_holdout_data()
holdout_X = [t[:2] for t in holdout]
holdout_Y = np.array([t[2] for t in holdout])

In [28]:
class RecipeRecommender:
    """
    
    distance: Function that takes in list of recipe_ids selected so far
        and target recipe_id and computes the distance between them. Higher
        distance scores are considered better for diversity.
    """
    def __init__(self, 
                 dataLoader, 
                 estimator, 
                 diversity_weight=1.0, 
                 distance = "cosine"):
        self.estimator = estimator
        self.dataLoader = dataLoader
        self.recipe_ids = frozenset(dataLoader.get_recipe_ids())
        self.diversity_weight = diversity_weight
        self.diversity = self.get_diversity_calculation(distance)
        self.sim_cache = {}
        
    def get_diversity_calculation(self, distance):
        """
        Returns the function used to calculate how item new_item would affect the diversity
        of recipe set current_items. Higher diversity score is better
        """
        
        def average_cosine_similarity(current_items, new_item_id):
            sim = 0.0
            for recipe_id in current_items:
                sim += self.compute_cosine_similiarity(recipe_id, new_item_id)
            return -sim / len(current_items)
        
        def shared_ingredients(current_items, new_item_id):
            used_ingredients = { i
                                for r_id in current_items
                                for i in self.dataLoader.get_recipe_info(r_id)["ingredients"]}
            new_recipe_ingredients = { i
                                      for i in self.dataLoader.get_recipe_info(new_item_id)["ingredients"]}
    
            return 1.0 - len(new_recipe_ingredients - used_ingredients) / len(new_recipe_ingredients)
    
        if distance == "cosine":
            return average_cosine_similarity
        elif distance == "ingredients":
            return shared_ingredients
        else:
            raise ValueError("Unexpected distance: {}".format(distance))
            
    
    def compute_cosine_similiarity(self, r1, r2):
        key = tuple(sorted([r1, r2]))
        if key not in self.sim_cache:
            ratings1 = self.dataLoader.get_recipe_ratings(r1, defaultdict(set))
            ratings2 = self.dataLoader.get_recipe_ratings(r2, defaultdict(set))
            prod = 0
            for k in ratings1:
                if k in ratings2:
                    prod += ratings1[k] * ratings2[k]
            self.sim_cache[key] = prod / (len(ratings1) * len(ratings2))
        return self.sim_cache[key]

    def get_recommendations(self, user_id, n_recs=100):
        preds = self.predict_unrated_recipes(user_id)
        return self.select_diverse_recipes(preds, n_recs)
        
    def predict_unrated_recipes(self, user_id):
        """
        return a sorted list with tuples of recipe id and the user's predicted rating
        for all unrated recipes
        """
        user_rated_recipes = frozenset(self.dataLoader.get_user_ratings(user_id, {}))
        unrated_recipes = self.recipe_ids - user_rated_recipes
        X = [(user_id, recipe_id) for recipe_id in unrated_recipes]
        predictions = self.estimator.predict(X)
        res = []
        for i, info in enumerate(X):
            _, recipe_id = info
            res.append((recipe_id, predictions[i]))
        return sorted(res,key=lambda x: -x[1])
    
    def select_diverse_recipes(self, ratings, n_recs, search_limit=4000):
        """
        ratings: list of (recipe_id, rating) tuples, sorted in descending
            order by rating
        search_limit: only searches through the top search_limit rated
            recipes to make predictions (improves speed)
        """
        if search_limit:
            ratings = ratings[:search_limit]
        recs = []
        original_ratings = {id: rating for id, rating in ratings}
        for _ in range(n_recs):
            target_recipe_id, _ = ratings.pop(0)
            recs.append(target_recipe_id)
            new_ratings = []
            for i, item in enumerate(ratings):
                recipe_id, _ = item
                predicted_rating = original_ratings[recipe_id]
                revised_score = predicted_rating + self.diversity_weight * self.diversity(recs, recipe_id)
                if revised_score == predicted_rating:
                    new_ratings = new_ratings + ratings[i:]
                    break
                new_ratings.append((recipe_id, revised_score))
            ratings = sorted(new_ratings, key=lambda x: -x[1])
        return recs
            

In [29]:
# Step 1: For a given user, run the estimator an all unrated recipes

# estimator = RecipeMeanEstimator(dataLoader, defaultdict(set))
logitEstimator = LogisticEstimator(dataLoader, None, None)
with open("models/logit_trained_vec.pkl", 'rb') as infile:
    logitEstimator.vec = pickle.load(infile)
with open("models/rfcev_trained.pkl", "rb") as infile:
    rfecv = pickle.load(infile)

estimator = PretrainedEstimator(rfecv, logitEstimator)
recommender = RecipeRecommender(dataLoader, estimator, distance="ingredients")
preds_ing = recommender.predict_unrated_recipes(953814)

In [30]:
# Step 2: Using the estimates, construct the set that balances diversity and predicted utility
preds_new = recommender.select_diverse_recipes(preds_ing, 100)
# preds_old
# diversity_metric = recommender.get_diversity_calculation("ingredients")
# preds_cos
preds_new
# len(preds_ing)

[262161,
 131107,
 262185,
 262225,
 131156,
 262229,
 131180,
 262261,
 65655,
 196732,
 131198,
 65663,
 262272,
 65671,
 65686,
 131237,
 65710,
 262324,
 262326,
 65720,
 65734,
 65759,
 262376,
 262377,
 65769,
 262392,
 262398,
 262403,
 196877,
 196879,
 262446,
 262465,
 262468,
 262470,
 262493,
 262495,
 262496,
 65892,
 262502,
 262503,
 65896,
 262506,
 262510,
 221512,
 65924,
 262533,
 131523,
 197065,
 262622,
 262636,
 262644,
 262647,
 131591,
 66066,
 131643,
 262717,
 197189,
 66130,
 131668,
 131686,
 262797,
 262806,
 262807,
 262857,
 66294,
 262922,
 262928,
 197394,
 66326,
 66336,
 131873,
 131876,
 131891,
 262966,
 66391,
 131931,
 66396,
 197470,
 66404,
 263033,
 66428,
 263037,
 263038,
 263050,
 263054,
 66459,
 197576,
 263123,
 132065,
 132071,
 208612,
 132097,
 66565,
 132109,
 66590,
 132127,
 263217,
 78370,
 66646,
 263255]

In [29]:
# recommender.get_recommendations(36964)
diversity_metric([238270, 240778], 237458)

0.2727272727272727

In [36]:
np.all(np.isclose(preds_cos, preds_ing))

True

In [31]:
frozenset(preds).difference(preds_new)

frozenset()

In [32]:
preds_new

[262161,
 131107,
 262185,
 262225,
 131156,
 262229,
 131180,
 262261,
 65655,
 196732,
 131198,
 65663,
 262272,
 65671,
 65686,
 131237,
 65710,
 262324,
 262326,
 65720,
 65734,
 65759,
 262376,
 262377,
 65769,
 262392,
 262398,
 262403,
 196877,
 196879,
 262446,
 262465,
 262468,
 262470,
 262493,
 262495,
 262496,
 65892,
 262502,
 262503,
 65896,
 262506,
 262510,
 221512,
 65924,
 262533,
 131523,
 197065,
 262622,
 262636,
 262644,
 262647,
 131591,
 66066,
 131643,
 262717,
 197189,
 66130,
 131668,
 131686,
 262797,
 262806,
 262807,
 262857,
 66294,
 262922,
 262928,
 197394,
 66326,
 66336,
 131873,
 131876,
 131891,
 262966,
 66391,
 131931,
 66396,
 197470,
 66404,
 263033,
 66428,
 263037,
 263038,
 263050,
 263054,
 66459,
 197576,
 263123,
 132065,
 132071,
 208612,
 132097,
 66565,
 132109,
 66590,
 132127,
 263217,
 78370,
 66646,
 263255]