In [5]:
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import DataLoader
from collections import defaultdict
from UserMeanEstimator import UserMeanEstimator
from RecipeMeanEstimator import RecipeMeanEstimator

In [6]:
# ratings = pd.read_csv("data/all_recipe_clean.csv")
with open("data/test_data_100.pkl", "rb") as infile:
    recipes = pickle.load(infile) 
ratings = pd.read_csv("data/test_ratings_100.csv")
dataLoader = DataLoader.DataLoader(ratings, recipes)
# user_holdout, recipe_holdout, holdout = dataLoader.get_holdout_data()
# holdout_X = [t[:2] for t in holdout]
# holdout_Y = np.array([t[2] for t in holdout])

In [25]:
class RecipeRecommender:
    """
    
    distance: Function that takes in list of recipe_ids selected so far
        and target recipe_id and computes the distance between them. Higher
        distance scores are considered better for diversity.
    """
    def __init__(self, 
                 dataLoader, 
                 estimator, 
                 diversity_weight=1.0, 
                 distance = "cosine"):
        self.estimator = estimator
        self.dataLoader = dataLoader
        self.recipe_ids = frozenset(dataLoader.get_recipe_ids())
        self.diversity_weight = diversity_weight
        self.diversity = self.get_diversity_calculation(distance)
        self.sim_cache = {}
        
    def get_diversity_calculation(self, distance):
        """
        Returns the function used to calculate how item new_item would affect the diversity
        of recipe set current_items. Higher diversity score is better
        """
        
        def average_cosine_similarity(current_items, new_item_id):
            sim = 0.0
            for recipe_id in current_items:
                sim += self.compute_cosine_similiarity(recipe_id, new_item_id)
            return -sim / len(current_items)
        
        def shared_ingredients(current_items, new_item_id):
            used_ingredients = { i["key ingredient"]
                                for r_id in current_items
                                for i in self.dataLoader.get_recipe_info(r_id)["ingredients"]}
            new_recipe_ingredients = { i["key ingredient"] 
                                      for i in self.dataLoader.get_recipe_info(new_item_id)["ingredients"]}
    
            return 1.0 - len(new_recipe_ingredients - used_ingredients) / len(new_recipe_ingredients)
    
        if distance == "cosine":
            return average_cosine_similarity
        elif distance == "ingredients":
            return shared_ingredients
        else:
            raise ValueError("Unexpected distance: {}".format(distance))
            
    
    def compute_cosine_similiarity(self, r1, r2):
        key = tuple(sorted([r1, r2]))
        if key not in self.sim_cache:
            ratings1 = self.dataLoader.get_recipe_ratings(r1, defaultdict(set))
            ratings2 = self.dataLoader.get_recipe_ratings(r2, defaultdict(set))
            prod = 0
            for k in ratings1:
                if k in ratings2:
                    prod += ratings1[k] * ratings2[k]
            self.sim_cache[key] = prod / (len(ratings1) * len(ratings2))
        return self.sim_cache[key]

    def get_recommendations(self, user_id, n_recs=100):
        preds = self.predict_unrated_recipes(user_id)
        return self.select_diverse_recipes(preds, n_recs)
        
    def predict_unrated_recipes(self, user_id):
        """
        return a sorted list with tuples of recipe id and the user's predicted rating
        for all unrated recipes
        """
        user_rated_recipes = frozenset(self.dataLoader.get_user_ratings(user_id, {}))
        unrated_recipes = self.recipe_ids - user_rated_recipes
        X = [(user_id, recipe_id) for recipe_id in unrated_recipes]
        predictions = estimator.predict(X)
        res = []
        for i, info in enumerate(X):
            _, recipe_id = info
            res.append((recipe_id, predictions[i]))
        return sorted(res,key=lambda x: -x[1])
    
    def select_diverse_recipes(self, ratings, n_recs):
        """
        ratings: list of (recipe_id, rating) tuples, sorted in descending
            order by rating
        """
        recs = []
        original_ratings = {id: rating for id, rating in ratings}
        for _ in range(n_recs):
            target_recipe_id, _ = ratings.pop(0)
            recs.append(target_recipe_id)
            new_ratings = []
            for i, item in enumerate(ratings):
                recipe_id, _ = item
                predicted_rating = original_ratings[recipe_id]
                revised_score = predicted_rating + self.diversity_weight * self.diversity(recs, recipe_id)
                if revised_score == predicted_rating:
                    new_ratings = new_ratings + ratings[i:]
                    break
                new_ratings.append((recipe_id, revised_score))
            ratings = sorted(new_ratings, key=lambda x: -x[1])
        return recs
            

In [35]:
# Step 1: For a given user, run the estimator an all unrated recipes

estimator = RecipeMeanEstimator(dataLoader, defaultdict(set))
recommender = RecipeRecommender(dataLoader, estimator, distance="ingredients")
preds_ing = recommender.predict_unrated_recipes(953814)

In [31]:
# Step 2: Using the estimates, construct the set that balances diversity and predicted utility
# preds_old = recommender.select_diverse_recipes(preds, 100)
# preds_old
# diversity_metric = recommender.get_diversity_calculation("ingredients")
preds_cos

[(88071, 5.0),
 (150812, 5.0),
 (16927, 5.0),
 (150818, 5.0),
 (229920, 5.0),
 (56358, 5.0),
 (70188, 5.0),
 (235059, 5.0),
 (245050, 5.0),
 (45652, 5.0),
 (255337, 5.0),
 (20339, 5.0),
 (163972, 5.0),
 (262027, 5.0),
 (25504, 5.0),
 (228261, 5.0),
 (146088, 5.0),
 (128967, 5.0),
 (40399, 5.0),
 (86227, 5.0),
 (223190, 5.0),
 (230407, 4.0),
 (16651, 4.0),
 (159246, 4.0),
 (17426, 4.0),
 (34579, 4.0),
 (30749, 4.0),
 (218654, 4.0),
 (20511, 4.0),
 (229665, 4.0),
 (76066, 4.0),
 (172066, 4.0),
 (24616, 4.0),
 (199725, 4.0),
 (155971, 4.0),
 (87365, 4.0),
 (232018, 4.0),
 (174170, 4.0),
 (161118, 4.0),
 (54887, 4.0),
 (74345, 4.0),
 (9076, 4.0),
 (215411, 4.0),
 (254325, 4.0),
 (23167, 4.0),
 (240778, 4.0),
 (25739, 4.0),
 (212366, 4.0),
 (229267, 4.0),
 (24213, 4.0),
 (16024, 4.0),
 (11934, 4.0),
 (14239, 4.0),
 (9377, 4.0),
 (83368, 4.0),
 (21417, 4.0),
 (214957, 4.0),
 (30126, 4.0),
 (212920, 4.0),
 (11971, 4.0),
 (8906, 4.0),
 (172234, 4.0),
 (130265, 4.0),
 (228078, 4.0),
 (237296, 4

In [29]:
# recommender.get_recommendations(36964)
diversity_metric([238270, 240778], 237458)

0.2727272727272727

In [36]:
np.all(np.isclose(preds_cos, preds_ing))

True