In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import operator
from DataLoader import DataLoader
from UserMeanEstimator import UserMeanEstimator
from RecipeMeanEstimator import RecipeMeanEstimator
from LogisticEstimator import LogisticEstimator
from PretrainedEstimator import PretrainedEstimator

In [2]:
class DataLoaderPreloaded(DataLoader):
    def __init__(self, recipes, all_ratings, recipe_to_raters):
        self.recipes = recipes
        self.all_ratings = all_ratings
        self.recipe_to_raters_test = recipe_to_raters

In [3]:
# ratings = pd.read_csv("data/all_recipe_clean.csv")
with open("data/all_recipes.pkl", "rb") as infile:
    recipes = pickle.load(infile) 
ratings = pd.read_csv("data/all_ratings.csv")
dataLoader = DataLoader(ratings, recipes)

In [45]:
# Compresses recipe data to only include the necessary information
recipes_compressed = {}
for recipe_id, recipe_data in recipes.items():
    ingredients_compressed = {'key ingredients': recipe_data['ingredients']['key ingredients']}
    ingredients_compressed['full list'] = [
        {'quantity': i['quantity'],
         'key ingredient': i['key ingredient']}
        for i in recipe_data['ingredients']['full list']
    ]
    
    recipes_compressed[recipe_id] = {
        'calories': recipe_data['calories'],
        'categories': recipe_data['categories'],
        'ingredients': ingredients_compressed,
        'servings': recipe_data['servings'],
        'title': recipe_data['title'],
        'user ratings': recipe_data['user ratings']
    }

with open("data/recipes_compressed.pkl", "wb") as outfile:
    pickle.dump(recipes_compressed, outfile)


In [4]:
class RecipeRecommender:
    """
    
    distance: Function that takes in list of recipe_ids selected so far
        and target recipe_id and computes the distance between them. Higher
        distance scores are considered better for diversity.
    """
    def __init__(self, 
                 dataLoader, 
                 estimator, 
                 diversity_weight=1.0, 
                 distance = "cosine"):
        self.estimator = estimator
        self.dataLoader = dataLoader
        self.recipe_ids = frozenset(dataLoader.get_recipe_ids())
        self.diversity_weight = diversity_weight
        self.diversity = self.get_diversity_calculation(distance)
        self.sim_cache = {}
        
    def get_diversity_calculation(self, distance):
        """
        Returns the function used to calculate how item new_item would affect the diversity
        of recipe set current_items. Higher diversity score is better
        """
        
        def average_cosine_similarity(current_items, new_item_id):
            sim = 0.0
            for recipe_id in current_items:
                sim += self.compute_cosine_similiarity(recipe_id, new_item_id)
            return -sim / len(current_items)
        
        def shared_ingredients(current_items, new_item_id):
            used_ingredients = { i
                                for r_id in current_items
                                for i in self.dataLoader.get_recipe_info(r_id)["ingredients"]}
            new_recipe_ingredients = { i
                                      for i in self.dataLoader.get_recipe_info(new_item_id)["ingredients"]}
    
            return 1.0 - len(new_recipe_ingredients - used_ingredients) / len(new_recipe_ingredients)
    
        if distance == "cosine":
            return average_cosine_similarity
        elif distance == "ingredients":
            return shared_ingredients
        else:
            raise ValueError("Unexpected distance: {}".format(distance))
            
    
    def compute_cosine_similiarity(self, r1, r2):
        key = tuple(sorted([r1, r2]))
        if key not in self.sim_cache:
            ratings1 = self.dataLoader.get_recipe_ratings(r1, None)
            ratings2 = self.dataLoader.get_recipe_ratings(r2, None)
            prod = 0
            for k in ratings1:
                if k in ratings2:
                    prod += ratings1[k] * ratings2[k]
            self.sim_cache[key] = prod / (len(ratings1) * len(ratings2))
        return self.sim_cache[key]


    def compute_user_similarity(self, user_ratings1, user_ratings2): 
        def zero_center_ratings(ratings):
            rating_mean = sum(ratings.values()) / len(ratings)
            return { rid : value - rating_mean for rid, value in ratings.items() }

        def l2_norm(ratings):
            return np.linalg.norm(np.fromiter(ratings.values(), dtype=np.float64))

        if len(user_ratings1) < 2 or len(user_ratings2) < 2:
            return 0.0

        prod = 0.0
        zero_centered1 = zero_center_ratings(user_ratings1)
        zero_centered2 = zero_center_ratings(user_ratings2)
        
        norm1 = l2_norm(zero_centered1)
        norm2 = l2_norm(zero_centered2)
        
        if norm1 == 0.0 or norm2 == 0.0:
            return 0.0
        
        for k in zero_centered1:
            if k in zero_centered2:
                prod += zero_centered1[k] * zero_centered2[k]
        return prod / (norm1 * norm1)
    
    def find_similar_user(self, ratings_profile):
        potential_users = set()
        all_ids = ratings_profile["preferred_recipes"] + ratings_profile["non_preferred_recipes"]
        for recipe_id in all_ids:
            ratings = self.dataLoader.get_recipe_ratings(recipe_id, None, split="test")
            potential_users = potential_users.union(frozenset(ratings.keys()))

        ratings_model = {rid: 5.0 for rid in ratings_profile["preferred_recipes"]}
        for rid in ratings_profile["non_preferred_recipes"]:
            ratings_model[rid] = 3.0

        user_similarity = {}
        for user_id in potential_users:
            user_ratings = self.dataLoader.get_user_ratings(user_id, None)
            sim = self.compute_user_similarity(ratings_model, user_ratings)
            if sim > 0:
                user_similarity[user_id] = sim

        return max(user_similarity.items(), key=operator.itemgetter(1))[0]
    
    def get_recommendations(self, ratings_profile, n_recs=100):
        user_id = self.find_similar_user(ratings_profile)
        preds = self.predict_unrated_recipes(user_id)
        return self.select_diverse_recipes(preds, n_recs)
        
    def predict_unrated_recipes(self, user_id):
        """
        return a sorted list with tuples of recipe id and the user's predicted rating
        for all unrated recipes
        """
        user_rated_recipes = frozenset(self.dataLoader.get_user_ratings(user_id, None))
        unrated_recipes = self.recipe_ids - user_rated_recipes
        X = [(user_id, recipe_id) for recipe_id in unrated_recipes]
        predictions = self.estimator.predict(X)
        res = []
        for i, info in enumerate(X):
            _, recipe_id = info
            res.append((recipe_id, predictions[i]))
        return sorted(res,key=lambda x: -x[1])
    
    def select_diverse_recipes(self, ratings, n_recs, search_limit=4000):
        """
        ratings: list of (recipe_id, rating) tuples, sorted in descending
            order by rating
        search_limit: only searches through the top search_limit rated
            recipes to make predictions (improves speed)
        """
        if search_limit:
            ratings = ratings[:search_limit]
        recs = []
        original_ratings = {id: rating for id, rating in ratings}
        for _ in range(n_recs):
            target_recipe_id, _ = ratings.pop(0)
            recs.append(target_recipe_id)
            new_ratings = []
            for i, item in enumerate(ratings):
                recipe_id, _ = item
                predicted_rating = original_ratings[recipe_id]
                revised_score = predicted_rating + self.diversity_weight * self.diversity(recs, recipe_id)
                if revised_score == predicted_rating:
                    new_ratings = new_ratings + ratings[i:]
                    break
                new_ratings.append((recipe_id, revised_score))
            ratings = sorted(new_ratings, key=lambda x: -x[1])
        return recs
            

In [6]:
# Step 1: If making a prediction based on an unknown user profile of only ratings, find the most similar user
test_ratings_profile = {
    "preferred_recipes" : [201085, 16806, 154183, 90481, 10387],
    "non_preferred_recipes" : [11952, 19125, 19211, 51499, 12984]
}

logitEstimator = LogisticEstimator(dataLoader, None, None)
with open("models/logit_trained_vec.pkl", 'rb') as infile:
    logitEstimator.vec = pickle.load(infile)
with open("models/rfcev_trained.pkl", "rb") as infile:
    rfecv = pickle.load(infile)

estimator = PretrainedEstimator(rfecv, logitEstimator)
recommender = RecipeRecommender(dataLoader, estimator, distance="ingredients")
    
similar_user = recommender.find_similar_user(test_ratings_profile)
similar_user

646606

In [7]:
# Step 2: For a given user, run the estimator an all unrated recipes to predict their ratings
preds_ing = recommender.predict_unrated_recipes(similar_user)
preds_ing

[(262161, 5),
 (262272, 5),
 (65759, 5),
 (262392, 5),
 (262403, 5),
 (262502, 5),
 (131523, 5),
 (262717, 5),
 (263054, 5),
 (263123, 5),
 (263331, 5),
 (66933, 5),
 (132703, 5),
 (264155, 5),
 (133105, 5),
 (67990, 5),
 (133633, 5),
 (68330, 5),
 (134182, 5),
 (265281, 5),
 (68697, 5),
 (134280, 5),
 (222077, 5),
 (265424, 5),
 (222092, 5),
 (68837, 5),
 (68879, 5),
 (265521, 5),
 (265530, 5),
 (200340, 5),
 (69274, 5),
 (265979, 5),
 (69410, 5),
 (266343, 5),
 (69865, 5),
 (69903, 5),
 (70038, 5),
 (201117, 5),
 (70163, 5),
 (266826, 5),
 (70282, 5),
 (70312, 5),
 (70343, 5),
 (70404, 5),
 (267149, 5),
 (267247, 5),
 (70935, 5),
 (136846, 5),
 (202389, 5),
 (71422, 5),
 (71632, 5),
 (202860, 5),
 (71803, 5),
 (71891, 5),
 (72022, 5),
 (72112, 5),
 (203248, 5),
 (268999, 5),
 (269003, 5),
 (79518, 5),
 (7032, 5),
 (138163, 5),
 (72636, 5),
 (7219, 5),
 (72876, 5),
 (72878, 5),
 (7567, 5),
 (269763, 5),
 (73177, 5),
 (204344, 5),
 (73297, 5),
 (138973, 5),
 (270048, 5),
 (73580, 5),
 

In [9]:
# Step 3: Using the estimates, construct the set that balances diversity and predicted utility
preds = recommender.select_diverse_recipes(preds_ing, 100)
preds

[262161,
 262272,
 65759,
 262392,
 262403,
 262502,
 131523,
 262717,
 263054,
 263123,
 263331,
 66933,
 132703,
 264155,
 133105,
 67990,
 133633,
 68330,
 134182,
 265281,
 68697,
 134280,
 222077,
 265424,
 222092,
 68837,
 68879,
 265521,
 265530,
 200340,
 69274,
 265979,
 69410,
 266343,
 69865,
 69903,
 70038,
 201117,
 70163,
 266826,
 70282,
 70312,
 70343,
 70404,
 267149,
 267247,
 70935,
 136846,
 202389,
 71422,
 71632,
 202860,
 71803,
 71891,
 72022,
 72112,
 203248,
 268999,
 269003,
 79518,
 7032,
 138163,
 72636,
 7219,
 72876,
 72878,
 7567,
 269763,
 73177,
 204344,
 73297,
 138973,
 270048,
 73580,
 139291,
 139340,
 204933,
 204952,
 270532,
 8513,
 8527,
 8537,
 8553,
 8565,
 8568,
 270721,
 139665,
 8609,
 8624,
 139917,
 8890,
 8900,
 271046,
 8914,
 8993,
 9194,
 140294,
 9236,
 9253,
 9257]

In [10]:
# A full recommendation run
recommender.get_recommendations(test_ratings_profile)

[262161,
 262272,
 65759,
 262392,
 262403,
 262502,
 131523,
 262717,
 263054,
 263123,
 263331,
 66933,
 132703,
 264155,
 133105,
 67990,
 133633,
 68330,
 134182,
 265281,
 68697,
 134280,
 222077,
 265424,
 222092,
 68837,
 68879,
 265521,
 265530,
 200340,
 69274,
 265979,
 69410,
 266343,
 69865,
 69903,
 70038,
 201117,
 70163,
 266826,
 70282,
 70312,
 70343,
 70404,
 267149,
 267247,
 70935,
 136846,
 202389,
 71422,
 71632,
 202860,
 71803,
 71891,
 72022,
 72112,
 203248,
 268999,
 269003,
 79518,
 7032,
 138163,
 72636,
 7219,
 72876,
 72878,
 7567,
 269763,
 73177,
 204344,
 73297,
 138973,
 270048,
 73580,
 139291,
 139340,
 204933,
 204952,
 270532,
 8513,
 8527,
 8537,
 8553,
 8565,
 8568,
 270721,
 139665,
 8609,
 8624,
 139917,
 8890,
 8900,
 271046,
 8914,
 8993,
 9194,
 140294,
 9236,
 9253,
 9257]