In [8]:
import pandas as pd
import numpy as np
import scipy as sp
import pickle
import operator
from DataLoader import DataLoader
from UserMeanEstimator import UserMeanEstimator
from RecipeMeanEstimator import RecipeMeanEstimator
from LogisticEstimator import LogisticEstimator
from PretrainedEstimator import PretrainedEstimator

In [13]:
class DataLoaderPreloaded(DataLoader):
    def __init__(self, recipes, all_ratings, recipe_to_raters):
        self.recipes = recipes
        self.all_ratings = all_ratings
        self.recipe_to_raters_test = recipe_to_raters

In [23]:
# ratings = pd.read_csv("data/all_recipe_clean.csv")
with open("data/all_recipes.pkl", "rb") as infile:
    recipes = pickle.load(infile) 
ratings = pd.read_csv("data/all_ratings.csv")
dataLoader = DataLoader(ratings, recipes)
# user_holdout, recipe_holdout, holdout = dataLoader.get_holdout_data()
# holdout_X = [t[:2] for t in holdout]
# holdout_Y = np.array([t[2] for t in holdout])

NameError: name 'dlPreloaded' is not defined

In [45]:
# dlPreload = DataLoaderPreloaded(recipes, dataLoader.all_ratings, dataLoader.recipe_to_raters_test)
# dlPreload.get_recipe_ratings(272320, None, split="test")
# dlPreload.get_user_ratings(1384589, None)
# with open("data/all_ratings_processed.pkl", "wb") as outfile:
#     pickle.dump(dlPreload.all_ratings, outfile)
# dlPreload.all_ratings

# with open("data/recipe_to_raters.pkl", "wb") as outfile:
#     pickle.dump(dlPreload.recipe_to_raters_test, outfile)

# dlPreload.recipe_to_raters_test

# sys.getsizeof(recipes)
# recipeSt = pickle.dumps(recipes)
# sys.getsizeof(recipeSt)

# recipes_compressed = {}
# for recipe_id, recipe_data in recipes.items():
#     ingredients_compressed = {'key ingredients': recipe_data['ingredients']['key ingredients']}
#     ingredients_compressed['full list'] = [
#         {'quantity': i['quantity'],
#          'key ingredient': i['key ingredient']}
#         for i in recipe_data['ingredients']['full list']
#     ]
    
#     recipes_compressed[recipe_id] = {
#         'calories': recipe_data['calories'],
#         'categories': recipe_data['categories'],
#         'ingredients': ingredients_compressed,
#         'servings': recipe_data['servings'],
#         'title': recipe_data['title'],
#         'user ratings': recipe_data['user ratings']
#     }
# recipes_compressed[228101]

with open("data/recipes_compressed.pkl", "wb") as outfile:
    pickle.dump(recipes_compressed, outfile)


In [184]:
class RecipeRecommender:
    """
    
    distance: Function that takes in list of recipe_ids selected so far
        and target recipe_id and computes the distance between them. Higher
        distance scores are considered better for diversity.
    """
    def __init__(self, 
                 dataLoader, 
                 estimator, 
                 diversity_weight=1.0, 
                 distance = "cosine"):
        self.estimator = estimator
        self.dataLoader = dataLoader
        self.recipe_ids = frozenset(dataLoader.get_recipe_ids())
        self.diversity_weight = diversity_weight
        self.diversity = self.get_diversity_calculation(distance)
        self.sim_cache = {}
        
    def get_diversity_calculation(self, distance):
        """
        Returns the function used to calculate how item new_item would affect the diversity
        of recipe set current_items. Higher diversity score is better
        """
        
        def average_cosine_similarity(current_items, new_item_id):
            sim = 0.0
            for recipe_id in current_items:
                sim += self.compute_cosine_similiarity(recipe_id, new_item_id)
            return -sim / len(current_items)
        
        def shared_ingredients(current_items, new_item_id):
            used_ingredients = { i
                                for r_id in current_items
                                for i in self.dataLoader.get_recipe_info(r_id)["ingredients"]}
            new_recipe_ingredients = { i
                                      for i in self.dataLoader.get_recipe_info(new_item_id)["ingredients"]}
    
            return 1.0 - len(new_recipe_ingredients - used_ingredients) / len(new_recipe_ingredients)
    
        if distance == "cosine":
            return average_cosine_similarity
        elif distance == "ingredients":
            return shared_ingredients
        else:
            raise ValueError("Unexpected distance: {}".format(distance))
            
    
    def compute_cosine_similiarity(self, r1, r2):
        key = tuple(sorted([r1, r2]))
        if key not in self.sim_cache:
            ratings1 = self.dataLoader.get_recipe_ratings(r1, None)
            ratings2 = self.dataLoader.get_recipe_ratings(r2, None)
            prod = 0
            for k in ratings1:
                if k in ratings2:
                    prod += ratings1[k] * ratings2[k]
            self.sim_cache[key] = prod / (len(ratings1) * len(ratings2))
        return self.sim_cache[key]


    def compute_user_similarity(self, user_ratings1, user_ratings2): 
        def zero_center_ratings(ratings):
            rating_mean = sum(ratings.values()) / len(ratings)
            return { rid : value - rating_mean for rid, value in ratings.items() }

        def l2_norm(ratings):
            return np.linalg.norm(np.fromiter(ratings.values(), dtype=np.float64))

        if len(user_ratings1) < 2 or len(user_ratings2) < 2:
            return 0.0

        prod = 0.0
        zero_centered1 = zero_center_ratings(user_ratings1)
        zero_centered2 = zero_center_ratings(user_ratings2)
        
        norm1 = l2_norm(zero_centered1)
        norm2 = l2_norm(zero_centered2)
        
        if norm1 == 0.0 or norm2 == 0.0:
            return 0.0
        
        for k in zero_centered1:
            if k in zero_centered2:
                prod += zero_centered1[k] * zero_centered2[k]
        return prod / (norm1 * norm1)
    
    def find_similar_user(self, ratings_profile):
        potential_users = set()
        all_ids = ratings_profile["preferred_recipes"] + ratings_profile["non_preferred_recipes"]
        for recipe_id in all_ids:
            ratings = self.dataLoader.get_recipe_ratings(recipe_id, None, split="test")
            potential_users = potential_users.union(frozenset(ratings.keys()))

        ratings_model = {rid: 5.0 for rid in ratings_profile["preferred_recipes"]}
        for rid in ratings_profile["non_preferred_recipes"]:
            ratings_model[rid] = 3.0

        user_similarity = {}
        for user_id in potential_users:
            user_ratings = self.dataLoader.get_user_ratings(user_id, None)
            sim = self.compute_user_similarity(ratings_model, user_ratings)
            if sim > 0:
                user_similarity[user_id] = sim

        return max(user_similarity.items(), key=operator.itemgetter(1))[0]
    
    def get_recommendations(self, ratings_profile, n_recs=100):
        user_id = self.find_similar_user(ratings_profile)
        preds = self.predict_unrated_recipes(user_id)
        return self.select_diverse_recipes(preds, n_recs)
        
    def predict_unrated_recipes(self, user_id):
        """
        return a sorted list with tuples of recipe id and the user's predicted rating
        for all unrated recipes
        """
        user_rated_recipes = frozenset(self.dataLoader.get_user_ratings(user_id, None))
        unrated_recipes = self.recipe_ids - user_rated_recipes
        X = [(user_id, recipe_id) for recipe_id in unrated_recipes]
        predictions = self.estimator.predict(X)
        res = []
        for i, info in enumerate(X):
            _, recipe_id = info
            res.append((recipe_id, predictions[i]))
        return sorted(res,key=lambda x: -x[1])
    
    def select_diverse_recipes(self, ratings, n_recs, search_limit=4000):
        """
        ratings: list of (recipe_id, rating) tuples, sorted in descending
            order by rating
        search_limit: only searches through the top search_limit rated
            recipes to make predictions (improves speed)
        """
        if search_limit:
            ratings = ratings[:search_limit]
        recs = []
        original_ratings = {id: rating for id, rating in ratings}
        for _ in range(n_recs):
            target_recipe_id, _ = ratings.pop(0)
            recs.append(target_recipe_id)
            new_ratings = []
            for i, item in enumerate(ratings):
                recipe_id, _ = item
                predicted_rating = original_ratings[recipe_id]
                revised_score = predicted_rating + self.diversity_weight * self.diversity(recs, recipe_id)
                if revised_score == predicted_rating:
                    new_ratings = new_ratings + ratings[i:]
                    break
                new_ratings.append((recipe_id, revised_score))
            ratings = sorted(new_ratings, key=lambda x: -x[1])
        return recs
            

In [190]:
# Step 0: If making a prediction based on an unknown user profile of only ratings, find the most similar user
test_ratings_profile = {
    "preferred_recipes" : [201085, 16806, 154183, 90481, 10387],
    "non_preferred_recipes" : [11952, 19125, 19211, 51499, 12984]
}


    
# recommender.find_similar_user(test_ratings_profile)    

            
        

In [191]:
# Step 1: For a given user, run the estimator an all unrated recipes

# estimator = RecipeMeanEstimator(dataLoader, defaultdict(set))
logitEstimator = LogisticEstimator(dataLoader, None, None)
with open("models/logit_trained_vec.pkl", 'rb') as infile:
    logitEstimator.vec = pickle.load(infile)
with open("models/rfcev_trained.pkl", "rb") as infile:
    rfecv = pickle.load(infile)

estimator = PretrainedEstimator(rfecv, logitEstimator)
recommender = RecipeRecommender(dataLoader, estimator, distance="ingredients")
# preds_ing = recommender.predict_unrated_recipes(953814)

In [192]:
# Step 2: Using the estimates, construct the set that balances diversity and predicted utility
# preds_new = recommender.select_diverse_recipes(preds_ing, 100)
# preds_old
# diversity_metric = recommender.get_diversity_calculation("ingredients")
# preds_cos
preds_test = recommender.get_recommendations(test_ratings_profile)
# len(preds_ing)

In [176]:
# recommender.get_recommendations(36964)
# diversity_metric([238270, 240778], 237458)
preds_test3

[262161,
 131107,
 262225,
 131156,
 262229,
 262261,
 196732,
 65663,
 262272,
 65671,
 65686,
 131237,
 262324,
 262326,
 65720,
 65734,
 65759,
 262377,
 65769,
 262392,
 262398,
 262403,
 196879,
 262446,
 262465,
 262468,
 262470,
 262495,
 262496,
 262502,
 262503,
 262510,
 221512,
 65924,
 131523,
 197065,
 262622,
 262647,
 131591,
 66066,
 262717,
 197189,
 66130,
 131668,
 262806,
 262857,
 66294,
 262922,
 262928,
 66326,
 131873,
 131876,
 131891,
 66391,
 131931,
 66396,
 263037,
 263038,
 263054,
 66459,
 263123,
 132065,
 132071,
 132097,
 132109,
 132127,
 78370,
 66646,
 66657,
 132198,
 197751,
 263331,
 78402,
 66782,
 263393,
 263394,
 132326,
 132350,
 132351,
 132358,
 132411,
 132422,
 263498,
 263511,
 263514,
 263521,
 91456,
 132465,
 66933,
 263567,
 132511,
 67002,
 263611,
 263617,
 198099,
 67084,
 132646,
 198182,
 263744,
 67147]

In [36]:
np.all(np.isclose(preds_cos, preds_ing))

True

In [31]:
frozenset(preds).difference(preds_new)

frozenset()

In [139]:
# # preds_new
d1 = {'a': 5, 'b': 3, 'c': 4}
d2 = {'b': 3, 'c': 5, 'd': 2}

# frozenset(d1.keys()).intersection(d2.keys())

user_ratings1 = dataLoader.get_user_ratings(138, None)
user_ratings2 = dataLoader.get_user_ratings(669128, None)
compute_user_similarity(user_ratings1, user_ratings2)

0.11215007691282607

In [199]:
# overlaps= []

# for i, recipe_id in enumerate(preds_test2):
#     this_ingredients = frozenset(recipes[recipe_id]['ingredients']['key ingredients'])
    
#     for other_recipe_id in preds_test[i+1:]:
#         other_ingredients = frozenset(recipes[other_recipe_id]['ingredients']['key ingredients'])
#         pairwise_overlap = len(this_ingredients.intersection(other_ingredients)) / len(this_ingredients.union(other_ingredients))
#         overlaps.append(pairwise_overlap)
    
# np.mean(overlaps)

# for recipe_id in preds_test:
#     print(recipes[recipe_id]['title'])
# user_ratings1

# v1 = np.array([5, 3, 4])
# v1 = v1 - np.mean(v1)
# v2 = np.array([3, 5, 2])
# v2 = v2 - np.mean(v2)

# (-1.0 * (-1/3)) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# np.dot(v1[1:], v2[:2])  / (np.linalg.norm(v1) * np.linalg.norm(v2))

# np.linalg.norm(v2)

# find_similar_user(test_ratings_profile)

frozenset(preds_test) - frozenset(preds_test3) 



frozenset({7032,
           7219,
           7567,
           8513,
           8527,
           8537,
           8553,
           8565,
           8568,
           8609,
           8624,
           8890,
           8900,
           8914,
           8993,
           9194,
           9236,
           9253,
           9257,
           67990,
           68330,
           68697,
           68837,
           68879,
           69274,
           69410,
           69865,
           69903,
           70038,
           70163,
           70282,
           70312,
           70343,
           70404,
           70935,
           71422,
           71632,
           71803,
           71891,
           72022,
           72112,
           72636,
           72876,
           72878,
           73177,
           73297,
           73580,
           79518,
           132703,
           133105,
           133633,
           134182,
           134280,
           136846,
           138163,
           138973,
   

In [163]:


# comp1 = np.random.choice(recipes_10, 5)
# comp2 = np.random.choice(recipes_10, 5)
# print(comp1, comp2) 
# target_user_recipes.intersection(dataLoader.get_user_ratings(669128, None).keys())

1854140