## Import

In [2]:
import pandas as pd
import numpy as np
import pickle

from lightfm import LightFM
from lightfm.data import Dataset



## Load data

In [7]:
path = "C:/Users/leaed/Documents/Techlabs/Mealwheeldata/"
raw_interactions = pd.read_csv(path + "RAW_interactions.csv", sep=",")

In [8]:
raw_interactions = raw_interactions[raw_interactions.rating > 3]

In [9]:
raw_interactions = raw_interactions[["user_id", "recipe_id"]]

## Build model

In [10]:
# LightFM will not use our IDs, but rather internal indices
# It does this, because it needs consecutive non-negative integers (but the input could be anything)
# Therefore, we need a mapping between our IDs and the internal indices
# E.g. user_id = 38094 -> internal_user_id = 1, user_id = 1293707 -> internal_user_id = 2
# Very well explained: https://making.lyst.com/lightfm/docs/examples/dataset.html?highlight=dataset
#building-the-id-mappings

# We can do this easily by using the Dataset class from the LightFM package
dataset = Dataset()
dataset.fit(users=raw_interactions["user_id"], items=raw_interactions["recipe_id"])

In [11]:
# To have an easy way of converting input ids to internal ids, we define a mappings class
class Mappings:
    def __init__(self, dataset: Dataset) -> None:
        """
        userid: user_id
        row: internal user id
        itemid: recipe_id
        column: internal recipe id
        """
        userid2row, _, itemid2col, _ = dataset.mapping()
        self.userid2row = userid2row
        self.itemid2col = itemid2col
        # Invert dictionaries to get mapping in other direction
        self.row2userid = {value: key for key, value in self.userid2row.items()}
        self.col2itemid = {v: k for k, v in self.itemid2col.items()}
        # Use like this: 
        # mappings = Mappings(dataset)
        # mappings.userid2row["axfafe24"]

# And use it:
mappings = Mappings(dataset)
# Example. This returns the internal user id of user_id=38094
mappings.userid2row[1293707]

1

In [12]:
from lightfm import LightFM
from sklearn.base import clone


class LightFMResizable(LightFM):
    """A LightFM that resizes the model to accomodate new users,
    items, and features"""

    def fit_partial(
        self,
        interactions,
        user_features=None,
        item_features=None,
        sample_weight=None,
        epochs=1,
        num_threads=1,
        verbose=False,
    ):
        try:
            self._check_initialized()
            self._resize(interactions, user_features, item_features)
        except ValueError:
            # This is the first call so just fit without resizing
            pass

        super().fit_partial(
            interactions,
            user_features,
            item_features,
            sample_weight,
            epochs,
            num_threads,
            verbose,
        )

        return self

    def _resize(self, interactions, user_features=None, item_features=None):
        """Resizes the model to accommodate new users/items/features"""

        no_components = self.no_components
        no_user_features, no_item_features = interactions.shape  # default

        if hasattr(user_features, "shape"):
            no_user_features = user_features.shape[-1]
        if hasattr(item_features, "shape"):
            no_item_features = item_features.shape[-1]

        if (
            no_user_features == self.user_embeddings.shape[0]
            and no_item_features == self.item_embeddings.shape[0]
        ):
            return self

        new_model = clone(self)
        new_model._initialize(no_components, no_item_features, no_user_features)

        # update all attributes from self._check_initialized
        for attr in (
            "item_embeddings",
            "item_embedding_gradients",
            "item_embedding_momentum",
            "item_biases",
            "item_bias_gradients",
            "item_bias_momentum",
            "user_embeddings",
            "user_embedding_gradients",
            "user_embedding_momentum",
            "user_biases",
            "user_bias_gradients",
            "user_bias_momentum",
        ):
            # extend attribute matrices with new rows/cols from
            # freshly initialized model with right shape
            old_array = getattr(self, attr)
            old_slice = [slice(None, i) for i in old_array.shape]
            new_array = getattr(new_model, attr)
            new_array[tuple(old_slice)] = old_array
            setattr(self, attr, new_array)

        return self

In [13]:
# Then we build the interaction matrix (a table with users as rows and recipes as columns, and a 1 in the cell if the user rated the recipe)
interactions, _ = dataset.build_interactions(raw_interactions.to_records(index=False))

In [13]:
# We could also add item_features (like the ratings)
# https://making.lyst.com/lightfm/docs/examples/dataset.html?highlight=dataset#building-the-interactions-matrix

In [14]:
# Now we can raw_datae model
# This might take a few minutes
model = LightFMResizable(loss="warp", learning_rate=0.05, random_state=42)
model.fit(interactions=interactions, epochs=100)

# Save model to pickle file
filename = "recommendation_model.pkl"
with open(filename, 'wb') as file:  
     pickle.dump(model, file)

# Function for Adrian 

In [25]:
def get_recommendations(a, new_user_recipe_ids):
    """
    Input: 
    a: number of recommendations you want
    new_user_recipe_ids: list 5 liked recipe_ids
    Output:
    output: a recommendations as a json format
    
    Model is in a pickle file
    """
    
    ### Load model from pickle file
    filename = "recommendation_model.pkl" 
    with open(filename, 'rb') as file:  
        model = pickle.load(file)
    
    ### fit_partial new user
    new_user = pd.DataFrame({
        "user_id":  [1,1,1,1,1], 
        "recipe_id":  new_user_recipe_id})
    dataset.fit_partial(users=new_user["user_id"], items=new_user["recipe_id"])
    new_interactions, _ = dataset.build_interactions(new_user.to_records(index=False))
    model.fit_partial(interactions=new_interactions)

    ### Get recommendations for this user
    user_id = 1
    # Get the internal id (or: row) for this user
    user_row = mappings.userid2row[user_id]
    # Get the number of items in the dataset
    _, n_items = dataset.interactions_shape()
    # Get an array with all internal item ids
    item_columns = np.arange(n_items) # [0, 1, 2, ..., 231636]
    # Get the scores for each item (for our user)
    scores = model.predict(user_ids=user_row, item_ids=item_columns)
    # Define a function that sorts the scores and returns the top_n elements
    def get_top_sorted(scores: np.ndarray, top_n):
        """
        Get the top indices sorted descendingly from the scores list array.
        Args:
        scores: An array with scores.
        top_n: The number of top scores to be returned.
        Returns:
            ScoringList: The first element of the tuple is the index where the score was
                    in the original array, the second element is the score itself.
        """
        best_idxs = np.argpartition(scores, -top_n)[-top_n:]
        return sorted(zip(best_idxs, scores[best_idxs]), key=lambda x: -x[1])
    
    sorted_scores_top = get_top_sorted(scores, a)
    
    # Add results to a DataFrame
    recommendations = pd.DataFrame(sorted_scores_top, columns=["internal_item_id", "score"])
    # Add user_id
    recommendations["user_id"] = user_id
    # Add recipe_id
    # Google something like "python apply lambda" to learn more about how this works
    recommendations["recipe_id"] = recommendations["internal_item_id"].apply(lambda x: mappings.col2itemid[x])
    # Drop internal_item_id and reorder other columns
    recommendations = recommendations[["user_id", "recipe_id", "score"]]
    ## Get the names of the recipes our sample user
    # Get all interactions from out test user
    user_recipes = raw_interactions[raw_interactions["user_id"] == user_id]

    # Apply function from above also to this dataframe
    user_recipes["recipe_name"] = user_recipes.apply(lambda x: get_recipe_name(x["recipe_id"]), axis=1)
    raw_recipes = pd.read_csv( "C:/Users/leaed/Documents/Techlabs/Mealwheeldata/RAW_recipes.csv", sep=",")
    
    to_adrian = user_recipes.set_index('recipe_id').join(raw_recipes.set_index('id'))
    to_adrian.drop(['name'], axis = 1, inplace = True)
    to_adrian.drop(['contributor_id'], axis = 1, inplace = True)
    to_adrian.drop(['submitted'], axis = 1, inplace = True)
    to_adrian.drop(['user_id'], axis = 1, inplace = True)

    ### Convert to json 
    output = to_adrian.to_json(orient="records")
    
    ### return json
    return output
    

# Test fuction

In [20]:
new_user_recipe_id = [4065, 10123, 295797, 108524, 10045]

In [28]:
get_recommendations(1, new_user_recipe_ids)
# maybe that helps: https://stackoverflow.com/questions/43196907/valueerror-wrong-number-of-items-passed-meaning-and-suggestions

ValueError: Wrong number of items passed 2, placement implies 1

# Test fit partial

In [None]:
new_user = pd.DataFrame({
    "user_id":  [1,1,1,1,1],
    "recipe_id":  [4065, 10123, 295797, 108524, 10045]
})


In [None]:
dataset.fit_partial(users=new_user["user_id"], items=new_user["recipe_id"])
new_interactions, _ = dataset.build_interactions(new_user.to_records(index=False))

# In production update your old model with new data.
model.fit_partial(interactions=new_interactions)


## Test model
Get recommendations for one sample user and check if the recommendations make sense

In [75]:
# Specify the user for which predictions should be made
user_id = 2001868099
# Get the internal id (or: row) for this user
user_row = mappings.userid2row[user_id]
# Get the number of items in the dataset
_, n_items = dataset.interactions_shape()
# Get an array with all internal item ids
item_columns = np.arange(n_items) # [0, 1, 2, ..., 231636]
# Get the scores for each item (for our user)
scores = model.predict(user_ids=user_row, item_ids=item_columns)
# How to interpret:
# score[0] = recommendation score for internal item id 0
# score[1] = recommendation score for internal item id 1
# ...
# The item with the highest score is most likely to be a good recommendation
scores

array([-1.0169863, -1.3943113, -1.1754183, ..., -2.5216193, -2.3398874,
       -1.7851042], dtype=float32)

In [1]:
# Define a function that sorts the scores and returns the top_n elements
def get_top_sorted(scores: np.ndarray, top_n):
    """
    Get the top indices sorted descendingly from the scores list array.
    Args:
        scores: An array with scores.
        top_n: The number of top scores to be returned.
    Returns:
        ScoringList: The first element of the tuple is the index where the score was
                in the original array, the second element is the score itself.
    """
    best_idxs = np.argpartition(scores, -top_n)[-top_n:]
    return sorted(zip(best_idxs, scores[best_idxs]), key=lambda x: -x[1])

# Example: Use fuction to return top 5 recommendations
sorted_scores_top_10 = get_top_sorted(scores, 10)
sorted_scores_top_10

NameError: name 'np' is not defined

In [77]:
# Add results to a DataFrame
recommendations = pd.DataFrame(sorted_scores_top_10, columns=["internal_item_id", "score"])
# Add user_id
recommendations["user_id"] = user_id
# Add recipe_id
# Google something like "python apply lambda" to learn more about how this works
recommendations["recipe_id"] = recommendations["internal_item_id"].apply(lambda x: mappings.col2itemid[x])
# Drop internal_item_id and reorder other columns
recommendations = recommendations[["user_id", "recipe_id", "score"]]
recommendations

Unnamed: 0,user_id,recipe_id,score
0,2001868099,27208,2.071391
1,2001868099,97496,2.04204
2,2001868099,39087,1.923201
3,2001868099,32204,1.906267
4,2001868099,67256,1.885892
5,2001868099,89204,1.859283
6,2001868099,54257,1.831842
7,2001868099,80156,1.797088
8,2001868099,2886,1.778458
9,2001868099,15411,1.758239


In [78]:
## Get the recipe names for each recipe_id
# Load recipe data
raw_recipes = pd.read_csv(path + "RAW_recipes.csv", sep=",")

# Define function that returns recipe name when given a recipe id
def get_recipe_name(recipe_id):
    return raw_recipes[raw_recipes["id"] == recipe_id]["name"].item()

# Apply this function to every row of the recommendations dataframe (with apply and lambda)
recommendations["recipe_name"] = recommendations.apply(lambda x: get_recipe_name(x["recipe_id"]), axis=1)
recommendations

Unnamed: 0,user_id,recipe_id,score,recipe_name
0,2001868099,27208,2.071391,to die for crock pot roast
1,2001868099,97496,2.04204,soft snickerdoodle cookies
2,2001868099,39087,1.923201,creamy cajun chicken pasta
3,2001868099,32204,1.906267,whatever floats your boat brownies
4,2001868099,67256,1.885892,best ever banana cake with cream cheese frosting
5,2001868099,89204,1.859283,crock pot chicken with black beans cream cheese
6,2001868099,54257,1.831842,yes virginia there is a great meatloaf
7,2001868099,80156,1.797088,the most wonderful gingerbread cookies
8,2001868099,2886,1.778458,best banana bread
9,2001868099,15411,1.758239,impossible peanut butter cookies


In [51]:
raw_recipes = pd.read_csv(path + "RAW_recipes.csv", sep=",")

In [53]:
raw_recipes.tags.unique()

array(["['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']",
       "['30-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'breakfast', 'main-dish', 'pork', 'american', 'oven', 'easy', 'kid-friendly', 'pizza', 'dietary', 'northeastern-united-states', 'meat', 'equipment']",
       "['time-to-make', 'course', 'preparation', 'main-dish', 'chili', 'crock-pot-slow-cooker', 'dietary', 'equipment', '4-hours-or-less']",
       ...,
       "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'preparation', 'appetizers', 'eggs-dairy', 'easy', 'finger-food', 'eggs', 'presentation', 'served-cold', '3-steps-or-less']",
       "['30-minutes-or-less', 'time-to-make', 'course', 'preparation', 'for-large-g

In [55]:
raw_interactions['recipe_id'].value_counts().head(20)

2886      1613
27208     1601
89204     1579
39087     1448
67256     1322
54257     1305
22782     1234
32204     1220
69173      997
68955      904
33919      877
82102      855
25885      847
28148      802
135350     786
26110      770
99476      762
10744      731
129926     730
33671      727
Name: recipe_id, dtype: int64