In [1]:
import os
import sys
sys.path.append('..')

from scripts.data_processing import load_processed_recipes, load_raw_recipes

import pandas as pd
import numpy as np
import numba

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## Load the data

In [50]:
# Load the pre-processed recipe table from ../data/processed.
recipes_df = load_processed_recipes(data_folder_path=os.path.join('..', 'data', 'processed'))
# NOTE: `id` is left as a regular column rather than moved to the index;
# the cells below look recipes up via recipes_df["id"] and slice the remaining
# columns with `.iloc[:, 1:]`.

In [None]:
# Row(s) for recipe 38, without the leading column, as a 2-D array.
recipes_df.loc[recipes_df["id"] == 38].iloc[:, 1:].to_numpy()

In [64]:
# Same lookup, with length-1 axes removed so a single match becomes a 1-D vector.
np.squeeze(recipes_df.loc[recipes_df["id"] == 38].iloc[:, 1:].to_numpy())

(967,)

In [58]:
# Every column except the first, as a NumPy array (one row per recipe).
# Column 0 is skipped because the lookups above use it as the recipe `id`.
final_np_array = recipes_df.iloc[:, 1:].to_numpy()

In [4]:
@numba.jit(nopython=True, parallel=True)
def fast_cosine_matrix(u, M):
    """Cosine similarity between vector ``u`` and every row of ``M``.

    For each row ``v = M[i]`` the dot product and both norms are accumulated
    only over positions ``j`` where neither ``u[j]`` nor ``v[j]`` is NaN, so a
    NaN in either vector removes that position from the comparison for that row.

    Parameters
    ----------
    u : 1-D numeric array of length ``m``.
    M : 2-D numeric array; only the first ``m`` entries of each row are read.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``M.shape[0]``; entry ``i`` is the
        similarity between ``u`` and ``M[i]``. When either vector has zero
        norm over the compared positions the cosine is undefined and ``0.0``
        is returned. Reporting a perfect score there would make empty rows
        rank as the best match; ``0.0`` also matches scikit-learn's
        ``cosine_similarity`` convention for zero vectors.
    """
    n_rows = M.shape[0]
    m = u.shape[0]  # loop-invariant, so computed once instead of per row
    scores = np.zeros(n_rows)
    for i in numba.prange(n_rows):
        v = M[i]
        # Float accumulators keep the types stable for the nopython compiler.
        udotv = 0.0
        u_sq = 0.0
        v_sq = 0.0
        for j in range(m):
            if np.isnan(u[j]) or np.isnan(v[j]):
                continue

            udotv += u[j] * v[j]
            u_sq += u[j] * u[j]
            v_sq += v[j] * v[j]

        if (u_sq == 0.0) or (v_sq == 0.0):
            # Undefined cosine: treat as "no similarity".
            scores[i] = 0.0
        else:
            scores[i] = udotv / (np.sqrt(u_sq) * np.sqrt(v_sq))
    return scores

In [87]:
def get_top_n_recipe_ids(recipes_df, recipe_id, top_n=5):
    """Return the ids of the ``top_n`` recipes most similar to ``recipe_id``.

    Parameters
    ----------
    recipes_df : pandas.DataFrame
        Must contain an ``id`` column. Every column after the first is used as
        a feature (this assumes ``id`` is column 0, as the rest of the
        notebook does).
    recipe_id : value looked up in ``recipes_df["id"]``.
    top_n : int, default 5
        Number of neighbours to return; values <= 0 yield an empty list.

    Returns
    -------
    list
        Up to ``top_n`` recipe ids ordered from least to most similar (the
        best match is last), which is the order this function has always used.

    Raises
    ------
    KeyError
        If ``recipe_id`` does not occur in ``recipes_df["id"]``.
    """
    features = recipes_df.iloc[:, 1:].to_numpy()

    query_positions = np.flatnonzero((recipes_df["id"] == recipe_id).to_numpy())
    if query_positions.size == 0:
        raise KeyError(f"recipe_id {recipe_id!r} not found in recipes_df['id']")
    if top_n <= 0:
        return []
    # If the id is duplicated, use its first occurrence as the query row.
    query_pos = query_positions[0]

    scores = fast_cosine_matrix(features[query_pos], features)
    order = np.argsort(scores)
    # Remove the query by position. Dropping "the last element" is wrong when
    # another row ties with the query's self-similarity: the query may then not
    # be last, so a real neighbour would be discarded and the query returned.
    order = order[order != query_pos]
    top_positions = order[-top_n:]
    return list(recipes_df["id"].iloc[top_positions])

In [90]:
# Example: ids of the recipes ranked closest to recipe 38 (default top_n=5).
get_top_n_recipe_ids(recipes_df, 38)

[211779, 410225, 310642, 53600, 375996]

In [5]:
# M is the full feature matrix; u is its first row, used as the query vector.
M = final_np_array
u = M[0]

In [6]:
# Sanity check: (rows, columns) of the feature matrix.
M.shape

(231637, 967)

In [21]:
# Similarity of the query vector u against every row of M (one score per row).
res = fast_cosine_matrix(u, M)

In [37]:
# Should be 1-D with one entry per row of M.
res.shape

(231637,)

In [65]:
# Positions of the 2nd- to 6th-highest scores, ascending (the top score is skipped).
np.argsort(res)[-6:-1]

array([118102, 202397, 164065,  30211, 190300])

In [75]:
# Map those row positions back to recipe ids.
list(recipes_df["id"].iloc[np.argsort(res)[-6:-1]])

[211779, 410225, 310642, 53600, 375996]

In [18]:
# Drop the leading (id) column first, then downcast the remaining columns to float16.
# Casting the whole frame first would also push the ids through float16, where
# anything above 65504 overflows to inf, only for that column to be discarded.
abc = recipes_df.iloc[:, 1:].astype(np.float16)

In [9]:
# float16 feature matrix for the dot-product experiments below.
# The leading id column is excluded: it is a label, not a feature, and ids
# above float16's maximum (65504) would turn into inf and poison every
# dot product that touches them.
recipes_array = recipes_df.iloc[:, 1:].to_numpy(dtype=np.float16)

In [None]:
# WARNING: this materialises a dense (n_rows x n_rows) matrix. If recipes_array
# still has the 231,637 rows reported by `M.shape` above, that is ~5.4e10
# entries (roughly 100 GB even in float16), so run it on a row subset only.
similarity = np.dot(recipes_array, recipes_array.T)

In [21]:
# Confirm the conversion produced a plain NumPy array.
type(recipes_array)

numpy.ndarray

In [None]:
def cosine_similarity_matrix(A):
    """Pairwise cosine similarity between the rows of ``A`` via one dot product.

    Parameters
    ----------
    A : 2-D array-like of shape (n_rows, n_features).

    Returns
    -------
    numpy.ndarray
        (n_rows, n_rows) matrix whose ``[i, j]`` entry is the cosine
        similarity of rows ``i`` and ``j``. Rows with zero magnitude get
        similarity 0 against everything, themselves included.

    Notes
    -----
    The result is dense, so memory grows with ``n_rows ** 2``.
    """
    A = np.asarray(A)
    similarity = np.dot(A, A.T)

    # squared magnitude of each row vector (diagonal of the Gram matrix)
    square_mag = np.diag(similarity)

    # inverse squared magnitude; zero rows divide by zero on purpose and are
    # patched on the next line, so the warning is silenced here
    with np.errstate(divide="ignore"):
        inv_square_mag = 1 / square_mag

    # if a row is all zeros, set its inverse magnitude to zero (instead of inf)
    inv_square_mag[np.isinf(inv_square_mag)] = 0

    # inverse of the magnitude
    inv_mag = np.sqrt(inv_square_mag)

    # cosine similarity: scale columns, then rows, by the inverse magnitudes
    cosine = similarity * inv_mag
    return cosine.T * inv_mag

In [None]:
# Work on the first 100 recipes only: the pairwise matrix is n x n, so the
# full table is far too large to materialise.
# (`recipes_df_norm` is not defined anywhere in this notebook, so the subset is
# taken from `recipes_df`.)
subest_recipes_df = recipes_df.iloc[:100, :]
# Build the recipe-to-recipe cosine-similarity matrix from the feature columns
# of that same subset. The leading id column is a label, not a feature, and is
# only used for the row/column names.
csmatrix = cosine_similarity(subest_recipes_df.iloc[:, 1:])
csmatrix = pd.DataFrame(csmatrix, columns=subest_recipes_df['id'], index=subest_recipes_df['id'])
csmatrix.head()

id,38,39,40,41,43,45,48,49,50,52,...,207,209,212,213,215,216,223,224,226,229
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38,1.0,0.993135,0.998412,0.997251,0.993387,0.997475,0.994331,0.996211,0.997216,0.998368,...,0.998957,0.998879,0.998927,0.998987,0.998468,0.998384,0.998885,0.998957,0.999091,0.998938
39,0.993135,1.0,0.993135,0.993196,0.991388,0.992666,0.992831,0.994599,0.992596,0.992582,...,0.994379,0.994662,0.994501,0.994519,0.995097,0.994857,0.994502,0.99462,0.994774,0.994639
40,0.998412,0.993135,1.0,0.995956,0.991886,0.998118,0.993245,0.995298,0.998012,0.999356,...,0.999062,0.999123,0.999155,0.99912,0.998728,0.998432,0.999011,0.999174,0.998943,0.999138
41,0.997251,0.993196,0.995956,1.0,0.993954,0.995904,0.994646,0.995992,0.994849,0.996292,...,0.997159,0.997233,0.99714,0.997239,0.997195,0.996982,0.997211,0.997166,0.997311,0.997281
43,0.993387,0.991388,0.991886,0.993954,1.0,0.992715,0.997897,0.994834,0.991393,0.991233,...,0.992753,0.993084,0.993064,0.993313,0.992814,0.992872,0.99297,0.993063,0.993611,0.993417


In [None]:
def most_similar_recipe(recipes_df, recipes_df_processed, recipe_id, top_n=10):
    """Return the names and ids of the ``top_n`` recipes most similar to ``recipe_id``.

    Parameters
    ----------
    recipes_df : pandas.DataFrame
        Table with at least ``id`` and ``name`` columns, used to look names up.
    recipes_df_processed : pandas.DataFrame
        Table with an ``id`` column; every other column is treated as a numeric
        feature for the cosine similarity.
    recipe_id : id of the query recipe, looked up in ``recipes_df_processed['id']``.
    top_n : int, default 10
        Number of neighbours to return.

    Returns
    -------
    (top_recipe_names, top_recipe_ids) : tuple of two lists
        Both ordered from most to least similar. Each entry of
        ``top_recipe_names`` is the ``name`` Series selected from
        ``recipes_df`` for that id (not a bare string), as before.

    Raises
    ------
    KeyError
        If ``recipe_id`` is not present in ``recipes_df_processed['id']``.
    """
    top_recipe_names = []
    top_recipe_ids = []

    # Use the frame that was passed in (not a notebook global), and keep the id
    # column out of the feature vectors: it is a label, not a feature.
    ids = recipes_df_processed['id']
    features = recipes_df_processed.drop(columns=['id'])

    is_query = (ids == recipe_id).to_numpy()
    if not is_query.any():
        raise KeyError(f"recipe_id {recipe_id!r} not found in recipes_df_processed['id']")

    # Only the query's row of the similarity matrix is needed, so compare that
    # single row against all recipes instead of building the full n x n matrix.
    query_features = features.loc[is_query].iloc[[0]]
    scores = cosine_similarity(query_features, features)[0]

    # Exclude the query by identity rather than by assuming it sorts first;
    # ties would otherwise let it slip into the results.
    candidates = pd.Series(scores[~is_query], index=ids.to_numpy()[~is_query])
    most_similar_recipes = candidates.sort_values(ascending=False).index[:top_n]

    for recipe in most_similar_recipes:
        top_recipe_names.append(recipes_df[recipes_df['id'] == recipe]['name'])
        top_recipe_ids.append(recipe)

    # Return the top names and ids
    return top_recipe_names, top_recipe_ids
    

In [None]:
# NOTE: from here on `recipes_df` is the RAW recipe table (names, tags, ...),
# no longer the processed feature frame loaded at the top of the notebook.
# Re-run the load cell before re-running any of the earlier cells.
recipes_df = load_raw_recipes(data_folder_path=os.path.join('..', 'data'))
recipes_df_processed = load_processed_recipes(data_folder_path=os.path.join('..', 'data', 'processed'))
subset_df = recipes_df_processed.iloc[:100, :]

top_recipes, top_recipe_ids = most_similar_recipe(recipes_df, subset_df, 39, top_n=10)
# .squeeze() turns the single matching row into the bare name, so the line
# prints the recipe name instead of a pandas Series repr.
print(f"Top Recipe name: {recipes_df.loc[recipes_df['id'] == 39, 'name'].squeeze()}")
display(top_recipes)
display(top_recipe_ids)

Top Recipe name: 23695    biryani
Name: name, dtype: object


[46956    chicken fried steak with cracked pepper gravy
 Name: name, dtype: object,
 32064    butterflied lamb with garlic butter
 Name: name, dtype: object,
 54270    citrus baked halibut
 Name: name, dtype: object,
 54546    clam   lobster bake
 Name: name, dtype: object,
 2012    a new spaghetti with clams
 Name: name, dtype: object,
 53707    cinnamon curry rice
 Name: name, dtype: object,
 42595    chicken and broccoli lasagna
 Name: name, dtype: object,
 36729    carrie s pizza rolls
 Name: name, dtype: object,
 49636    chocolate cashew spread
 Name: name, dtype: object,
 65682    crock pot chocolate zucchini nut bread
 Name: name, dtype: object]

[187, 109, 201, 196, 115, 205, 112, 141, 216, 215]

## Generate Predicted Ratings

In [57]:
# Inspect the full raw record for recipe id 39 (the query recipe used above).
recipes_df[recipes_df['id'] == 39]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
23695,biryani,39,265,1567,1999-08-29,"['weeknight', 'time-to-make', 'course', 'main-...","[1110.7, 90.0, 81.0, 15.0, 126.0, 82.0, 28.0]",17,['soak saffron in warm milk for 5 minutes and ...,"delhi, india","['saffron', 'milk', 'hot green chili peppers',...",26
