In [5]:
import pandas as pd
import numpy as np

In [6]:
# Load the two raw archive tables: the per-review interaction log and the
# recipe metadata it refers to.
interactions_csv = "../data/archive/RAW_interactions.csv"
recipes_csv = "../data/archive/RAW_recipes.csv"

interactions = pd.read_csv(interactions_csv)
recipes = pd.read_csv(recipes_csv)

In [7]:
# Tag every interaction row with a constant 1 so that summing "count" per
# group later yields the number of interactions in that group.
interactions = interactions.assign(count=1)
interactions.head(n=5)

Unnamed: 0,user_id,recipe_id,date,rating,review,count
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,1
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",1
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,1
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,1
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",1


### Top 200 recipe ids by rating count

In [8]:
# Rank recipes by their number of interaction rows (sum of the per-row
# "count" of 1) and keep the 200 with the most.
top_200_recipes = (
    interactions
    .groupby("recipe_id")["count"]
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .iloc[:200]
)
# The frame is indexed by recipe_id, so the index is the list of ids.
top_200_recipe_ids = top_200_recipes.index.tolist()

### Top 75 user ids by rating count

In [9]:
# Rank users by their number of interaction rows (sum of the per-row
# "count" of 1) and keep the 75 most active.
top_75_users = (
    interactions
    .groupby("user_id")["count"]
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .iloc[:75]
)
# The frame is indexed by user_id, so the index is the list of ids.
top_75_users_ids = top_75_users.index.tolist()

### Interactions with top users and top recipes

In [10]:
# Keep only interactions where BOTH the user is one of the top users and
# the recipe is one of the top recipes.
from_top_user = interactions["user_id"].isin(top_75_users_ids)
on_top_recipe = interactions["recipe_id"].isin(top_200_recipe_ids)
top_200_recipes_top_75_users = interactions.loc[from_top_user & on_top_recipe]

In [11]:
# Display the interactions restricted to the top users and top recipes.
top_200_recipes_top_75_users

Unnamed: 0,user_id,recipe_id,date,rating,review,count
6054,5060,30081,2005-03-18,2,Well I'm really sorry but we found these absol...,1
6056,39835,30081,2005-08-26,3,"These were good for sandwich night, but I foun...",1
6064,158086,30081,2006-08-23,5,This was very easy and tasted wonderful...my b...,1
6069,53932,30081,2006-09-08,4,I used ground turkey and thought the spices we...,1
6137,176615,30081,2008-06-06,5,"Great recipe! Superfast and easy, the whole fa...",1
...,...,...,...,...,...,...
1123448,133174,131018,2013-01-28,5,This is on my menu for game day! I made this ...,1
1123461,39835,131018,2013-06-22,5,We loved this - used diet ginger ale but other...,1
1123494,400708,131018,2014-07-15,5,I&#039;ll just add my 5 stars to this easy rec...,1
1129713,140132,43072,2008-06-25,4,These were really good. I had some cottage ch...,1


In [12]:
# Size of the filtered data set: total rows, plus how many distinct recipes
# and users actually appear in it.
num_interactions = top_200_recipes_top_75_users.shape[0]
num_recipes = top_200_recipes_top_75_users["recipe_id"].nunique()
num_users = top_200_recipes_top_75_users["user_id"].nunique()

In [13]:
# Report the distinct-user, distinct-recipe and row counts of the filtered set.
summary_line = f"Num users: {num_users}, Num recipes: {num_recipes}, Num final interactions: {num_interactions}"
print(summary_line)

Num users: 74, Num recipes: 199, Num final interactions: 1659


In [14]:
# Share of the users x recipes grid covered by interaction rows,
# expressed as a percentage.
grid_cells = num_users * num_recipes
fill_ratio = num_interactions / grid_cells
percent_full = fill_ratio * 100
print(f"The matrix will be {percent_full:.2f}% full")

The matrix will be 11.27% full


In [15]:
# Independent (deep) copy of the filtered interactions.
final_interactions = top_200_recipes_top_75_users.copy(deep=True)

### Building the sparse user × recipe rating matrix

In [16]:
# Reshape the filtered interactions into a users (rows) x recipes (columns)
# grid of ratings. User/recipe pairs with no interaction come out as NaN;
# pivot_table's default aggregation applies if a pair occurs more than once.
user_recipe_matrix = pd.pivot_table(
    top_200_recipes_top_75_users,
    values="rating",
    index="user_id",
    columns="recipe_id",
)

In [17]:
#user_recipe_matrix

#### Check nulls

In [18]:
# (number of users, number of recipes) in the pivoted rating matrix.
user_recipe_matrix.shape

(74, 199)

In [19]:
# For each user (row), count the recipes that have no rating; the resulting
# single-column frame has the default column label 0.
nan_recipe = user_recipe_matrix.isna().sum(axis="columns").to_frame()
# Show the users with more than 195 unrated recipes.
nan_recipe.loc[nan_recipe[0] > 195]

Unnamed: 0_level_0,0
user_id,Unnamed: 1_level_1
58104,196
107583,197
163112,197
169430,197
573325,198


In [20]:
#nan_recipe

In [21]:
# Pull the metadata rows for the top-200 recipe ids out of the recipes table.
is_top_recipe = recipes["id"].isin(top_200_recipe_ids)
top200_recipe_df = recipes.loc[is_top_recipe]
top200_recipe_df.head(n=5)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
574,whatever floats your boat brownies,32204,35,37305,2002-06-25,"['60-minutes-or-less', 'time-to-make', 'course...","[390.7, 30.0, 161.0, 7.0, 12.0, 50.0, 17.0]",14,"['preheat oven to 350f', 'grease an 8 inch squ...","these are absolutely the chewiest, moistest, f...","['butter', 'unsweetened cocoa', 'sugar', 'eggs...",14
1455,4 minute spicy garlic shrimp,107997,17,52074,2005-01-10,"['30-minutes-or-less', 'time-to-make', 'course...","[152.4, 21.0, 0.0, 8.0, 10.0, 9.0, 0.0]",6,"['heat a large skillet over medium high heat',...",recipe is originally from the food network cou...,"['shrimp', 'olive oil', 'garlic cloves', 'red ...",8
1598,5 minute vegan pancakes,132263,15,233464,2005-08-03,"['15-minutes-or-less', 'time-to-make', 'course...","[444.8, 25.0, 45.0, 54.0, 20.0, 10.0, 21.0]",8,"['set out all your ingredients', 'set a stove ...","a cinch to make and tastes wonderful, if you l...","['flour', 'sugar', 'baking powder', 'salt', 's...",6
2288,absolute best ever lasagna,28768,150,37779,2002-05-18,"['weeknight', 'time-to-make', 'course', 'main-...","[517.1, 45.0, 15.0, 34.0, 73.0, 75.0, 8.0]",16,"['brown ground meat , onion and garlic', 'add ...",my family's favorite. this is a very meaty las...,"['lean ground beef', 'italian sausage', 'onion...",17
4599,amazing chicken marinade,29598,35,35193,2002-05-29,"['60-minutes-or-less', 'time-to-make', 'course...","[337.6, 25.0, 72.0, 33.0, 51.0, 12.0, 6.0]",8,"['in a large , non-reactive container , whisk ...",this recipe came from allrecipes (ruth crickme...,"['cider vinegar', 'whole grain mustard', 'garl...",10


In [22]:
# Peek at one raw "tags" value (row at position 2); the output shows it is
# a single string holding a list literal, not an actual Python list.
top200_recipe_df["tags"].iloc[2]

"['15-minutes-or-less', 'time-to-make', 'course', 'preparation', 'occasion', 'for-1-or-2', 'pancakes-and-waffles', 'breakfast', 'easy', 'beginner-cook', 'vegan', 'vegetarian', 'dietary', 'low-cholesterol', 'low-saturated-fat', 'high-calcium', 'egg-free', 'free-of-something', 'high-in-something', 'low-in-something', 'brunch', 'number-of-servings']"

In [23]:
# Number of recipe rows matched by the top-200 id filter.
len(top200_recipe_df)

200

### Output for later use

In [24]:
# Persist the top-200 recipe metadata; the DataFrame index is not written.
top200_recipe_df.to_csv(path_or_buf="../data/recipe_200.csv", index=False)

In [25]:
# Display the copied subset of interactions (top users x top recipes).
final_interactions

Unnamed: 0,user_id,recipe_id,date,rating,review,count
6054,5060,30081,2005-03-18,2,Well I'm really sorry but we found these absol...,1
6056,39835,30081,2005-08-26,3,"These were good for sandwich night, but I foun...",1
6064,158086,30081,2006-08-23,5,This was very easy and tasted wonderful...my b...,1
6069,53932,30081,2006-09-08,4,I used ground turkey and thought the spices we...,1
6137,176615,30081,2008-06-06,5,"Great recipe! Superfast and easy, the whole fa...",1
...,...,...,...,...,...,...
1123448,133174,131018,2013-01-28,5,This is on my menu for game day! I made this ...,1
1123461,39835,131018,2013-06-22,5,We loved this - used diet ginger ale but other...,1
1123494,400708,131018,2014-07-15,5,I&#039;ll just add my 5 stars to this easy rec...,1
1129713,140132,43072,2008-06-25,4,These were really good. I had some cottage ch...,1


### Checking users

In [26]:
# Load the PP_users.csv table from the archive for inspection.
users_csv = "../data/archive/PP_users.csv"
user = pd.read_csv(users_csv)

In [27]:
# Preview the first rows of the users table.
user.head()

Unnamed: 0,u,techniques,items,n_items,ratings,n_ratings
0,0,"[8, 0, 0, 5, 6, 0, 0, 1, 0, 9, 1, 0, 0, 0, 1, ...","[1118, 27680, 32541, 137353, 16428, 28815, 658...",31,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, 4.0, ...",31
1,1,"[11, 0, 0, 2, 12, 0, 0, 0, 0, 14, 5, 0, 0, 0, ...","[122140, 77036, 156817, 76957, 68818, 155600, ...",39,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ...",39
2,2,"[13, 0, 0, 7, 5, 0, 1, 2, 1, 11, 0, 1, 0, 0, 1...","[168054, 87218, 35731, 1, 20475, 9039, 124834,...",27,"[3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, ...",27
3,3,"[498, 13, 4, 218, 376, 3, 2, 33, 16, 591, 10, ...","[163193, 156352, 102888, 19914, 169438, 55772,...",1513,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 5.0, 5.0, ...",1513
4,4,"[161, 1, 1, 86, 93, 0, 0, 11, 2, 141, 0, 16, 0...","[72857, 38652, 160427, 55772, 119999, 141777, ...",376,"[5.0, 5.0, 5.0, 5.0, 4.0, 4.0, 5.0, 4.0, 5.0, ...",376
