In [43]:
import pandas as pd

import tensorflow as tf
from transformers import DistilBertTokenizer

# Recipe Recommendation System

## part 3 - Dataset Manipulation

## Interactions

In [44]:
interactions_raw = pd.read_csv("data/foodRecSys-V1_kaggle/raw-data_interaction.csv")
interactions_raw

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
0,8542392,222388,5,2017-04-22T12:46:43.663\n
1,11174581,222388,5,2013-06-20T15:50:25.96\n
2,8262477,222388,5,2015-02-14T07:27:51.307\n
3,3574785,240488,5,2017-10-07T18:20:08.973\n
4,12145410,240488,2,2018-01-06T00:06:09.563\n
...,...,...,...,...
3793998,12318233,170710,5,2013-11-03T11:09:47.763\n
3793999,51226,170710,5,2009-07-18T19:33:36.317\n
3794000,1476783,170710,5,2009-07-27T10:21:45.823\n
3794001,1245064,79774,4,2006-01-21T20:57:09.327\n


In [45]:
interactions_raw.dateLastModified = pd.to_datetime(interactions_raw.dateLastModified, format = "ISO8601")

For the retrieval task we need only positive interactions, so we decide to keep only reviews that rate the recipe 4 or 5.

In [46]:
interactions = interactions_raw[interactions_raw.rating.isin([4, 5])]
len(interactions)

3390626

An this is where we barbaricly hack our dataset with no other good reason than lack of time and computational power. To say it nicely we keep only recipes with resent activity. 

In [47]:
interactions = interactions[interactions.dateLastModified.dt.year > 2015]
len(interactions)

446301

In [48]:
recipes_unique = interactions.recipe_id.unique()
len(recipes_unique)

37374

We sort the data by date and split it in train, validation and test set. Most resent go to the test set.

In [49]:
interactions.sort_values(by=["dateLastModified"])

Unnamed: 0,user_id,recipe_id,rating,dateLastModified
3564587,5902044,6847,5,2016-01-01 00:21:25.293
2610452,15248620,58642,5,2016-01-01 00:28:23.240
2422813,5902044,240489,5,2016-01-01 00:28:56.727
3105167,13168891,25075,4,2016-01-01 01:23:55.833
1776366,16758355,24705,5,2016-01-01 01:24:53.490
...,...,...,...,...
3724673,11022908,254658,4,2018-03-15 22:25:02.653
3712577,21540277,7483,5,2018-03-15 22:59:39.947
3762930,3439303,234692,4,2018-03-15 23:06:54.910
3778837,13820627,12054,4,2018-03-16 00:59:01.163


In [50]:
test_set_size = 10_000

interactions_train = interactions_resent[: -2 * test_set_size]
interactions_validation = interactions_resent[-2 * test_set_size : -test_set_size]
interactions_test = interactions_resent[-test_set_size :]

In [51]:
len(interactions_train) + len(interactions_validation) + len(interactions_test)

446301

In [52]:
interactions_train.to_csv("./data/interactions_train.csv", index = False)
interactions_validation.to_csv("./data/interactions_train.csv", index = False)
interactions_test.to_csv("./data/interactions_train.csv", index = False)

## Recipe details

In [71]:
cols_to_keep = [
    "recipe_id",
    "recipe_name", 
    # "aver_rate", 
    # "review_nums", 
    # "ingredients", 
    # "cooking_directions", 
    ]

In [72]:
recipes_raw = pd.read_csv("data/foodRecSys-V1_kaggle/raw-data_recipe.csv", usecols = cols_to_keep)

In [66]:
len(recipes_unique)

37374

In [78]:
recipes = recipes_raw[recipes_raw.recipe_id.isin(set(recipes_unique))]
recipes

Unnamed: 0,recipe_id,recipe_name
0,222388,Homemade Bacon
1,240488,"Pork Loin, Apples, and Sauerkraut"
2,218939,Foolproof Rosemary Chicken Wings
3,87211,Chicken Pesto Paninis
4,245714,Potato Bacon Pizza
...,...,...
49687,262370,Amaretto Coffee Creamer
49688,240132,Almost-Homemade Buttermilk
49692,159723,Richard and Suzanne's Famous Spaghetti Sauce
49695,23544,Peach Smoothie


Here we make a proto file with for each unique recipe containing it's details. Text features are tokenized with DistilBERT. We choose 32 as our maximum number of tokens as we know from the EDA that the name of most recipe is under 10 words.

In [74]:
SEQUENCE_LEN = 32

In [75]:
dbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [76]:
dbert_tokenizer(["Ten word long recipe name chocolate cake with sour cherries"], add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)

{'input_ids': [[101, 2702, 2773, 2146, 17974, 2171, 7967, 9850, 2007, 14768, 24188, 5134, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [79]:
recipes[["recipe_name_input_ids", "recipe_name_attention_mask"]] = recipes.recipe_name.apply(lambda x: dbert_tokenizer(x,add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)).apply(pd.Series)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipes[["recipe_name_input_ids", "recipe_name_attention_mask"]] = recipes.recipe_name.apply(lambda x: dbert_tokenizer(x,add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)).apply(pd.Series)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipes[["recipe_name_input_ids", "recipe_name_attention_mask"]] = recipes.recipe_name.apply(lambda x: dbert_tokenizer(x,add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)).apply(pd.Series

In [80]:
recipes.head()

Unnamed: 0,recipe_id,recipe_name,recipe_name_input_ids,recipe_name_attention_mask
0,222388,Homemade Bacon,"[101, 25628, 11611, 102, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,240488,"Pork Loin, Apples, and Sauerkraut","[101, 15960, 8840, 2378, 1010, 18108, 1010, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
2,218939,Foolproof Rosemary Chicken Wings,"[101, 7966, 18907, 18040, 7975, 4777, 102, 0, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,87211,Chicken Pesto Paninis,"[101, 7975, 20739, 2080, 6090, 5498, 2015, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,245714,Potato Bacon Pizza,"[101, 14557, 11611, 10733, 102, 0, 0, 0, 0, 0,...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [81]:
# from https://www.tensorflow.org/tutorials/load_data/tfrecord

def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a float_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


In [82]:
def _int64_list_feature(values):
    """Returns an int64_list from a list of integers."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

In [88]:
def serialize_example(recipe_info, image_dir_path = "./data/foodRecSys-V1_kaggle/raw-data-images/raw-data-images/"):
  """
  Creates a tf.train.Example message ready to be written to a file.
  """

  recipe_id = recipe_info.recipe_id

  # Create a dictionary mapping the feature name to the tf.train.Example-compatible
  # data type.
  feature = {
      "recipe_id": _int64_feature(recipe_id),
      "recipe_name": _bytes_feature(bytes(recipe_info.recipe_name, "utf-8")),
      "recipe_name_input_ids": _int64_list_feature(recipe_info.recipe_name_input_ids),
      "recipe_name_attention_mask": _int64_list_feature(recipe_info.recipe_name_attention_mask),
      "recipe_image": _bytes_feature(tf.io.read_file(f"{image_dir_path}{recipe_id}.jpg")),
    #   "aver_rate": _float_feature(recipe_info.aver_rate),
    #   "review_nums": _int64_feature(recipe_info.review_nums),
    #   "ingredients": _bytes_feature(bytes(recipe_info.ingredients, "utf-8")), # replace ^ with maybe
    #   "cooking_directions": _bytes_feature(bytes(recipe_info.cooking_directions, "utf-8")),
    #   "reviews": _bytes_feature(bytes(recipe_info.reviews, "utf-8")),
      
  }

  # Create a Features message using tf.train.Example.

  example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
  
  return recipe_id, example_proto.SerializeToString()

In [89]:
def create_proto_file(pandas_row, dir_path = "./data/recipe_proto_files/"):
    
    recipe_id, serialized_example = serialize_example(pandas_row)

    proto_filename = f"{dir_path}{recipe_id}.proto"
    
    tf.io.write_file(proto_filename, serialized_example)


In [90]:
recipes.apply(create_proto_file, axis = 1)

0        None
1        None
2        None
3        None
4        None
         ... 
49687    None
49688    None
49692    None
49695    None
49696    None
Length: 37374, dtype: object

In [91]:
proto_file_path = "./data/recipe_proto_files/6663.proto"

In [92]:
# Read and decode the proto file
# raw_dataset = tf.data.TFRecordDataset([proto_file_path])

feature_description = {
    "recipe_id": tf.io.FixedLenFeature([], tf.int64),
    "recipe_name": tf.io.FixedLenFeature([], tf.string),
    "recipe_name_input_ids": tf.io.FixedLenFeature([SEQUENCE_LEN], tf.int64),
    "recipe_name_attention_mask": tf.io.FixedLenFeature([SEQUENCE_LEN], tf.int64),
    "recipe_image": tf.io.FixedLenFeature([], tf.string),
    # "aver_rate": tf.io.FixedLenFeature([], tf.float32),
    # "review_nums": tf.io.FixedLenFeature([], tf.int64),
    # "ingredients": tf.io.FixedLenFeature([], tf.string),
    # "cooking_directions": tf.io.FixedLenFeature([], tf.string),
    # "reviews": tf.io.FixedLenFeature([], tf.string),
}

In [94]:
tf.io.parse_single_example(tf.io.read_file(proto_file_path), feature_description)

{'recipe_id': <tf.Tensor: shape=(), dtype=int64, numpy=6663>,
 'recipe_image': <tf.Tensor: shape=(), dtype=string, numpy=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\x04\x02\x03\x03\x03\x02\x04\x03\x03\x03\x04\x04\x04\x04\x05\t\x06\x05\x05\x05\x05\x0b\x08\x08\x06\t\r\x0b\r\r\r\x0b\x0c\x0c\x0e\x10\x14\x11\x0e\x0f\x13\x0f\x0c\x0c\x12\x18\x12\x13\x15\x16\x17\x17\x17\x0e\x11\x19\x1b\x19\x16\x1a\x14\x16\x17\x16\xff\xdb\x00C\x01\x04\x04\x04\x05\x05\x05\n\x06\x06\n\x16\x0f\x0c\x0f\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\x16\xff\xc0\x00\x11\x08\x00\xfa\x00\xfa\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x0