In [74]:
import pandas as pd
# import matplotlib.pyplot as plt

import tensorflow as tf
from transformers import DistilBertTokenizer


In [75]:
recipes = pd.read_csv("../mock_data/mock-data_recipe_10.csv")

In [76]:
recipes.columns

Index(['Unnamed: 0', 'recipe_id', 'recipe_name', 'aver_rate', 'image_url',
       'review_nums', 'ingredients', 'cooking_directions', 'nutritions',
       'reviews'],
      dtype='object')

In [78]:
SEQUENCE_LEN = 32

In [79]:
dbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [80]:
dbert_tokenizer(["duck"])

{'input_ids': [[101, 9457, 102]], 'attention_mask': [[1, 1, 1]]}

In [81]:
dbert_tokenizer(["duck"], add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)

{'input_ids': [[101, 9457, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [82]:
recipes[["recipe_name_input_ids", "recipe_name_attention_mask"]] = recipes.recipe_name.apply(lambda x: dbert_tokenizer(x,add_special_tokens = True, max_length = SEQUENCE_LEN, padding = "max_length", truncation = True)).apply(pd.Series)

In [83]:
# recipes["recipe_name_input_ids"][0]

In [84]:
# recipes

from https://www.tensorflow.org/tutorials/load_data/tfrecord

In [85]:
def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a float_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [86]:
def _int64_list_feature(values):
    """Returns an int64_list from a list of integers."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

In [87]:
def serialize_example(recipe_info, image_dir_path = "../data/foodRecSys-V1_kaggle/raw-data-images/raw-data-images/"):
  """
  Creates a tf.train.Example message ready to be written to a file.
  """

  recipe_id = recipe_info.recipe_id

  # Create a dictionary mapping the feature name to the tf.train.Example-compatible
  # data type.
  feature = {
      "recipe_id": _int64_feature(recipe_id),
      "recipe_name": _bytes_feature(bytes(recipe_info.recipe_name, "utf-8")),
      "recipe_name_input_ids": _int64_list_feature(recipe_info.recipe_name_input_ids),
      "recipe_name_attention_mask": _int64_list_feature(recipe_info.recipe_name_attention_mask),
      "recipe_image": _bytes_feature(tf.io.read_file(f"{image_dir_path}{recipe_id}.jpg")),
      "aver_rate": _float_feature(recipe_info.aver_rate),
      "review_nums": _int64_feature(recipe_info.review_nums),
      "ingredients": _bytes_feature(bytes(recipe_info.ingredients, "utf-8")), # replace ^ with maybe
      "cooking_directions": _bytes_feature(bytes(recipe_info.ingredients, "utf-8")),
      "reviews": _bytes_feature(bytes(recipe_info.reviews, "utf-8")),
      
  }

  # Create a Features message using tf.train.Example.

  example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
  
  return recipe_id, example_proto.SerializeToString()

In [88]:
def create_proto_file(pandas_row, dir_path = "../mock_data/output_proto_files/"):
    
    recipe_id, serialized_example = serialize_example(pandas_row)

    proto_filename = f"{dir_path}{recipe_id}.proto"
    # with tf.io.TFRecordWriter(proto_filename) as writer:
    tf.io.write_file(proto_filename, serialized_example)

    return f"Proto file '{proto_filename}' created."

In [89]:
recipes.apply(create_proto_file, axis = 1)

0    Proto file '../mock_data/output_proto_files/22...
1    Proto file '../mock_data/output_proto_files/24...
2    Proto file '../mock_data/output_proto_files/21...
3    Proto file '../mock_data/output_proto_files/87...
4    Proto file '../mock_data/output_proto_files/24...
5    Proto file '../mock_data/output_proto_files/21...
6    Proto file '../mock_data/output_proto_files/20...
7    Proto file '../mock_data/output_proto_files/24...
8    Proto file '../mock_data/output_proto_files/22...
9    Proto file '../mock_data/output_proto_files/25...
dtype: object

In [90]:
proto_filename = "../mock_data/output_proto_files/87211.proto"

In [91]:
# Read and decode the proto file
raw_dataset = tf.data.TFRecordDataset([proto_filename])

feature_description = {
    "recipe_id": tf.io.FixedLenFeature([], tf.int64),
    "recipe_name": tf.io.FixedLenFeature([], tf.string),
    "recipe_name_input_ids": tf.io.FixedLenFeature([SEQUENCE_LEN], tf.int64),
    "recipe_name_attention_mask": tf.io.FixedLenFeature([SEQUENCE_LEN], tf.int64),
    "recipe_image": tf.io.FixedLenFeature([], tf.string),
    "aver_rate": tf.io.FixedLenFeature([], tf.float32),
    "review_nums": tf.io.FixedLenFeature([], tf.int64),
    "ingredients": tf.io.FixedLenFeature([], tf.string),
    "cooking_directions": tf.io.FixedLenFeature([], tf.string),
    "reviews": tf.io.FixedLenFeature([], tf.string),
}



In [92]:
tf.io.parse_single_example(tf.io.read_file(proto_filename), feature_description)

{'aver_rate': <tf.Tensor: shape=(), dtype=float32, numpy=4.625>,
 'cooking_directions': <tf.Tensor: shape=(), dtype=string, numpy=b'focaccia bread quartered^prepared basil pesto^diced cooked chicken^diced green bell pepper^diced red onion^shredded Monterey Jack cheese'>,
 'ingredients': <tf.Tensor: shape=(), dtype=string, numpy=b'focaccia bread quartered^prepared basil pesto^diced cooked chicken^diced green bell pepper^diced red onion^shredded Monterey Jack cheese'>,
 'recipe_id': <tf.Tensor: shape=(), dtype=int64, numpy=87211>,
 'recipe_image': <tf.Tensor: shape=(), dtype=string, numpy=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xdb\x00C\x00\x04\x02\x03\x03\x03\x02\x04\x03\x03\x03\x04\x04\x04\x04\x05\t\x06\x05\x05\x05\x05\x0b\x08\x08\x06\t\r\x0b\r\r\r\x0b\x0c\x0c\x0e\x10\x14\x11\x0e\x0f\x13\x0f\x0c\x0c\x12\x18\x12\x13\x15\x16\x17\x17\x17\x0e\x11\x19\x1b\x19\x16\x1a\x14\x16\x17\x16\xff\xdb\x00C\x01\x04\x04\x04\x05\x05\x05\n\x06\x06\n\x16\x0f\x0c\x0f\x16\x16\x16