In [1]:
import tensorflow as tf
from tensorflow.compat.v1.python_io import TFRecordWriter
from tensorflow.train import Feature, BytesList, Int64List, Example
from transformers import GPT2TokenizerFast

2.1.0


# Load dataset and tokenize data

In [2]:
# Read the scraped wine corpus; every line of the text file becomes one entry.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
wines_path = "C:/Users/david/Documents/github/this-wine-does-not-exist/data/scraped/name_desc_nlp_ready.txt"
with open(wines_path, mode='r', encoding='utf8') as wines_file:
    wines_raw = wines_file.read().splitlines()
n_raw = len(wines_raw)
print(f"Loaded wine dataset of length: {n_raw:,}")

# Keep only wines whose description is long enough to be useful.
# The description is the text that follows the first "[description]" marker
# (up to the next marker, if the line contains more than one).
description_marker = "[description]"
min_description_chars = 150  # descriptions of this length or shorter are dropped

wines_clean = []
for wine in wines_raw:
    parts = wine.split(description_marker)
    if len(parts) < 2:
        # No marker on this line: skip it explicitly rather than relying on a
        # bare `except`, which would also hide unrelated errors.
        continue
    if len(parts[1]) > min_description_chars:
        wines_clean.append(wine)
print(f"Cleaned dataset has {len(wines_clean):,} samples")

# Load the pretrained GPT-2 fast tokenizer.
# Only the GPT2TokenizerFast class is imported in this notebook (not the
# `transformers` module itself), so the class is referenced directly.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
print("Loaded tokenizer")


# Register the dataset's marker strings with the tokenizer.
# NOTE(review): eos_token is set to '<|startoftext|>' (identical to bos_token)
# while '<|endoftext|>' is only added as an ordinary token, and pad_token then
# aliases eos_token — confirm this is intended before training on the output.
special_tokens = {
    'eos_token': '<|startoftext|>',
    'bos_token': '<|startoftext|>',
}
tokenizer.add_special_tokens(special_tokens)

marker_tokens = [
    '[prompt]', '[response]', '[category_1]', '[category_2]',
    '[origin]', '[description]', '<|endoftext|>',
]
tokenizer.add_tokens(marker_tokens)

# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
print("Modified tokenizer tokens")

# Persisting the modified tokenizer is currently disabled:
#tokenizer_path = f'./tokenizer_gpt2'
#tokenizer.save_pretrained(tokenizer_path)
#print(f"Saved tokenizer to {tokenizer_path}")

# Tokenize every cleaned sample, truncating to at most `max_tokens` tokens and
# padding the batch so the sequences can be stored with a uniform layout.
max_tokens = 250
wine_encodings = tokenizer(
    wines_clean,
    truncation=True,
    padding=True,
    max_length=max_tokens,
)
encoded_fields = wine_encodings.keys()
print(f"Encoded dataset with attributes: {encoded_fields}")
n_encoded = len(wine_encodings['input_ids'])
print(f"Total encoded samples: {n_encoded:,}")

Loaded wine dataset of length: 246,584
Cleaned dataset has 71,634 samples
Loaded tokenizer
Modified tokenizer tokens
Saved tokenizer to ./tokenizer_gpt2
Encoded dataset


In [23]:
# Show the tokenizer's reported vocabulary size.
# NOTE(review): the recorded output (50257) is unchanged even though tokens were
# added earlier, so this value apparently excludes added tokens; len(tokenizer)
# is presumably the right number for resizing model embeddings — confirm.
tokenizer.vocab_size

50257

# Serialize to TFRecord

In [19]:
# Serialize the dataset to a TFRecord file: one tf.train.Example per cleaned
# wine, holding the raw text (UTF-8 bytes), its token ids and attention mask.
tfrecord_file_name = "scraped_wines_tfr"

input_ids_all = wine_encodings['input_ids']
attention_masks_all = wine_encodings['attention_mask']
# zip() silently stops at the shortest input, so verify alignment up front;
# a mismatch means the tokenization cell is stale relative to wines_clean.
assert len(wines_clean) == len(input_ids_all) == len(attention_masks_all), \
    "wines_clean and wine_encodings are out of sync; re-run the tokenization cell"

with tf.io.TFRecordWriter(tfrecord_file_name) as writer:
    for wine_desc, input_ids, attention_mask in zip(
            wines_clean, input_ids_all, attention_masks_all):
        features = tf.train.Features(
            feature={
                'text': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[wine_desc.encode('utf-8')])),
                'input_ids': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=input_ids)),
                'attention_mask': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=attention_mask)),
            }
        )
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())