# Convert structured data to TFRecords 

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import pathlib

import tensorflow as tf

import tfrecorder
from tfrecorder import input_schema
from tfrecorder import types

Error importing tfx_bsl_extension.arrow.array_util. Some tfx_bsl functionalities are not available

## Load [Titanic](https://www.openml.org/d/40945) dataset 

In [3]:
# Where Keras places the file: <cache_dir>/<cache_subdir>/<fname>.
data_path = pathlib.Path('/tmp/datasets/titanic.csv')

# Only fetch the CSV when it has not been downloaded before.
needs_download = not data_path.exists()
if needs_download:
  tf.keras.utils.get_file(
      fname='titanic.csv',
      origin='https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',
      cache_dir='/tmp',
      cache_subdir='datasets',
      extract=False)

# Fail fast if the download did not land at the expected location.
assert data_path.exists()

Downloading data from https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv


In [4]:
# Load the downloaded CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=str(data_path))

## Add `split` column 

In [5]:
# Assign every row to the training split. The value must be upper-case:
# the schema maps this column to `types.SplitKey`, whose accepted keys are
# TRAIN / VALIDATION / TEST / DISCARD per the TFRecorder documentation
# (lower-case 'train' does not match the recorded output of this cell).
df['split'] = 'TRAIN'
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,split
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25,TRAIN
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,TRAIN
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925,TRAIN
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1,TRAIN
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05,TRAIN


## Convert to TFRecords 

In [6]:
# Declare which TFRecorder input type each DataFrame column maps to.
# `split` is the column that carries the per-row split assignment.
titanic_schema = input_schema.Schema({
    'Survived': types.IntegerInput,
    'Pclass': types.IntegerInput,
    'Name': types.StringInput,
    'Sex': types.StringInput,
    'Age': types.FloatInput,
    'Siblings/Spouses Aboard': types.IntegerInput,
    'Parents/Children Aboard': types.IntegerInput,
    'Fare': types.FloatInput,
    'split': types.SplitKey,
})

# Convert the DataFrame and write the TFRecord files under ./tfrecords.
results = tfrecorder.convert(df, './tfrecords', schema=titanic_schema)

In [7]:
# Show the summary returned by `tfrecorder.convert`; its 'tfrecord_dir' entry
# is what the load step below reads.
print(results)

{'job_id': 'DirectRunner', 'metrics': {'rows': 887, 'good_images': None, 'bad_images': None}, 'tfrecord_dir': './tfrecords/tfrecorder-20201027-173544-create-tfrecords'}


## Load a dataset from the generated TFRecord files 

In [8]:
# The conversion result records where the TFRecord files were written.
tfrecord_dir = results['tfrecord_dir']

# Load the generated files into a mapping of datasets keyed by split.
datasets = tfrecorder.load(tfrecord_dir)

In [9]:
# Take a single element from the training dataset and report the dtype of
# each feature it contains.
for example in datasets['train'].take(1):
  for feature_name, tensor in example.items():
    print(f'{feature_name}: {tensor.dtype}')

Age: <dtype: 'float32'>
Fare: <dtype: 'float32'>
Name: <dtype: 'string'>
Parents/Children Aboard: <dtype: 'int64'>
Pclass: <dtype: 'int64'>
Sex: <dtype: 'string'>
Siblings/Spouses Aboard: <dtype: 'int64'>
Survived: <dtype: 'int64'>
split: <dtype: 'string'>
