# Convert structured data to TFRecords 

In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to library code take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import pathlib

import tensorflow as tf

import tfrecorder
from tfrecorder import input_schema
from tfrecorder import types

## Load [Titanic](https://www.openml.org/d/40945) dataset 

In [None]:
# Local cache location of the Titanic CSV. Keras stores downloads under
# <cache_dir>/<cache_subdir>/<fname>, which resolves to this same path.
data_path = pathlib.Path('/tmp/datasets/titanic.csv')

# Only hit the network when the file is not already cached.
if not data_path.exists():
  tf.keras.utils.get_file(
      fname='titanic.csv',
      origin='https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',
      extract=False,
      cache_dir='/tmp',
      cache_subdir='datasets')

# Fail fast if the file is not where the following cells expect it.
assert data_path.exists()

In [None]:
# Read the cached CSV into a DataFrame; read_csv accepts path objects directly.
df = pd.read_csv(data_path)

## Add `split` column 

In [None]:
# Assign the constant 'train' to a new `split` column, i.e. every row gets
# the same split label.
df['split'] = 'train'
# Last expression in the cell: shows the first rows, including the new column.
df.head()

## Convert to TFRecords 

In [None]:
# Map each DataFrame column to the tfrecorder input type used to encode it.
# `split` is declared as types.SplitKey -- presumably the column tfrecorder
# uses to route rows to per-split outputs; confirm against tfrecorder docs.
titanic_schema = input_schema.Schema({
    'Survived': types.IntegerInput,
    'Pclass': types.IntegerInput,
    'Name': types.StringInput,
    'Sex': types.StringInput,
    'Age': types.FloatInput,
    'Siblings/Spouses Aboard': types.IntegerInput,
    'Parents/Children Aboard': types.IntegerInput,
    'Fare': types.FloatInput,
    'split': types.SplitKey,
})

# Convert the DataFrame, passing './tfrecords' as the output location.
results = tfrecorder.convert(df, './tfrecords', schema=titanic_schema)

In [None]:
# Show what convert() returned; the loading step below reads its
# 'tfrecord_dir' entry.
print(results)

## Load a dataset from the generated TFRecord files 

In [None]:
# Load the generated TFRecords from the directory reported by convert().
# The returned object is indexed by split name below (e.g. 'train').
datasets = tfrecorder.load(results['tfrecord_dir'])

In [None]:
# Peek at a single element of the 'train' split and print the dtype of each
# of its features.
for example in datasets['train'].take(1):
  for feature_name, feature_value in example.items():
    print(f'{feature_name}: {feature_value.dtype}')