This repository has been archived by the owner on Jul 31, 2023. It is now read-only.

Commit c0c3f1c

Add tutorial on structured data conversion.
This changes types.FloatInput to use tf.float32 for its feature_spec
attribute, to address a potential incompatibility with the tf.float64
type in TensorFlow Transform.
cfezequiel committed Oct 16, 2020
1 parent fa18803 commit c0c3f1c
Showing 8 changed files with 403 additions and 20 deletions.
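
The types.py change called out in the commit message is not among the two file diffs excerpted below, but its effect is visible in the notebook output further down (Age and Fare come back as float32). A minimal sketch of what the change amounts to, assuming FloatInput exposes its feature_spec as a tf.io.FixedLenFeature; the names below are hypothetical stand-ins, not the actual tfrecorder source:

import tensorflow as tf

class FloatInput:
    # Hypothetical stand-in for tfrecorder.types.FloatInput, illustrating the
    # dtype switch described in the commit message.
    # Before this commit the spec was tf.io.FixedLenFeature([], tf.float64),
    # which the commit message flags as potentially incompatible with
    # TensorFlow Transform.
    feature_spec = tf.io.FixedLenFeature([], tf.float32)

Since tf.train.Example stores float features as 32-bit values anyway, declaring the spec as tf.float32 also matches what actually lands in the generated TFRecord files.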
2 changes: 1 addition & 1 deletion Makefile
@@ -7,6 +7,6 @@ test:
 	nosetests --with-coverage -v --cover-package=tfrecorder
 
 pylint:
-	pylint tfrecorder
+	pylint -j 0 tfrecorder
 
 .PHONY: all init test pylint
383 changes: 383 additions & 0 deletions samples/Convert-structured-data.ipynb
@@ -0,0 +1,383 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Convert structured data to TFRecords "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Error importing tfx_bsl_extension.arrow.array_util. Some tfx_bsl functionalities are not available"
]
}
],
"source": [
"import pandas as pd\n",
"import pathlib\n",
"\n",
"import tensorflow as tf\n",
"\n",
"import tfrecorder\n",
"from tfrecorder import input_schema\n",
"from tfrecorder import types"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load [Titanic](https://www.openml.org/d/40945) dataset "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"data_path = pathlib.Path('/tmp/datasets/titanic.csv')\n",
"if not data_path.exists():\n",
" tf.keras.utils.get_file(\n",
" 'titanic.csv',\n",
" origin='https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',\n",
" extract=False,\n",
" cache_dir='/tmp', cache_subdir='datasets')\n",
" \n",
"assert data_path.exists()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(str(data_path))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add `split` column "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>Siblings/Spouses Aboard</th>\n",
" <th>Parents/Children Aboard</th>\n",
" <th>Fare</th>\n",
" <th>split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Mr. Owen Harris Braund</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>7.2500</td>\n",
" <td>TRAIN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Mrs. John Bradley (Florence Briggs Thayer) Cum...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>71.2833</td>\n",
" <td>TRAIN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Miss. Laina Heikkinen</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7.9250</td>\n",
" <td>TRAIN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Mrs. Jacques Heath (Lily May Peel) Futrelle</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>53.1000</td>\n",
" <td>TRAIN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Mr. William Henry Allen</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8.0500</td>\n",
" <td>TRAIN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Name \\\n",
"0 0 3 Mr. Owen Harris Braund \n",
"1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... \n",
"2 1 3 Miss. Laina Heikkinen \n",
"3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle \n",
"4 0 3 Mr. William Henry Allen \n",
"\n",
" Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare \\\n",
"0 male 22.0 1 0 7.2500 \n",
"1 female 38.0 1 0 71.2833 \n",
"2 female 26.0 0 0 7.9250 \n",
"3 female 35.0 1 0 53.1000 \n",
"4 male 35.0 0 0 8.0500 \n",
"\n",
" split \n",
"0 TRAIN \n",
"1 TRAIN \n",
"2 TRAIN \n",
"3 TRAIN \n",
"4 TRAIN "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['split'] = 'TRAIN'\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert to TFRecords "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/javascript": [
"\n",
" if (typeof window.interactive_beam_jquery == 'undefined') {\n",
" var jqueryScript = document.createElement('script');\n",
" jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n",
" jqueryScript.type = 'text/javascript';\n",
" jqueryScript.onload = function() {\n",
" var datatableScript = document.createElement('script');\n",
" datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n",
" datatableScript.type = 'text/javascript';\n",
" datatableScript.onload = function() {\n",
" window.interactive_beam_jquery = jQuery.noConflict(true);\n",
" window.interactive_beam_jquery(document).ready(function($){\n",
" \n",
" });\n",
" }\n",
" document.head.appendChild(datatableScript);\n",
" };\n",
" document.head.appendChild(jqueryScript);\n",
" } else {\n",
" window.interactive_beam_jquery(document).ready(function($){\n",
" \n",
" });\n",
" }"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/javascript": [
"\n",
" var import_html = () => {\n",
" ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n",
" var link = document.createElement('link');\n",
" link.rel = 'import'\n",
" link.href = href;\n",
" document.head.appendChild(link);\n",
" });\n",
" }\n",
" if ('import' in document.createElement('link')) {\n",
" import_html();\n",
" } else {\n",
" var webcomponentScript = document.createElement('script');\n",
" webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n",
" webcomponentScript.type = 'text/javascript';\n",
" webcomponentScript.onload = function(){\n",
" import_html();\n",
" };\n",
" document.head.appendChild(webcomponentScript);\n",
" }"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"results = tfrecorder.create_tfrecords(\n",
" df, \n",
" './tfrecords', \n",
" schema=input_schema.Schema({\n",
" 'Survived': types.IntegerInput,\n",
" 'Pclass': types.IntegerInput,\n",
" 'Name': types.StringInput,\n",
" 'Sex': types.StringInput,\n",
" 'Age': types.FloatInput,\n",
" 'Siblings/Spuses Aboard': types.IntegerInput,\n",
" 'Parents/Children Aboard': types.IntegerInput,\n",
" 'Fare': types.FloatInput,\n",
" 'split': types.SplitKey,\n",
" })\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load a dataset from the generated TFRecord files "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"datasets = tfrecorder.load('tfrecords/tfrecorder-20201014-115328-create-tfrecords')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Age: <dtype: 'float32'>\n",
"Fare: <dtype: 'float32'>\n",
"Name: <dtype: 'string'>\n",
"Parents/Children Aboard: <dtype: 'int64'>\n",
"Pclass: <dtype: 'int64'>\n",
"Sex: <dtype: 'string'>\n",
"Siblings/Spuses Aboard: <dtype: 'int64'>\n",
"Survived: <dtype: 'int64'>\n",
"split: <dtype: 'string'>\n"
]
}
],
"source": [
"for x in datasets['TRAIN'].take(1):\n",
" for k, v in x.items():\n",
" print(f'{k}: {v.dtype}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
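
The last notebook cell confirms the dtypes tfrecorder wrote: float32 for Age and Fare (consistent with the FloatInput change in this commit), int64 for the integer columns, and string for the rest. Those dtypes are enough to read the files back with plain tf.data instead of tfrecorder.load. A minimal sketch, assuming uncompressed TFRecord files matching a train* pattern inside the output directory shown above; the actual file names, sharding, and compression produced by tfrecorder may differ:

import tensorflow as tf

# Feature spec reconstructed from the dtypes printed in the notebook's last cell.
feature_spec = {
    'Survived': tf.io.FixedLenFeature([], tf.int64),
    'Pclass': tf.io.FixedLenFeature([], tf.int64),
    'Name': tf.io.FixedLenFeature([], tf.string),
    'Sex': tf.io.FixedLenFeature([], tf.string),
    'Age': tf.io.FixedLenFeature([], tf.float32),
    'Siblings/Spouses Aboard': tf.io.FixedLenFeature([], tf.int64),
    'Parents/Children Aboard': tf.io.FixedLenFeature([], tf.int64),
    'Fare': tf.io.FixedLenFeature([], tf.float32),
    'split': tf.io.FixedLenFeature([], tf.string),
}

# Hypothetical file pattern; adjust it to whatever tfrecorder actually wrote,
# and pass compression_type='GZIP' to TFRecordDataset if the files are gzipped.
files = tf.data.Dataset.list_files(
    'tfrecords/tfrecorder-20201014-115328-create-tfrecords/train*')

dataset = tf.data.TFRecordDataset(files).map(
    lambda record: tf.io.parse_single_example(record, feature_spec))

for example in dataset.take(1):
    print({name: tensor.dtype for name, tensor in example.items()})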