This repository has been archived by the owner on Jul 31, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add tutorial on structured data conversion.
This changes types.FloatInput to use tf.float32 for its feature_spec attribute to address potential incompatibility with using tf.float64 type in TensorFlow Transform.
- Loading branch information
1 parent
fa18803
commit c0c3f1c
Showing
8 changed files
with
403 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,383 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Convert structured data to TFRecords " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"The autoreload extension is already loaded. To reload it, use:\n", | ||
" %reload_ext autoreload\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"%load_ext autoreload\n", | ||
"%autoreload 2" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Error importing tfx_bsl_extension.arrow.array_util. Some tfx_bsl functionalities are not available" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import pathlib\n", | ||
"\n", | ||
"import tensorflow as tf\n", | ||
"\n", | ||
"import tfrecorder\n", | ||
"from tfrecorder import input_schema\n", | ||
"from tfrecorder import types" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Load [Titanic](https://www.openml.org/d/40945) dataset " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Local cache location for the Titanic CSV. The download arguments below are\n", | ||
"# derived from this path so the existence checks and the download target\n", | ||
"# cannot drift apart.\n", | ||
"data_path = pathlib.Path('/tmp/datasets/titanic.csv')\n", | ||
"if not data_path.exists():\n", | ||
"    tf.keras.utils.get_file(\n", | ||
"        data_path.name,\n", | ||
"        origin='https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',\n", | ||
"        extract=False,\n", | ||
"        cache_dir=str(data_path.parent.parent),\n", | ||
"        cache_subdir=data_path.parent.name)\n", | ||
"\n", | ||
"assert data_path.exists(), f'Titanic CSV was not downloaded to {data_path}'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Read the cached CSV into a DataFrame.\n", | ||
"df = pd.read_csv(data_path)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Add `split` column " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>Survived</th>\n", | ||
" <th>Pclass</th>\n", | ||
" <th>Name</th>\n", | ||
" <th>Sex</th>\n", | ||
" <th>Age</th>\n", | ||
" <th>Siblings/Spouses Aboard</th>\n", | ||
" <th>Parents/Children Aboard</th>\n", | ||
" <th>Fare</th>\n", | ||
" <th>split</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>0</td>\n", | ||
" <td>3</td>\n", | ||
" <td>Mr. Owen Harris Braund</td>\n", | ||
" <td>male</td>\n", | ||
" <td>22.0</td>\n", | ||
" <td>1</td>\n", | ||
" <td>0</td>\n", | ||
" <td>7.2500</td>\n", | ||
" <td>TRAIN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>1</td>\n", | ||
" <td>1</td>\n", | ||
" <td>Mrs. John Bradley (Florence Briggs Thayer) Cum...</td>\n", | ||
" <td>female</td>\n", | ||
" <td>38.0</td>\n", | ||
" <td>1</td>\n", | ||
" <td>0</td>\n", | ||
" <td>71.2833</td>\n", | ||
" <td>TRAIN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>1</td>\n", | ||
" <td>3</td>\n", | ||
" <td>Miss. Laina Heikkinen</td>\n", | ||
" <td>female</td>\n", | ||
" <td>26.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>7.9250</td>\n", | ||
" <td>TRAIN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>1</td>\n", | ||
" <td>1</td>\n", | ||
" <td>Mrs. Jacques Heath (Lily May Peel) Futrelle</td>\n", | ||
" <td>female</td>\n", | ||
" <td>35.0</td>\n", | ||
" <td>1</td>\n", | ||
" <td>0</td>\n", | ||
" <td>53.1000</td>\n", | ||
" <td>TRAIN</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>0</td>\n", | ||
" <td>3</td>\n", | ||
" <td>Mr. William Henry Allen</td>\n", | ||
" <td>male</td>\n", | ||
" <td>35.0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>0</td>\n", | ||
" <td>8.0500</td>\n", | ||
" <td>TRAIN</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" Survived Pclass Name \\\n", | ||
"0 0 3 Mr. Owen Harris Braund \n", | ||
"1 1 1 Mrs. John Bradley (Florence Briggs Thayer) Cum... \n", | ||
"2 1 3 Miss. Laina Heikkinen \n", | ||
"3 1 1 Mrs. Jacques Heath (Lily May Peel) Futrelle \n", | ||
"4 0 3 Mr. William Henry Allen \n", | ||
"\n", | ||
" Sex Age Siblings/Spouses Aboard Parents/Children Aboard Fare \\\n", | ||
"0 male 22.0 1 0 7.2500 \n", | ||
"1 female 38.0 1 0 71.2833 \n", | ||
"2 female 26.0 0 0 7.9250 \n", | ||
"3 female 35.0 1 0 53.1000 \n", | ||
"4 male 35.0 0 0 8.0500 \n", | ||
"\n", | ||
" split \n", | ||
"0 TRAIN \n", | ||
"1 TRAIN \n", | ||
"2 TRAIN \n", | ||
"3 TRAIN \n", | ||
"4 TRAIN " | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Tag every row with the 'TRAIN' split value.\n", | ||
"df = df.assign(split='TRAIN')\n", | ||
"df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Convert to TFRecords " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"application/javascript": [ | ||
"\n", | ||
" if (typeof window.interactive_beam_jquery == 'undefined') {\n", | ||
" var jqueryScript = document.createElement('script');\n", | ||
" jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", | ||
" jqueryScript.type = 'text/javascript';\n", | ||
" jqueryScript.onload = function() {\n", | ||
" var datatableScript = document.createElement('script');\n", | ||
" datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", | ||
" datatableScript.type = 'text/javascript';\n", | ||
" datatableScript.onload = function() {\n", | ||
" window.interactive_beam_jquery = jQuery.noConflict(true);\n", | ||
" window.interactive_beam_jquery(document).ready(function($){\n", | ||
" \n", | ||
" });\n", | ||
" }\n", | ||
" document.head.appendChild(datatableScript);\n", | ||
" };\n", | ||
" document.head.appendChild(jqueryScript);\n", | ||
" } else {\n", | ||
" window.interactive_beam_jquery(document).ready(function($){\n", | ||
" \n", | ||
" });\n", | ||
" }" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"data": { | ||
"application/javascript": [ | ||
"\n", | ||
" var import_html = () => {\n", | ||
" ['https://raw.githubusercontent.com/PAIR-code/facets/1.0.0/facets-dist/facets-jupyter.html'].forEach(href => {\n", | ||
" var link = document.createElement('link');\n", | ||
" link.rel = 'import'\n", | ||
" link.href = href;\n", | ||
" document.head.appendChild(link);\n", | ||
" });\n", | ||
" }\n", | ||
" if ('import' in document.createElement('link')) {\n", | ||
" import_html();\n", | ||
" } else {\n", | ||
" var webcomponentScript = document.createElement('script');\n", | ||
" webcomponentScript.src = 'https://cdnjs.cloudflare.com/ajax/libs/webcomponentsjs/1.3.3/webcomponents-lite.js';\n", | ||
" webcomponentScript.type = 'text/javascript';\n", | ||
" webcomponentScript.onload = function(){\n", | ||
" import_html();\n", | ||
" };\n", | ||
" document.head.appendChild(webcomponentScript);\n", | ||
" }" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"# Schema keys become the feature names stored in the TFRecords, so each key\n", | ||
"# is spelled exactly like its DataFrame column.\n", | ||
"results = tfrecorder.create_tfrecords(\n", | ||
"    df,\n", | ||
"    './tfrecords',\n", | ||
"    schema=input_schema.Schema({\n", | ||
"        'Survived': types.IntegerInput,\n", | ||
"        'Pclass': types.IntegerInput,\n", | ||
"        'Name': types.StringInput,\n", | ||
"        'Sex': types.StringInput,\n", | ||
"        'Age': types.FloatInput,\n", | ||
"        'Siblings/Spouses Aboard': types.IntegerInput,\n", | ||
"        'Parents/Children Aboard': types.IntegerInput,\n", | ||
"        'Fare': types.FloatInput,\n", | ||
"        'split': types.SplitKey,\n", | ||
"    })\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Load a dataset from the generated TFRecord files " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the most recent conversion job rather than a hard-coded run directory,\n", | ||
"# so the notebook still works after Restart & Run All. Job directory names\n", | ||
"# embed a YYYYMMDD-HHMMSS timestamp, so the lexicographic maximum is the\n", | ||
"# latest run.\n", | ||
"latest_job_dir = max(pathlib.Path('./tfrecords').glob('tfrecorder-*-create-tfrecords'))\n", | ||
"datasets = tfrecorder.load(str(latest_job_dir))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Age: <dtype: 'float32'>\n", | ||
"Fare: <dtype: 'float32'>\n", | ||
"Name: <dtype: 'string'>\n", | ||
"Parents/Children Aboard: <dtype: 'int64'>\n", | ||
"Pclass: <dtype: 'int64'>\n", | ||
"Sex: <dtype: 'string'>\n", | ||
"Siblings/Spuses Aboard: <dtype: 'int64'>\n", | ||
"Survived: <dtype: 'int64'>\n", | ||
"split: <dtype: 'string'>\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Inspect one element of the TRAIN dataset and report each feature's dtype.\n", | ||
"for example in datasets['TRAIN'].take(1):\n", | ||
"    for feature_name, tensor in example.items():\n", | ||
"        print(f'{feature_name}: {tensor.dtype}')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.8" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 4 | ||
} |
Oops, something went wrong.