diff --git a/samples/ML Toolbox/Regression/Census/2 Service Analyze.ipynb b/samples/ML Toolbox/Regression/Census/2 Service Analyze.ipynb deleted file mode 100644 index b5f7917..0000000 --- a/samples/ML Toolbox/Regression/Census/2 Service Analyze.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# About this notebook\n", - "\n", - "This notebook assumes you have ran the local Census Regression notebook and you have not deleted the LOCAL_ROOT folder.In this notebook, we will use BigQuery to analyze the data files for training. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setting things up" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import mltoolbox.regression.dnn as sd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import os\n", - "import tensorflow as tf\n", - "from tensorflow.python.lib.io import file_io\n", - "import google.datalab.ml as ml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook will write files during preprocessing. Please give a root folder you wish to use." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating gs://cloud-ml-dev-census-regression-datalab/...\n", - "ServiceException: 409 Bucket cloud-ml-dev-census-regression-datalab already exists.\n" - ] - } - ], - "source": [ - "LOCAL_ROOT = './census_regression_workspace' # This should be the same as what was used in the local notebook\n", - "CLOUD_ROOT = 'gs://' + datalab_project_id() + '-census-regression-datalab'\n", - "\n", - "# No need to edit anything else in this cell.\n", - "LOCAL_PREPROCESSING_DIR = os.path.join(LOCAL_ROOT, 'preprocessing')\n", - "CLOUD_PREPROCESSING_DIR = os.path.join(CLOUD_ROOT, 'cloud_preprocessing') \n", - "\n", - "LOCAL_TRAIN_FILE = os.path.join(LOCAL_ROOT, 'train.csv')\n", - "CLOUD_TRAIN_FILE = os.path.join(CLOUD_ROOT, 'train.csv')\n", - "\n", - "\n", - "LOCAL_SCHEMA_FILE = os.path.join(LOCAL_ROOT, 'schema.json')\n", - "CLOUD_SCHEMA_FILE = os.path.join(CLOUD_ROOT, 'schema.json')\n", - "\n", - "if not file_io.file_exists(LOCAL_ROOT):\n", - " raise ValueError('LOCAL_ROOT not found. Did you run the local notebook?')\n", - " \n", - "!gsutil mb {CLOUD_ROOT}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let us put the csv files on GCS" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Copying file://./census_regression_workspace/train.csv [Content-Type=text/csv]...\n", - "/ [1 files][162.9 KiB/162.9 KiB] \n", - "Operation completed over 1 objects/162.9 KiB. \n", - "Copying file://./census_regression_workspace/schema.json [Content-Type=application/json]...\n", - "/ [1 files][ 998.0 B/ 998.0 B] \n", - "Operation completed over 1 objects/998.0 B. \n" - ] - } - ], - "source": [ - "!gsutil cp {LOCAL_TRAIN_FILE} {CLOUD_TRAIN_FILE}\n", - "!gsutil cp {LOCAL_SCHEMA_FILE} {CLOUD_SCHEMA_FILE}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Analysis with BigQuery starting from csv files on GCS" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/numerical_analysis.json#1488569767574404...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/schema.json#1488569813706792...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_AGEP.csv#1488569772581087...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_COW.csv#1488569774573759...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_ESP.csv#1488569777206853...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_ESR.csv#1488569779334695...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_FOD1P.csv#1488569781467455...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_HINS4.csv#1488569786408543...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_INDP.csv#1488569788307611...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_JWMNP.csv#1488569789978231...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_JWTR.csv#1488569792433984...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SCIENGRLP.csv#1488569808791439...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SCHL.csv#1488569806838565...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_MAR.csv#1488569794395164...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_PUMA.csv#1488569802418996...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_POWPUMA.csv#1488569796957551...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SERIALNO.csv#1488569770467617...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_RAC1P.csv#1488569804875310...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SEX.csv#1488569810944732...\n", - "Removing gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_WKW.csv#1488569813150444...\n", - "/ [20/20 objects] 100% Done \n", - "Operation completed over 20 objects. \n" - ] - } - ], - "source": [ - "!gsutil -m rm -fr {CLOUD_PREPROCESSING_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "train_csv = ml.CsvDataSet(\n", - " file_pattern=CLOUD_TRAIN_FILE,\n", - " schema_file=CLOUD_SCHEMA_FILE\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Track BigQuery status at\n", - "https://bigquery.cloud.google.com/queries/cloud-ml-dev\n", - "Running numerical analysis...done.\n", - "Running categorical analysis...done.\n" - ] - } - ], - "source": [ - "sd.analyze(\n", - " cloud=True,\n", - " dataset=train_csv,\n", - " output_dir=CLOUD_PREPROCESSING_DIR,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The output of preprocessing is a numerical_analysis file that contains analysis from the numerical columns, and a vocab file from each categorical column. The files produced by preprocessing are consumed in training, and you should not have to worry about these files. Just for fun, lets look at them." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/schema.json\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/stats.json\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_AGEP.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_COW.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_ESP.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_ESR.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_FOD1P.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_HINS4.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_INDP.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_JWMNP.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_JWTR.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_MAR.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_POWPUMA.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_PUMA.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_RAC1P.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SCHL.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SCIENGRLP.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SERIALNO.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_SEX.csv\r\n", - "gs://cloud-ml-dev-census-regression-datalab/cloud_preprocessing/vocab_WKW.csv\r\n" - ] - } - ], - "source": [ - "!gsutil ls {CLOUD_PREPROCESSING_DIR}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"SERIALNO\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"FLOAT\",\r\n", - " \"name\": \"WAGP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"AGEP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"COW\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"ESP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"ESR\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"FOD1P\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"HINS4\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"INDP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"JWMNP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"JWTR\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"MAR\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"POWPUMA\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"PUMA\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"RAC1P\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"SCHL\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"SCIENGRLP\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"SEX\"\r\n", - " },\r\n", - " {\r\n", - " \"type\": \"STRING\",\r\n", - " \"name\": \"WKW\"\r\n", - " }\r\n", - "]" - ] - } - ], - "source": [ - "!gsutil cat {CLOUD_PREPROCESSING_DIR}/schema.json" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Cleaning things up" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to delete the files you made on GCS, uncomment and run the next cell." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "#!gsutil rm -fr {CLOUD_ROOT}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}