0.15.x (#67)
0.15.x refactor and DP updates
johntmyers committed Nov 13, 2020
1 parent 6c15f8b commit b1e34b2
Showing 45 changed files with 2,910 additions and 1,167 deletions.
33 changes: 33 additions & 0 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
name: Integration Tests

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - name: Setup Python
        uses: actions/setup-python@v1
        with:
          python-version: 3.6
      - name: Install
        run: |
          pip install -e .
          pip install -r test-requirements.txt
      - name: Test
        run: pytest -s -vv --cov src --cov-report term-missing tests-integration/
      - uses: 8398a7/action-slack@v2
        with:
          status: ${{ job.status }}
          author_name: Integration Tests
          only_mention_fail: here
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        if: always()
10 changes: 9 additions & 1 deletion .github/workflows/main.yml → .github/workflows/unit-tests.yml
@@ -1,4 +1,4 @@
-name: gretel-synthetics workflows
+name: Unit Tests

on: [push]

@@ -23,3 +23,11 @@ jobs:
          flake8 --count --exit-zero --max-complexity=30 --max-line-length=120 --statistics src/
      - name: Test
        run: pytest -s -vv --cov src --cov-report term-missing tests/
      - uses: 8398a7/action-slack@v2
        with:
          status: ${{ job.status }}
          author_name: Unit Test
          only_mention_fail: here
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        if: always()
5 changes: 5 additions & 0 deletions docs/api/tokenizers.rst
@@ -0,0 +1,5 @@
Tokenizers
===========

.. automodule:: gretel_synthetics.tokenizers
:members:
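The new `gretel_synthetics.tokenizers` module documented above includes trainable tokenizers such as `CharTokenizerTrainer`, used throughout the notebooks in this commit. As a rough illustration of what a character-level tokenizer persists (compare the `char2idx.p` / `idx2char.p` demo artifacts added under `examples/tokenizer_demo/`), here is a toy, library-free sketch; the variable names mirror the pickled files, but this is not the library's implementation.

```python
# Toy sketch of a character-level vocabulary, NOT the gretel-synthetics
# implementation; the names mirror the char2idx.p / idx2char.p artifacts.
text = "Once upon a midnight dreary, while I pondered, weak and weary,"

# Assign each distinct character a stable integer id.
chars = sorted(set(text))
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for c, i in char2idx.items()}

# Encode to ids and decode back; the round trip is lossless.
ids = [char2idx[c] for c in text]
decoded = "".join(idx2char[i] for i in ids)
```

A character tokenizer like this needs no training corpus beyond the input text itself, which is why it is the simplest trainer the module offers.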
1 change: 1 addition & 0 deletions docs/index.rst
@@ -14,6 +14,7 @@ Modules
:maxdepth: 2

api/config.rst
api/tokenizers.rst
api/train.rst
api/generate.rst
api/batch.rst
29 changes: 27 additions & 2 deletions examples/dataframe_batch.ipynb
@@ -41,6 +41,15 @@
"source_df = pd.read_csv(\"https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/notebooks/google_marketplace_analytics.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source_df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -64,7 +73,7 @@
"\n",
"config_template = {\n",
" \"max_line_len\": 2048,\n",
-    " \"vocab_size\": 20000,\n",
+    " \"vocab_size\": 200000,\n",
" \"field_delimiter\": \",\",\n",
" \"overwrite\": True,\n",
" \"checkpoint_dir\": checkpoint_dir\n",
@@ -119,7 +128,16 @@
"# Next, we can trigger all batched models to create output. This loops over each model and will attempt to generate\n",
"# ``gen_lines`` valid lines for each model. This method returns a dictionary of bools that is indexed by batch number\n",
"# and tells us if, for each batch, we were able to generate the requested number of valid lines\n",
-    "status = batcher.generate_all_batch_lines()"
+    "status = batcher.generate_all_batch_lines(num_lines=2000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher.batches[2].gen_data_stream.getvalue()"
]
},
{
@@ -188,6 +206,13 @@
"source": [
"read_batch.batches_to_df()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
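The notebook comment in the hunk above notes that `generate_all_batch_lines` returns a dict of bools keyed by batch number, indicating whether each batch produced the requested number of valid lines. A minimal sketch of acting on such a status dict (the dict literal here is invented for illustration, not actual output):

```python
# Hypothetical value shaped like generate_all_batch_lines()'s return:
# batch number -> whether the requested count of valid lines was produced.
status = {0: True, 1: True, 2: False}

# Collect the batches that fell short so they can be retried or inspected.
failed = sorted(b for b, ok in status.items() if not ok)
if failed:
    print(f"batches needing regeneration: {failed}")
```

Inspecting per-batch status this way pairs naturally with the `batcher.batches[2].gen_data_stream.getvalue()` call the notebook adds for debugging a single batch's raw output.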
6 changes: 6 additions & 0 deletions examples/poe.txt
@@ -0,0 +1,6 @@
Once upon a midnight dreary, while I pondered, weak and weary,
Over many a quaint and curious volume of forgotten lore
While I nodded, nearly napping, suddenly there came a tapping,
As of some one gently rapping, rapping at my chamber door.
Tis some visitor, I muttered, tapping at my chamber door
Only this and nothing more.
109 changes: 109 additions & 0 deletions examples/tensorflow/batch-df-char-tokenizer.ipynb
@@ -0,0 +1,109 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from gretel_synthetics.batch import DataFrameBatch, PATH_HOLDER\n",
"from gretel_synthetics.config import TensorFlowConfig\n",
"from gretel_synthetics.tokenizers import CharTokenizerTrainer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"checkpoint_dir = str(Path.cwd() / \"checkpoints\")\n",
"\n",
"config = TensorFlowConfig(\n",
" field_delimiter=\",\",\n",
" overwrite=True,\n",
" checkpoint_dir=checkpoint_dir,\n",
" input_data_path=PATH_HOLDER\n",
")\n",
"\n",
"df = pd.read_csv(\"https://gretel-public-website.s3-us-west-2.amazonaws.com/tests/synthetics/data/USAdultIncome14K.csv\")\n",
"\n",
"df = df.sample(n=5000)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher = DataFrameBatch(\n",
" df=df,\n",
" config=config,\n",
" tokenizer=CharTokenizerTrainer(config=config)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher.create_training_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher.train_all_batches()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher.generate_all_batch_lines(parallelism=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batcher.batches_to_df()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
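The notebook above trains one model per batch of DataFrame columns via `DataFrameBatch`, then reassembles results with `batches_to_df()`. The diff does not show how columns are partitioned, so the sketch below is a toy illustration of the general idea only; the batch size of 15 is an assumed value, not a documented default.

```python
# Toy sketch of splitting a wide DataFrame's columns into numbered batches,
# one model per batch, in the spirit of DataFrameBatch. The batch_size of 15
# is an ASSUMPTION for illustration, not a value taken from the library.
columns = [f"col_{i}" for i in range(34)]
batch_size = 15

batches = {
    n: columns[start:start + batch_size]
    for n, start in enumerate(range(0, len(columns), batch_size))
}
# 34 columns split this way yields two full batches and one batch of 4.
```

Keying batches by number matches how the earlier notebook indexes them (`batcher.batches[2]`) and how the generation status dict is keyed.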
115 changes: 115 additions & 0 deletions examples/tensorflow/simple-character-model.ipynb
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a synthetic model and generate data using TensorFlow and a simple character tokenizer\n",
"\n",
"#!pip install -Uqq \"gretel-synthetics>=0.15.0\"\n",
"\n",
"from gretel_synthetics.config import TensorFlowConfig\n",
"from gretel_synthetics.tokenizers import CharTokenizerTrainer\n",
"from gretel_synthetics.train import train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DATA = \"https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/uber_scooter_rides_1day.csv\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"config = TensorFlowConfig(\n",
" input_data_path=DATA,\n",
" checkpoint_dir=str(Path.cwd() / \"checkpoints\"),\n",
" field_delimiter=\",\",\n",
" overwrite=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = CharTokenizerTrainer(config=config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train(config, tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now we can generate with this config\n",
"\n",
"from gretel_synthetics.generate import generate_text\n",
"\n",
"def validate_record(line):\n",
" rec = line.split(\", \")\n",
" if len(rec) == 6:\n",
" float(rec[5])\n",
" float(rec[4])\n",
" float(rec[3])\n",
" float(rec[2])\n",
" int(rec[0])\n",
" else:\n",
" raise Exception('record not 6 parts')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for line in generate_text(config, line_validator=validate_record, num_lines=1000):\n",
" print(line)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
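The `validate_record` cell in the notebook above rejects any generated line that does not parse as six comma-separated fields with numeric values in the checked positions. The same logic as a standalone function, for readers who want to test it outside the notebook (the sample line is invented, not taken from the dataset):

```python
# Standalone version of the notebook's line validator: a record must have
# six ", "-separated fields, with the indicated fields parsing as numbers.
def validate_record(line: str) -> None:
    rec = line.split(", ")
    if len(rec) != 6:
        raise ValueError("record not 6 parts")
    int(rec[0])      # first field must be an integer
    float(rec[2])    # fields 2-5 must parse as floats
    float(rec[3])
    float(rec[4])
    float(rec[5])

# Example line, invented for illustration:
validate_record("7, scooter, 1.2, 3.4, 5.6, 7.8")
```

`generate_text` treats a validator that raises as marking the line invalid, so any exception type works; raising `ValueError` for the length check simply keeps the failure mode consistent with the `int()`/`float()` parse failures.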
Binary file added examples/tokenizer_demo/char2idx.p
Binary file not shown.
Binary file added examples/tokenizer_demo/idx2char.p
Binary file not shown.
Binary file added examples/tokenizer_demo/m.model
Binary file not shown.
