In [1]:
import pandas as pd
import redis
from sqlalchemy import Column, String, JSON, Integer, Boolean
from sqlalchemy import inspect
from sqlalchemy import text
from sqlalchemy.engine import create_engine

from datapipe.compute import Catalog, Pipeline, Table, run_pipeline
from datapipe.core_steps import BatchTransform, UpdateExternalTable
from datapipe.datatable import DataStore, DBConn
from datapipe.store.database import TableStoreDB
from datapipe.store.redis import RedisStore
from datapipe.types import data_to_index

In [2]:
# Entities:
# DBConn - database connection (a connection URL string or the connection object itself; question: does DBConn accept only SQLAlchemy connections?)
# DataStore - storage for the meta tables (the entity where hashes and information about changes are stored)
# Table - data table (could be in a DB or in a file; question: are all data sources described as a Table?)
# Catalog - entity that describes the data tables
# Store - entity with methods for a particular kind of storage; could be TableStoreDB, TableDataSingleFileStore, a folder, etc.
# TableStoreDB - table data stored in a database.
# Table schemas are described in SQLAlchemy terms and classes

In [3]:
# NOTE: these are throwaway credentials for a local test database; do not
# reuse this pattern for real secrets.
DBCONN = "postgresql://postgres:testpass@localhost:5432/test"
engine = create_engine(DBCONN)

# Reset the database to a clean slate by dropping and recreating `public`.
# Engine.execute() with a raw SQL string is deprecated in SQLAlchemy 1.4 and
# removed in 2.0; engine.begin() + text() works on both and commits when the
# `with` block exits without an error.
with engine.begin() as conn:
    conn.execute(text('''
DROP SCHEMA public CASCADE;
CREATE SCHEMA public;'''))

# two separate connections needed for datastore and main tables
dbconn = DBConn(DBCONN)
meta_dbconn = DBConn(DBCONN)
ds = DataStore(meta_dbconn)

# Redis client created with the library's default connection settings;
# decode_responses=False is passed explicitly.
redis_conn_mid = redis.Redis(decode_responses=False)

Attempting to instrument while already instrumented


# Start Pipe

In [4]:
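# NOTE: disabled earlier experiment (Redis store as the pipeline input). It
# references `redis_conn`, which is not defined in the cells above (only
# `redis_conn_mid` is), so it will not run as-is if uncommented.
# engine.execute(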
#     """
#     drop table if exists pipeline_start_meta;
#     drop table if exists pipeline_end_meta;
#     drop table if exists test_output;
#     """
# )
# keys = redis_conn.keys()
# if keys:
#     redis_conn.delete(*keys)

# INPUT_SCHEMA = [
#     Column("user_id", String, primary_key=True),
#     Column("click_count", Integer)
# ]

# OUTPUT_SCHEMA = [
#     Column("user_id", String, primary_key=True),
#     Column("click_count", Integer),
#     Column("click_count_doubled", Integer)
# ]

# catalog = Catalog({
#     "pipeline_start": Table(store=RedisStore(redis_conn, "test_input", INPUT_SCHEMA)),
#     "pipeline_end": Table(store=TableStoreDB(dbconn, "test_output", OUTPUT_SCHEMA))
# })

# def double(df):
#     df['click_count_doubled'] = df['click_count'] * 2
#     return df

# pipeline = Pipeline([
#     UpdateExternalTable('pipeline_start'),
#     BatchTransform(
#         double,
#         inputs=["pipeline_start"],
#         outputs=["pipeline_end"],
#     )
# ])

# run_pipeline(ds, catalog, pipeline)
# input_dt = catalog.get_datatable(ds, "pipeline_start")
# output_dt = catalog.get_datatable(ds, "pipeline_end")
# print(input_dt.get_data())
# print(output_dt.get_data())

# redis_conn.set('a', 1)
# run_pipeline(ds, catalog, pipeline)
# print(input_dt.get_data())
# print(output_dt.get_data())

# redis_conn.delete('a')
# run_pipeline(ds, catalog, pipeline)
# print(input_dt.get_data())
# print(output_dt.get_data())

# MID PIPE

In [5]:
# Column layouts for the three pipeline stages. Every stage is keyed by
# `user_id`; each schema gets its own Column objects.

# Raw input: one click counter per user.
INPUT_SCHEMA = [
    Column("user_id", String, primary_key=True),
    Column("click_count", Integer),
]

# Intermediate stage: holds only the doubled counter.
INTERM_SCHEMA = [
    Column("user_id", String, primary_key=True),
    Column("click_count_doubled", Integer),
]

# Final output: the doubled counter plus its square.
OUTPUT_SCHEMA = [
    Column("user_id", String, primary_key=True),
    Column("click_count_doubled", Integer),
    Column("click_count_doubled_squared", Integer),
]

In [6]:
# Maps each logical table name used by the pipeline to its backing store:
# the input and output tables live in the database, the intermediate one in Redis.
# NOTE(review): the disabled experiment earlier in this notebook calls
# RedisStore(conn, name, schema) with a table name as the second argument,
# while this call passes only (conn, schema) — confirm the expected signature.
catalog = Catalog(
    {
        "pipeline_start": Table(
            store=TableStoreDB(dbconn, "test_input", INPUT_SCHEMA),
        ),
        "pipeline_mid": Table(
            store=RedisStore(redis_conn_mid, INTERM_SCHEMA),
        ),
        "pipeline_end": Table(
            store=TableStoreDB(dbconn, "test_output", OUTPUT_SCHEMA),
        ),
    }
)

In [7]:
def double(df: pd.DataFrame) -> pd.DataFrame:
    """Replace `click_count` with a `click_count_doubled` column.

    Args:
        df: frame containing a numeric `click_count` column.

    Returns:
        A new DataFrame with `click_count_doubled` (= `click_count` * 2) added
        and `click_count` removed. The input frame is left untouched, so
        re-running the transform on the same batch gives the same result.

    Raises:
        KeyError: if `df` has no `click_count` column.
    """
    # Build a new frame rather than mutating the caller's batch in place.
    doubled = df.assign(click_count_doubled=df['click_count'] * 2)
    return doubled.drop(columns=['click_count'])

def double_and_square(df: pd.DataFrame) -> pd.DataFrame:
    """Add a `click_count_doubled_squared` column.

    Args:
        df: frame containing a numeric `click_count_doubled` column.

    Returns:
        A new DataFrame with every input column plus
        `click_count_doubled_squared` (= `click_count_doubled` ** 2).
        The input frame is left untouched.

    Raises:
        KeyError: if `df` has no `click_count_doubled` column.
    """
    # assign() returns a copy, so the caller's batch is not modified in place.
    return df.assign(click_count_doubled_squared=df['click_count_doubled'] ** 2)

In [8]:
# Pipeline definition, in order:
#   1. UpdateExternalTable on "pipeline_start" (presumably picks up rows changed
#      outside the pipeline — confirm against the datapipe docs);
#   2. double:            pipeline_start -> pipeline_mid;
#   3. double_and_square: pipeline_mid   -> pipeline_end.
pipeline = Pipeline(
    [
        UpdateExternalTable("pipeline_start"),
        BatchTransform(double, inputs=["pipeline_start"], outputs=["pipeline_mid"]),
        BatchTransform(double_and_square, inputs=["pipeline_mid"], outputs=["pipeline_end"]),
    ]
)

In [9]:
# First run: execute the pipeline, then print the contents of the input and
# output tables.
run_pipeline(ds, catalog, pipeline)

input_dt = catalog.get_datatable(ds, "pipeline_start")
output_dt = catalog.get_datatable(ds, "pipeline_end")

# One print call with a newline separator emits the same text as two prints.
print(input_dt.get_data(), output_dt.get_data(), sep="\n")

1it [00:00, 23.26it/s]

Empty DataFrame
Columns: [user_id, click_count]
Index: []
Empty DataFrame
Columns: [user_id, click_count_doubled, click_count_doubled_squared]
Index: []





In [12]:
# Insert one row straight into the input table, re-run the pipeline, and
# print the contents of all three tables.
# Engine.execute() with a raw SQL string is deprecated in SQLAlchemy 1.4 and
# removed in 2.0; engine.begin() + text() works on both and commits when the
# `with` block exits without an error.
with engine.begin() as conn:
    conn.execute(text('''
insert into test_input values ('abc', 123)
'''))

run_pipeline(ds, catalog, pipeline)

input_dt = catalog.get_datatable(ds, "pipeline_start")
mid_dt = catalog.get_datatable(ds, 'pipeline_mid')
output_dt = catalog.get_datatable(ds, "pipeline_end")

print(input_dt.get_data())
print(mid_dt.get_data())
print(output_dt.get_data())

1it [00:00,  1.46it/s]
100%|██████████| 1/1 [00:00<00:00,  7.85it/s]
100%|██████████| 1/1 [00:00<00:00,  7.65it/s]

  user_id  click_count_doubled
0     abc                  246
  user_id  click_count
0     abc          123
  user_id  click_count_doubled  click_count_doubled_squared
0     abc                  246                        60516





In [13]:
# Delete the row straight from the input table, re-run the pipeline, and
# print the contents of all three tables.
# Engine.execute() with a raw SQL string is deprecated in SQLAlchemy 1.4 and
# removed in 2.0; engine.begin() + text() works on both and commits when the
# `with` block exits without an error.
with engine.begin() as conn:
    conn.execute(text('''
delete from test_input where user_id = 'abc'
'''))

run_pipeline(ds, catalog, pipeline)

input_dt = catalog.get_datatable(ds, "pipeline_start")
mid_dt = catalog.get_datatable(ds, 'pipeline_mid')
output_dt = catalog.get_datatable(ds, "pipeline_end")

print(input_dt.get_data())
print(mid_dt.get_data())
print(output_dt.get_data())

1it [00:00,  2.04it/s]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
100%|██████████| 1/1 [00:00<00:00,  2.45it/s]

Empty DataFrame
Columns: [user_id, click_count_doubled]
Index: []
Empty DataFrame
Columns: [user_id, click_count]
Index: []
Empty DataFrame
Columns: [user_id, click_count_doubled, click_count_doubled_squared]
Index: []



