In [3]:
# Report the pip that belongs to the running kernel. `%pip` targets the
# kernel's interpreter, whereas `!pip` uses whichever pip is first on PATH.
%pip --version

pip 21.1.1 from /opt/conda/lib/python3.8/site-packages/pip (python 3.8)


In [4]:
# Upgrade pip inside the kernel's own environment (`%pip`, not `!pip`, so the
# upgrade cannot land in a different interpreter).
%pip install --upgrade pip

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 6.8 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.1
    Uninstalling pip-21.1.1:
      Successfully uninstalled pip-21.1.1
Successfully installed pip-22.3.1


In [None]:
# Install the notebook's dependencies into the kernel's environment.
# `%pip` guarantees the packages are importable from this kernel.
%pip install -r requirements.txt

In [43]:
import os
import pprint
import tempfile

import absl
import tensorflow as tf
from tfx import v1 as tfx
import tfx.dsl as dsl
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.examples.custom_components.presto_example_gen.proto import presto_config_pb2
from tfx.examples.custom_components.presto_example_gen.presto_component.component import PrestoExampleGen

from tfx.components.example_gen.component import FileBasedExampleGen
from tfx.components.example_gen.custom_executors import parquet_executor
from tfx.proto import example_gen_pb2, transform_pb2, trainer_pb2
from tfx.dsl.components.base import executor_spec
import tensorflow_model_analysis as tfma
from tensorflow_data_validation.utils.display_util import get_statistics_html
from tensorflow_data_validation.utils.stats_util import load_statistics


# Stop TensorFlow's logger from handing its records to ancestor loggers
# (presumably to avoid duplicated log lines in cell output).
tf_logger = tf.get_logger()
tf_logger.propagate = False

# Shared pretty-printer for inspecting nested structures.
pp = pprint.PrettyPrinter()

In [3]:
# Only surface errors from absl logging so cell output stays readable.
absl.logging.set_verbosity(absl.logging.ERROR)

# Record the library versions used for this run.
for _label, _version in (('TensorFlow', tf.__version__), ('TFX', tfx.__version__)):
    print(f'{_label} version: {_version}')

TensorFlow version: 2.5.3
TFX version: 1.2.0


In [4]:
# Bucket and pipeline identifiers; every location below is derived from them.
BUCKET_NAME = "ml-data"
PIPELINE_NAME = 'fraud_detection_tfx_pipeline'

# Root under which the pipeline's artifacts are written.
PIPELINE_ROOT = "s3://{}/{}".format(BUCKET_NAME, PIPELINE_NAME)

# Local path of the metadata database file, relative to the working directory.
METADATA_PATH = os.path.join(
    '.',
    "resources/{}/tfx_metadata".format(PIPELINE_NAME),
    'metadata.db',
)

# Location of the source table's files in the bucket.
SRC_FOLDER = "s3://{}/tb_fraud".format(BUCKET_NAME)

In [5]:
# Beam pipeline options, keyed by a runner label. Pick one list and pass it as
# `beam_pipeline_args` when building the TFX context/pipeline.
_parallelism = 1

# Credentials for the S3-compatible object store. The defaults keep the
# original local-development values; set S3_ACCESS_KEY_ID /
# S3_SECRET_ACCESS_KEY in the environment to override them without editing
# the notebook.
_s3_access_key_id = os.environ.get("S3_ACCESS_KEY_ID", "minioadmin")
_s3_secret_access_key = os.environ.get("S3_SECRET_ACCESS_KEY", "minioadmin")


def _s3_args(endpoint_url):
    """Return the Beam S3 option flags for the given endpoint URL.

    Args:
        endpoint_url: URL of the S3-compatible endpoint.

    Returns:
        A new list of command-line style option strings, in the same order
        the runners originally listed them.
    """
    return [
        f"--s3_endpoint_url={endpoint_url}",
        f"--s3_access_key_id={_s3_access_key_id}",
        f"--s3_secret_access_key={_s3_secret_access_key}",
        "--s3_disable_ssl",
        "--s3_verify=False",
    ]


# NOTE(review): the Docker SDK images below are Python 3.6 images while the
# pip output in this notebook reports Python 3.8 -- confirm the worker image
# matches the kernel's Python before using the Spark runners.
_beam_pipeline_args_by_runner = {
    'DirectRunner': [
        '--direct_running_mode=multi_threading',
        f'--direct_num_workers={_parallelism:d}',
        *_s3_args("http://storage:9000/"),
    ],
    'SparkRunner': [
        '--runner=PortableRunner',
        '--job_endpoint=localhost:8099',
        '--environment_type=DOCKER',
        '--environment_config=apache/beam_python3.6_sdk:2.29.0-custom',
        '--cache_disabled',
        '--spark_submit_uber_jar',
        '--sdk_worker_parallelism=1',
        *_s3_args("http://172.17.0.1:9000/"),
    ],
    'SparkDirectRunner': [
        '--runner=SparkRunner',
        '--spark_submit_uber_jar',
        # Fixed: the original value carried a trailing space, which would have
        # been passed through as part of the master URL.
        '--spark_master_url=spark://localhost:7077',
        '--spark_rest_url=http://localhost:6066',
        '--environment_type=DOCKER',
        '--environment_config=apachebeam/python3.6_sdk',
        '--cache_disabled',
        '--sdk_worker_parallelism=1',
    ],
}

In [6]:
# Build the interactive TFX context used by every `context.run(...)` and
# `context.show(...)` call in this notebook, executing Beam jobs with the
# DirectRunner option set.
_beam_pipeline_args = _beam_pipeline_args_by_runner["DirectRunner"]

# SQLite can create the database file but not missing parent directories, so
# make sure the metadata folder exists before the store is opened (otherwise
# a fresh checkout fails here).
os.makedirs(os.path.dirname(METADATA_PATH), exist_ok=True)

context = InteractiveContext(
    pipeline_name=PIPELINE_NAME,
    pipeline_root=PIPELINE_ROOT,
    metadata_connection_config=tfx.orchestration.metadata.sqlite_metadata_connection_config(METADATA_PATH),
    beam_pipeline_args=_beam_pipeline_args,
)

In [7]:
# Connection settings and the query that selects the rows to ingest.
_query = 'SELECT * FROM hive.ml_fraud_detection_db.tb_fraud'
_presto_config = presto_config_pb2.PrestoConnConfig(
    host="trino-coordinator",
    port=8080,
    user="admin",
)

# Run the query-based example generator through the interactive context.
example_component = PrestoExampleGen(_presto_config, query=_query)
loader_component_response = context.run(example_component)



In [8]:
# Compute statistics over the examples produced by the example generator.
_examples_channel = example_component.outputs['examples']
stats_component = tfx.components.StatisticsGen(examples=_examples_channel)
stats_component_response = context.run(stats_component)



In [56]:
# Render the statistics artifact produced by StatisticsGen.
# (Fixed: the original line had an unbalanced extra ")" and did not parse.)
context.show(stats_component.outputs['statistics'])

In [54]:
# NOTE(review): `train_stats` is not defined in any visible cell, so this cell
# fails on a fresh kernel (Restart & Run All). Presumably it was loaded in a
# since-deleted cell -- restore that definition or remove this cell.
train_stats.datasets

[]

In [10]:
# Infer a schema from the statistics computed by StatisticsGen.
_statistics_channel = stats_component.outputs["statistics"]
infer_schema_component = tfx.components.SchemaGen(statistics=_statistics_channel)
schema_component_response = context.run(infer_schema_component)

In [11]:
# Display the inferred schema (feature types, presence, valency, domains).
_schema_channel = infer_schema_component.outputs['schema']
context.show(_schema_channel)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'c1',FLOAT,optional,single,-
'c13',FLOAT,optional,single,-
'c5',FLOAT,optional,single,-
'card1',FLOAT,optional,single,-
'card2',FLOAT,optional,single,-
'card3',FLOAT,optional,single,-
'card4',STRING,optional,single,'card4'
'card5',FLOAT,optional,single,-
'card6',STRING,optional,single,'card6'
'd1',FLOAT,optional,single,-


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'card4',"'american express', 'discover', 'mastercard', 'visa'"
'card6',"'charge card', 'credit', 'debit', 'debit or credit'"
'm2',"'F', 'T'"
'm3',"'F', 'T'"
'm4',"'M0', 'M1', 'M2'"
'm5',"'F', 'T'"
'm6',"'F', 'T'"
'm7',"'F', 'T'"
'm8',"'F', 'T'"
'm9',"'F', 'T'"


In [13]:
# Run the Transform component over the generated examples, using the inferred
# schema and the preprocessing code in the referenced module file.
_transform_inputs = dict(
    examples=example_component.outputs["examples"],
    schema=infer_schema_component.outputs["schema"],
    module_file="python_modules/standardscalar_preprocessor.py",
)
transform_component = tfx.components.Transform(**_transform_inputs)
transform_component_response = context.run(transform_component)

In [30]:
# Display the transformed-examples artifact produced by Transform.
_transformed_examples_channel = transform_component.outputs['transformed_examples']
context.show(_transformed_examples_channel)

In [14]:
# Step budgets handed to the Trainer for training and evaluation.
_train_args = trainer_pb2.TrainArgs(num_steps=10)
_eval_args = trainer_pb2.EvalArgs(num_steps=5)

# Extra settings forwarded to the training module; how "epochs" is used is
# decided by python_modules/trainer_module.py (not visible here).
_trainer_custom_config = {"epochs": 13}

# Train on the transformed examples with the transform graph and schema.
trainer_component = tfx.components.Trainer(
    module_file='python_modules/trainer_module.py',
    examples=transform_component.outputs['transformed_examples'],
    transform_graph=transform_component.outputs['transform_graph'],
    schema=infer_schema_component.outputs['schema'],
    train_args=_train_args,
    eval_args=_eval_args,
    custom_config=_trainer_custom_config,
)

# Caching is disabled so the Trainer always re-executes.
trainer_component_response = context.run(trainer_component, enable_cache=False)

fn_args = FnArgs(working_dir=None, train_files=['s3://ml-data/fraud_detection_tfx_pipeline/Transform/transformed_examples/4/Split-train/*'], eval_files=['s3://ml-data/fraud_detection_tfx_pipeline/Transform/transformed_examples/4/Split-eval/*'], train_steps=10, eval_steps=5, schema_path='s3://ml-data/fraud_detection_tfx_pipeline/SchemaGen/schema/3/schema.pbtxt', schema_file='s3://ml-data/fraud_detection_tfx_pipeline/SchemaGen/schema/3/schema.pbtxt', transform_graph_path='s3://ml-data/fraud_detection_tfx_pipeline/Transform/transform_graph/4', transform_output='s3://ml-data/fraud_detection_tfx_pipeline/Transform/transform_graph/4', data_accessor=DataAccessor(tf_dataset_factory=<function get_tf_dataset_factory_from_artifact.<locals>.dataset_factory at 0x7ff1b9812670>, record_batch_factory=<function get_record_batch_factory_from_artifact.<locals>.record_batch_factory at 0x7ff1b9812310>, data_view_decode_fn=None), serving_model_dir='s3://ml-data/fraud_detection_tfx_pipeline/Trainer/model/6/F

In [20]:
# Location of the trained model's "Format-Serving" sub-directory, built from
# the Trainer's model artifact URI.
# (Alternative once a Pusher component exists in this notebook:
#  pusher_component.outputs['pushed_model'].get()[0].uri)
_model_artifact = trainer_component.outputs['model'].get()[0]
push_uri = "{}/Format-Serving".format(_model_artifact.uri)
print(push_uri)

s3://ml-data/fraud_detection_tfx_pipeline/Trainer/model/65/Format-Serving
