# Import Data & Libraries

In [43]:
# Download the cardiovascular-disease dataset archive with the Kaggle CLI.
! kaggle datasets download colewelkins/cardiovascular-disease

^C


Dataset URL: https://www.kaggle.com/datasets/colewelkins/cardiovascular-disease
License(s): DbCL-1.0
cardiovascular-disease.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

# Unpack every member of the downloaded Kaggle archive into the "data" folder.
with zipfile.ZipFile("cardiovascular-disease.zip", mode='r') as archive:
    archive.extractall(path="data")

print("Files extracted to folder data")

Files extracted to folder data


In [5]:
import pandas as pd
# Load the extracted CSV into a DataFrame.
df = pd.read_csv('data/cardio_data_processed.csv')
# Preview the first ten rows as the cell output.
df.head(10)

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,bmi,bp_category,bp_category_encoded
0,2,168,62.0,110,80,1,1,0,0,1,0,50,21.96712,Hypertension Stage 1,Hypertension Stage 1
1,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679,Hypertension Stage 2,Hypertension Stage 2
2,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805,Hypertension Stage 1,Hypertension Stage 1
3,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479,Hypertension Stage 2,Hypertension Stage 2
4,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177,Normal,Normal
5,1,151,67.0,120,80,2,2,0,0,0,0,60,29.384676,Hypertension Stage 1,Hypertension Stage 1
6,1,157,93.0,130,80,3,1,0,0,1,0,60,37.729725,Hypertension Stage 1,Hypertension Stage 1
7,2,178,95.0,130,90,3,3,0,0,1,1,61,29.983588,Hypertension Stage 1,Hypertension Stage 1
8,1,158,71.0,110,70,1,1,0,0,1,0,48,28.440955,Normal,Normal
9,1,164,68.0,110,60,1,1,0,0,0,0,54,25.28257,Normal,Normal


In [None]:
# Summarise the columns, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68205 entries, 0 to 68204
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   68205 non-null  int64  
 1   age                  68205 non-null  int64  
 2   gender               68205 non-null  int64  
 3   height               68205 non-null  int64  
 4   weight               68205 non-null  float64
 5   ap_hi                68205 non-null  int64  
 6   ap_lo                68205 non-null  int64  
 7   cholesterol          68205 non-null  int64  
 8   gluc                 68205 non-null  int64  
 9   smoke                68205 non-null  int64  
 10  alco                 68205 non-null  int64  
 11  active               68205 non-null  int64  
 12  cardio               68205 non-null  int64  
 13  age_years            68205 non-null  int64  
 14  bmi                  68205 non-null  float64
 15  bp_category          68205 non-null 

In [None]:
# Drop the row identifier and the original `age` column (`age_years` is kept).
# errors='ignore' keeps this cell re-runnable: the cleaned frame is later
# written back to data/cardio_data_processed.csv, so a fresh read of that file
# no longer contains these columns and a strict drop would raise KeyError.
df = df.drop(columns=['id', 'age'], errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68205 entries, 0 to 68204
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               68205 non-null  int64  
 1   height               68205 non-null  int64  
 2   weight               68205 non-null  float64
 3   ap_hi                68205 non-null  int64  
 4   ap_lo                68205 non-null  int64  
 5   cholesterol          68205 non-null  int64  
 6   gluc                 68205 non-null  int64  
 7   smoke                68205 non-null  int64  
 8   alco                 68205 non-null  int64  
 9   active               68205 non-null  int64  
 10  cardio               68205 non-null  int64  
 11  age_years            68205 non-null  int64  
 12  bmi                  68205 non-null  float64
 13  bp_category          68205 non-null  object 
 14  bp_category_encoded  68205 non-null  object 
dtypes: float64(2), int64(11), object(2)


In [44]:
# Persist the cleaned frame (without the index column).
# NOTE(review): this is the same path the frame was read from, so the
# original CSV is overwritten; re-extract the zip to recover the raw file.
df.to_csv('data/cardio_data_processed.csv', index=False)

In [42]:
# Count missing values per column.
df.isna().sum()

gender                 0
height                 0
weight                 0
ap_hi                  0
ap_lo                  0
cholesterol            0
gluc                   0
smoke                  0
alco                   0
active                 0
cardio                 0
age_years              0
bmi                    0
bp_category            0
bp_category_encoded    0
dtype: int64

In [16]:
import os
import zipfile
from typing import Text
import pandas as pd
from absl import logging
from tfx.orchestration import metadata, pipeline
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner
from modules import components

# Pipelines

## Set Pipeline Variables

In [17]:
# Name of the pipeline; also used as the sub-directory for its artifacts.
PIPELINE_NAME = "faizahmp-pipeline"

# --- Inputs: data directory and the module files handed to the components ---
DATA_ROOT = "data"
TRANSFORM_MODULE_FILE = "modules/cardiovaskular_transform.py"
TUNER_MODULE_FILE = "modules/cardiovaskular_tuner.py"
TRAINER_MODULE_FILE = "modules/cardiovaskular_trainer.py"

# --- Outputs: everything the pipeline writes lives under OUTPUT_BASE ---
OUTPUT_BASE = "outputs"
pipeline_root = os.path.join(OUTPUT_BASE, PIPELINE_NAME)
metadata_path = os.path.join(pipeline_root, "metadata.sqlite")
serving_model_dir = os.path.join(OUTPUT_BASE, 'serving_model')


## Initialize local pipeline

In [18]:
def init_local_pipeline(
    components, pipeline_root: Text
) -> pipeline.Pipeline:
    """
    Initialize a TFX pipeline with components and Beam direct-runner options.

    The pipeline name and the SQLite metadata location are read from the
    module-level ``PIPELINE_NAME`` and ``metadata_path`` variables.

    Args:
        components: list of TFX components
        pipeline_root: directory to save pipeline artifacts
    Returns:
        TFX pipeline with caching enabled
    """

    logging.info(f"Pipeline root set to: {pipeline_root}")
    beam_args = [
        "--direct_running_mode=multi_processing",
        # 0 = auto-detect based on the number of CPUs available
        # during execution time.
        # The flag needs exactly two leading dashes; with any other prefix
        # the option is not recognised and the setting has no effect.
        "--direct_num_workers=0",
    ]

    return pipeline.Pipeline(
        pipeline_name=PIPELINE_NAME,
        pipeline_root=pipeline_root,
        components=components,
        enable_cache=True,
        metadata_connection_config=metadata.sqlite_metadata_connection_config(
            metadata_path),
        beam_pipeline_args=beam_args
    )


In [19]:
logging.set_verbosity(logging.INFO)

from modules.components import init_components

# Build the pipeline components from the configuration defined above.
p_components = init_components({
    'data_dir': DATA_ROOT,
    'transform_module': TRANSFORM_MODULE_FILE,
    'tuner_module': TUNER_MODULE_FILE,
    'training_module': TRAINER_MODULE_FILE,
    'training_steps': 500,
    'eval_steps': 100,
    'serving_model_dir': serving_model_dir
})

# Bind the result to a name other than `pipeline`: that name is the imported
# tfx.orchestration.pipeline module, which init_local_pipeline needs for
# `pipeline.Pipeline`. Rebinding it would break any re-run of these cells.
local_pipeline = init_local_pipeline(p_components, pipeline_root)
BeamDagRunner().run(pipeline=local_pipeline)

Trial 20 Complete [00h 00m 08s]
val_binary_accuracy: 0.6850000023841858

Best val_binary_accuracy So Far: 0.7357812523841858
Total elapsed time: 00h 02m 39s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit
INFO:absl:Finished tuning... Tuner ID: tuner0
INFO:absl:Best HyperParameters: {'space': [{'class_name': 'Choice', 'config': {'name': 'num_layers', 'default': 1, 'conditions': [], 'values': [1, 2, 3], 'ordered': True}}, {'class_name': 'Int', 'config': {'name': 'dense_units', 'default': None, 'conditions': [], 'min_value': 16, 'max_value': 256, 'step': 16, 'sampling': 'linear'}}, {'class_name': 'Float', 'config': {'name': 'dropout_rate', 'default': 0.1, 'conditions': [], 'min_value': 0.1, 'max_value': 0.7, 'step': 0.1, 'sampling': 'linear'}}, {'class_name': 'Choice', 'config': {'name': 'learning_rate', 'default': 0.001, 'conditions': [], 'values': [0.001, 0.0001, 1e-05], 'ordered': True}}], 'values': {'num_layers': 2, 'dense_units': 80, 'dropout_rate': 0.1, 'learning_rate': 0.001}}
INFO:absl:Best Hyperparameters are written to outputs\faizahmp-pipeline\Tuner\best_hyperparameters\69\best_hyperparameters.txt.


Results summary
Results in outputs\faizahmp-pipeline\Tuner\.system\executor_execution\69\.temp\69\cardiovaskular_kt
Showing 10 best trials
Objective(name="val_binary_accuracy", direction="max")

Trial 15 summary
Hyperparameters:
num_layers: 2
dense_units: 80
dropout_rate: 0.1
learning_rate: 0.001
Score: 0.7357812523841858

Trial 11 summary
Hyperparameters:
num_layers: 2
dense_units: 144
dropout_rate: 0.2
learning_rate: 0.001
Score: 0.7298437356948853

Trial 13 summary
Hyperparameters:
num_layers: 2
dense_units: 96
dropout_rate: 0.6
learning_rate: 0.001
Score: 0.7281249761581421

Trial 02 summary
Hyperparameters:
num_layers: 3
dense_units: 64
dropout_rate: 0.4
learning_rate: 0.001
Score: 0.727343738079071

Trial 05 summary
Hyperparameters:
num_layers: 3
dense_units: 208
dropout_rate: 0.4
learning_rate: 0.001
Score: 0.7232812643051147

Trial 17 summary
Hyperparameters:
num_layers: 1
dense_units: 160
dropout_rate: 0.4
learning_rate: 0.001
Score: 0.7214062213897705

Trial 01 summary
Hyperp

INFO:absl:Tuner results are written to outputs\faizahmp-pipeline\Tuner\tuner_results\69\tuner_results.json.
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 69 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'best_hyperparameters': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Tuner\\best_hyperparameters\\69"
, artifact_type: name: "HyperParameters"
)], 'tuner_results': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Tuner\\tuner_results\\69"
, artifact_type: name: "TunerResults"
)]}) for execution 69
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Tuner is finished.
INFO:absl:node Trainer is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.trainer.component.Trainer"
    base_type: TRAIN
  }
  id: "Trainer"
}
contexts {
  contexts {
    type {
      name: "pipeline"
    }
    name {
      field_value {
        string_v

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 bp_category_xf (InputLayer)    [(None, 5)]          0           []                               
                                                                                                  
 bp_category_encoded_xf (InputL  [(None, 5)]         0           []                               
 ayer)                                                                                            
                                                                                                  
 gender_xf (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 height_xf (InputLayer)         [(None, 1)]          0           []                         

INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: outputs\faizahmp-pipeline\Trainer\model\70\Format-Serving\assets


INFO:tensorflow:Assets written to: outputs\faizahmp-pipeline\Trainer\model\70\Format-Serving\assets


You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


INFO:absl:Training complete. Model written to outputs\faizahmp-pipeline\Trainer\model\70\Format-Serving. ModelRun written to outputs\faizahmp-pipeline\Trainer\model_run\70
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 70 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'model': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Trainer\\model\\70"
, artifact_type: name: "Model"
base_type: MODEL
)], 'model_run': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Trainer\\model_run\\70"
, artifact_type: name: "ModelRun"
)]}) for execution 70
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Trainer is finished.
INFO:absl:node Evaluator is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.evaluator.component.Evaluator"
    base_type: EVALUATE
  }
  id: "Evaluator"
}
contexts {
  contexts {
    type {
      name: "pipeline"
    }
 



INFO:absl:The 'example_splits' parameter is not set, using 'eval' split.
INFO:absl:Evaluating model.
INFO:absl:udf_utils.get_fn {'fairness_indicator_thresholds': 'null', 'eval_config': '{\n  "metrics_specs": [\n    {\n      "metrics": [\n        {\n          "class_name": "AUC"\n        },\n        {\n          "class_name": "Precision"\n        },\n        {\n          "class_name": "Recall"\n        },\n        {\n          "class_name": "ExampleCount"\n        },\n        {\n          "class_name": "TruePositives"\n        },\n        {\n          "class_name": "FalsePositives"\n        },\n        {\n          "class_name": "TrueNegatives"\n        },\n        {\n          "class_name": "FalseNegatives"\n        },\n        {\n          "class_name": "BinaryAccuracy",\n          "threshold": {\n            "change_threshold": {\n              "absolute": 0.0001,\n              "direction": "HIGHER_IS_BETTER"\n            },\n            "value_threshold": {\n              "lower_bo



INFO:absl:Evaluation complete. Results written to outputs\faizahmp-pipeline\Evaluator\evaluation\71.
INFO:absl:Checking validation results.


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
INFO:absl:Blessing result True written to outputs\faizahmp-pipeline\Evaluator\blessing\71.
INFO:absl:Cleaning up stateless execution info.
INFO:absl:Execution 71 succeeded.
INFO:absl:Cleaning up stateful execution info.
INFO:absl:Publishing output artifacts defaultdict(<class 'list'>, {'blessing': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Evaluator\\blessing\\71"
, artifact_type: name: "ModelBlessing"
)], 'evaluation': [Artifact(artifact: uri: "outputs\\faizahmp-pipeline\\Evaluator\\evaluation\\71"
, artifact_type: name: "ModelEvaluation"
)]}) for execution 71
INFO:absl:MetadataStore with DB connection initialized
INFO:absl:node Evaluator is finished.
INFO:absl:node Pusher is running.
INFO:absl:Running launcher for node_info {
  type {
    name: "tfx.components.pusher.component.Pusher"
    base_type: DEPLOY
  }
  id: "Pusher"
}
contexts {
  contexts {
    type {
      name: "pipeline"
    }