In [1]:
# Define an Airflow DAG for model training
# Pipeline components:
# 1. Data ingestion
# 2. Data preprocessing
# 3. Model training
# 4. Model evaluation
# 5. Model deployment (not included in the training_model_pipeline DAG)

In [2]:
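# Airflow setup commands; run them in a terminal, not in the notebook
# (the webserver and scheduler are long-running, so start each in its own shell).
# NOTE: the admin/admin credentials below are for local experimentation only.
# airflow db init
# airflow users create --role Admin --username admin --email admin --firstname admin --lastname admin --password admin
# airflow webserver -p 8080
# airflow scheduler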

In [3]:
# Imports
import os
from datetime import datetime, timedelta
from airflow import DAG
from airflow.decorators import task, dag, task_group
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Create the data/iris.csv file
# This file will be used for training the model
# Download the iris dataset from the UCI Machine Learning Repository

# -p: do not fail if the data directory already exists
!mkdir -p data
# -O: save the download as data/iris.csv, overwriting any existing file
!wget -O data/iris.csv https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

--2022-12-06 14:13:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘data/iris.csv’


2022-12-06 14:13:28 (163 MB/s) - ‘data/iris.csv’ saved [4551/4551]



In [5]:
@dag(
    dag_id="training_model_pipeline",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
)
def training_model_pipeline():
    """Train and evaluate a random-forest classifier on the iris dataset.

    Task order: ingest_data -> preprocess_data -> train_test_split ->
    train_model -> evaluate_model.

    NOTE(review): the tasks hand pandas objects and a fitted scikit-learn
    model to each other through XCom. Whether that works depends on the XCom
    serialization configured for the deployment (for example pickling or a
    custom backend) -- confirm before running outside a local setup.
    """
    # Fixed seed so the split and the forest are reproducible between runs.
    random_seed = 42

    @task
    def ingest_data():
        """Read the raw iris CSV and attach column names."""
        print("Ingesting data")
        # The file has no header row. The relative path is resolved against
        # the working directory of the process that executes the task.
        df = pd.read_csv("data/iris.csv", header=None)
        df.columns = ["sepal_length", "sepal_width", "petal_length", "petal_width", "target"]
        return df

    @task
    def preprocess_data(df):
        """Return a copy of ``df`` without rows that contain missing values."""
        print("Preprocessing data")
        return df.dropna()

    # The Python function is deliberately NOT named ``train_test_split``: a
    # nested function with that name shadows scikit-learn's train_test_split
    # inside this pipeline, so the call in the body would hit the Airflow task
    # itself instead of the library function. ``task_id`` keeps the original
    # task name in Airflow.
    @task(task_id="train_test_split", multiple_outputs=True)
    def split_dataset(df):
        """Split ``df`` into train/test features and labels (80/20).

        Returns a dict with keys ``X_train``, ``X_test``, ``y_train`` and
        ``y_test``; ``multiple_outputs=True`` exposes each key separately.
        """
        print("Splitting data")
        features = df.drop("target", axis=1)
        labels = df["target"]
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=random_seed
        )
        return {
            "X_train": X_train,
            "X_test": X_test,
            "y_train": y_train,
            "y_test": y_test,
        }

    @task
    def train_model(X_train, y_train):
        """Fit a RandomForestClassifier on the training data and return it."""
        print("Training model")
        model = RandomForestClassifier(random_state=random_seed)
        model.fit(X_train, y_train)
        return model

    @task
    def evaluate_model(model, X_test, y_test):
        """Score ``model`` on the test data.

        Returns a dict with ``accuracy`` (float), ``report`` (text
        classification report) and ``matrix`` (confusion matrix as nested
        lists).
        """
        print("Evaluating model")
        y_pred = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, y_pred)),
            "report": classification_report(y_test, y_pred),
            # ndarray -> nested lists so the task result is plain Python data.
            "matrix": confusion_matrix(y_test, y_pred).tolist(),
        }

    # Build the DAG
    raw_df = ingest_data()
    clean_df = preprocess_data(raw_df)
    splits = split_dataset(clean_df)
    model = train_model(splits["X_train"], splits["y_train"])
    evaluate_model(model, splits["X_test"], splits["y_test"])


# Instantiate the DAG
dag = training_model_pipeline()

