# MLÖpse
The aim of the project is to simulate the real-world process of deploying machine learning models, using the concepts that we discussed in class.

# References
- https://medium.com/analytics-vidhya/manage-your-machine-learning-lifecycle-with-mlflow-in-python-d678d5f3c682
- https://towardsdatascience.com/experiment-tracking-with-mlflow-in-10-minutes-f7c2128b8f2c
- https://towardsdatascience.com/tracking-ml-experiments-using-mlflow-7910197091bb
- https://medium.com/hub-by-littlebigcode/mlops-how-mlflow-effortlessly-tracks-your-experiments-and-helps-you-compare-them-11da9be1fdb7

# Metadata

UTC:
Timestamp UTC seconds

Temperature[C]:
Air Temperature

Humidity[%]:
Air Humidity

TVOC[ppb]:
Total Volatile Organic Compounds; measured in parts per billion

eCO2[ppm]:
CO2 equivalent concentration; calculated from different values like TVOC

Raw H2:
Raw molecular hydrogen; not compensated (Bias, temperature, etc.)

Raw Ethanol:
Raw ethanol gas

Pressure[hPa]:
Air Pressure

PM1.0:
Particulate matter size < 1.0 µm (PM1.0). 1.0 µm < 2.5 µm (PM2.5)

PM2.5:
Particulate matter size < 1.0 µm (PM1.0). 1.0 µm < 2.5 µm (PM2.5)

NC0.5:
Number concentration of particulate matter. This differs from PM because NC gives the actual number of particles in the air. The raw NC is also classified by the particle size: < 0.5 µm (NC0.5); 0.5 µm < 1.0 µm (NC1.0); 1.0 µm < 2.5 µm (NC2.5);

NC1.0:
Number concentration of particulate matter. This differs from PM because NC gives the actual number of particles in the air. The raw NC is also classified by the particle size: < 0.5 µm (NC0.5); 0.5 µm < 1.0 µm (NC1.0); 1.0 µm < 2.5 µm (NC2.5);

NC2.5:
Number concentration of particulate matter. This differs from PM because NC gives the actual number of particles in the air. The raw NC is also classified by the particle size: < 0.5 µm (NC0.5); 0.5 µm < 1.0 µm (NC1.0); 1.0 µm < 2.5 µm (NC2.5);

CNT:
Sample counter

Fire Alarm (Target):
Ground truth is "1" if a fire is there

# Imports

In [None]:
# basics
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
# scikit-learn
colors = sns.cubehelix_palette(n_colors=10, start=2.6, rot=0, dark=0.2, light=0.8)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay
from imblearn.over_sampling import SMOTE
# mlflow
import mlflow
# great expectations
import great_expectations as gx
# nannyml
import nannyml as nml
# termcolor
from termcolor import colored

# MLFlow Experiment

FIRST: 
1. open ***mlflow ui*** via terminal

In [None]:
# Name of the MLflow experiment that groups every run of this notebook
experiment_name = "MLOps Project"

# set_experiment() activates the experiment and creates it first when it does
# not exist yet, so no create/except fallback is required. Calling it directly
# lets genuine failures (e.g. an unreachable tracking store) surface instead of
# being swallowed by a catch-all handler.
mlflow.set_experiment(experiment_name)

In [None]:
# get experiment id
experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# Load Data

In [None]:
dataframe = pd.read_csv('Data/smoke_detection.csv')

In [None]:
dataframe.head(40)

# Split Data

To simulate multiple datasets, we order the dataset by the *UTC* column and split it into 3 equal parts.

In [None]:
def split_dataframe(dataframe: pd.DataFrame) -> tuple:
    """
    Split a dataframe into three consecutive parts ordered by the UTC column.

    Input: dataframe containing a 'UTC' column
    Output: tuple (part_1, part_2, part_3); the first two parts hold
            int(len / 3) rows each, the last part holds all remaining rows
    """
    # sort by the UTC timestamp and renumber the rows from 0
    ordered = dataframe.sort_values(by='UTC').reset_index(drop=True)

    # row positions at which the ordered frame is cut
    part_size = int(len(ordered) / 3)
    first_cut, second_cut = part_size, part_size * 2

    return (
        ordered.iloc[:first_cut],
        ordered.iloc[first_cut:second_cut],
        ordered.iloc[second_cut:],
    )

In [None]:
# pandas dataframes
df1, df2, df3 = split_dataframe(dataframe)

In [None]:
for i in [df1, df2, df3]:
    print(i.shape)

# EDA

In [None]:
print(f'Rows: {df1.shape[0]}\nColumns: {df1.shape[1]}')

In [None]:
# first look at the data
df1.sample(5)

In [None]:
# descriptive statistics, data types and missing values
def dataset_infos(dataframe: pd.DataFrame = dataframe) -> pd.DataFrame:
    """
    Build a per-column overview of a dataframe.

    Input: dataframe (defaults to the notebook-level `dataframe`)
    Output: one row per column with its dtype, the describe() statistics,
            the number of nulls and the number of empty strings
    """
    # count missing entries: real nulls and empty strings, one row per column
    null_counts = dataframe.isnull().sum()
    empty_string_counts = dataframe.eq('').sum()
    missing = pd.concat(
        [null_counts, empty_string_counts],
        keys=['nulls', 'empty strings'],
        axis=1,
    )

    # descriptive statistics, transposed so that every column becomes a row
    summary = dataframe.describe(include='all').T
    # put the data type in front of the statistics
    summary.insert(loc=0, column='dtype', value=dataframe.dtypes)

    # join statistics and missing-value counts on the column name
    return summary.merge(missing, left_index=True, right_index=True)

dataset_infos(df1)

In [None]:
# duplicate rows
print(f'Duplicated Rows: {df1.duplicated().sum()}')

# # non-unique values
# non_unique_count = (df1.shape[0] - df1.nunique()).sum()
# print(f'Non-Unique Values: {non_unique_count}')

# # unique values
# print(f'Unique Values: {df1.nunique()}')

# # missing values
# print(f'Missing Values: {df1.isnull().sum().sum()}')

In [None]:
# # plotting a correlation matrix to detect dependencies with the dependent variable

# correlation_matrix = df1.corr()
# mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

# fig, ax = plt.subplots(figsize=(10, 8))
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
# sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, center=0, annot=True,
#             square=True, linewidths=0.5, cbar_kws={"shrink": 0.8})

# plt.title("Correlation Matrix")
# plt.show()

In [None]:
# df1[df1['Fire Alarm']==1].head()
# # properties, when fire alarm is executed
# # 11956 rows

In [None]:
# df1[df1['Fire Alarm']==0].head()
# # properties, when fire alarm is executed-
# # 8920 rows

In [None]:
# df_fire_alarm_1 = df1[df1['Fire Alarm'] == 1]
# df_fire_alarm_0 = df1[df1['Fire Alarm'] == 0]

# sns.boxplot(x='Fire Alarm', y='Pressure[hPa]', data=df1, palette=colors).set(
# title='Box Plot: Pressure[hPa]',
# xlabel='Fire Alarm',
# ylabel='Pressure[hPa]'
# )

# plt.tight_layout()
# plt.show()

In [None]:
# True when every UTC timestamp occurs exactly once in df1
unique_values = df1['UTC'].unique()
is_unique = len(df1['UTC']) == len(unique_values)
print(is_unique)

In [None]:
# True when every sample counter (CNT) value occurs exactly once in df1;
# the unique count is compared against the length of the CNT column itself
unique_values = df1['CNT'].unique()
is_unique = len(unique_values) == len(df1['CNT'])
print(is_unique)

In [None]:
# plot target distribution (pie chart)
def plot_target_distribution(dataframe: pd.DataFrame = dataframe, target='Fire Alarm'):
    """
    Show the class distribution of the target column as a pie chart.

    Input: dataframe (defaults to the notebook-level `dataframe`) and the
           name of the target column
    Output: None (the figure is displayed with plt.show())
    """
    # number of rows per target class
    class_counts = dataframe[target].value_counts()

    plt.figure(figsize=(10, 5))
    plt.pie(
        class_counts,
        labels=class_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=colors,
    )
    plt.title('Target Distribution')
    plt.legend()
    plt.show()

plot_target_distribution(df1)

In [None]:
plot_target_distribution(df2)

In [None]:
plot_target_distribution(df3)

# Preprocessing

In [None]:
x, y = df1.drop(columns=['Fire Alarm']), df1['Fire Alarm']

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, stratify=y,random_state=420)

In [None]:
def _engineer_time_features(x: pd.DataFrame) -> pd.DataFrame:
    """Drop the sample counter and replace the UTC column by Hour and WeekDay columns."""
    # drop() returns a new frame, so the caller's dataframe is not modified
    x = x.drop(columns=['CNT'])
    # UTC holds seconds; convert it to datetimes to derive the time features
    timestamp = pd.to_datetime(x['UTC'], unit='s')
    x = x.drop(columns=['UTC'])
    x['Hour'], x['WeekDay'] = timestamp.dt.hour, timestamp.dt.dayofweek
    return x


def preprocessing(x: pd.DataFrame, y: pd.Series = None, reset: bool = False):
    '''
    Preprocessing function

    Fits the preprocessing pipeline (median imputation + standard scaling) and
    logs it to MLflow, or re-uses the pipeline of an existing
    "Preprocessing Pipeline" run to transform new data.

    Input:
      x: feature dataframe containing the 'CNT' and 'UTC' columns
      y: target values; when given while the pipeline is fitted, the
         transformed data is oversampled with SMOTE
      reset: if True, delete the existing preprocessing run and fit a new pipeline
    Output:
      (x_resampled, y_resampled) when the pipeline is fitted and y is given,
      otherwise only the transformed features
    '''

    run_name = "Preprocessing Pipeline"

    # Search for preprocessing run
    existing_runs = mlflow.search_runs(
        experiment_ids=experiment_id,
        filter_string=f"tags.mlflow.runName = '{run_name}'",
    )
    # id of the first run returned by the search, if any
    run_id = None if existing_runs.empty else existing_runs.iloc[0].run_id

    # identical feature engineering for the fit and the transform path
    features = _engineer_time_features(x)

    # a logged pipeline exists and no refit is requested: transform only
    if run_id is not None and not reset:
        pipeline = mlflow.sklearn.load_model(f'runs:/{run_id}/PreprocessingPipeline')
        data_preprocessed = pipeline.transform(features)
        print('Preprocessing Pipeline: transform')
        # only return x (no smote on new data)
        return data_preprocessed

    # (re)fit: delete the last run if it exists
    if run_id is not None:
        mlflow.delete_run(run_id)
        print('Last run deleted')

    # preprocessing pipeline
    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        # ('power_transformer', PowerTransformer())
    ])
    data_preprocessed = pipeline.fit_transform(features)

    # balance the classes only when targets are available
    if y is not None:
        x_smote, y_smote = SMOTE(random_state=420).fit_resample(data_preprocessed, y)

    print('Preprocessing Pipeline: fit and transform')

    # the context manager ends the run on exit, no explicit end_run() needed
    with mlflow.start_run(run_name=run_name):
        mlflow.sklearn.log_model(pipeline, 'PreprocessingPipeline')

    # without targets there is nothing to resample: return the features only
    # (the resampled names would otherwise be unbound at this point)
    if y is None:
        return data_preprocessed

    return x_smote, y_smote

In [None]:
# preprocess training data
xtrain_prepro, ytrain_prepro = preprocessing(xtrain, ytrain, reset=True)

In [None]:
# preprocess test data
xtest_prepro = preprocessing(xtest)

# Modelling

## Training Multiple Classifiers

In [None]:
# fit prediction model on train dataset
def classifier_comparison(
    xtrain: pd.DataFrame = xtrain_prepro,
    ytrain: pd.DataFrame = ytrain_prepro,
    xtest: pd.DataFrame = xtest_prepro,
    ytest: pd.DataFrame = ytest,
    clf: dict = None
    ):
    """
    Fit several classifiers and compare their ROC curves in one plot.

    The plot is written to 'Plots/ROC Curves.png' and logged to an MLflow run
    named "Classifier Comparison".

    Input:
      xtrain, ytrain: training features / targets
      xtest, ytest: test features / targets used for the ROC curves
      clf: dict mapping a display name to an unfitted classifier; when None,
           Logistic Regression, Decision Tree and Random Forest are compared
    Output: None (the figure is displayed with plt.show())

    NOTE: the data defaults are bound to the notebook variables at the moment
    this cell is executed; pass the data explicitly after re-running the
    preprocessing.
    """
    # create the default estimators per call: a dict literal in the signature
    # is built only once, so its (already fitted) estimators would be shared
    # between calls
    if clf is None:
        clf = {
            'Logistic Regression': LogisticRegression(random_state=420),
            'Decision Tree': DecisionTreeClassifier(random_state=420),
            'Random Forest': RandomForestClassifier(random_state=420)
            }

    run_name = "Classifier Comparison"
    plot_path = 'Plots/ROC Curves.png'

    # start run (ended automatically when the with-block is left)
    with mlflow.start_run(run_name=run_name):
        # build plot
        fig, ax = plt.subplots(1, figsize=(10, 7))
        # iterate over classifiers
        for name, estimator in clf.items():
            # fit classifier
            estimator.fit(xtrain, ytrain)
            # plot roc curve
            RocCurveDisplay.from_estimator(estimator, xtest, ytest, ax=ax, name=name)
        ax.set_title('Receiver Operating Characteristic (ROC)')
        # diagonal reference line
        ax.plot([0,1], [0,1], linestyle='--')
        # make sure the target folder exists before saving the plot
        os.makedirs(os.path.dirname(plot_path), exist_ok=True)
        fig.savefig(plot_path)
        # log plot
        mlflow.log_figure(fig, plot_path)

    return plt.show()

In [None]:
classifier_comparison(xtrain=xtrain_prepro, ytrain=ytrain_prepro, xtest=xtest_prepro, ytest=ytest)

## GridSearchCV

In [None]:
def grid_search(
    x: pd.DataFrame,
    y: pd.DataFrame,
    xtest: pd.DataFrame = None,
    ytest: pd.DataFrame = None,
    clf = None,
    param_grid: dict = None,
    predict: bool = False
    ):
    """
    Run a 5-fold GridSearchCV (scoring: f1) for a classifier.

    predict falsy: the fitted grid search is logged to an MLflow run named
        "Grid Search - <classifier>"; an existing run with that name is
        deleted first.
    predict truthy: nothing is logged; the fitted grid search predicts xtest
        and the classification report for ytest is printed.

    Input:
      x, y: training features / targets
      xtest, ytest: test features / targets (required when predict is truthy)
      clf: unfitted classifier; defaults to RandomForestClassifier(random_state=420)
      param_grid: parameter grid; defaults to a random forest grid over
                  n_estimators and max_depth
      predict: switch between evaluation and logging mode
    Output: None
    Raises: ValueError if predict is truthy but xtest or ytest is missing
    """
    # defaults are created per call instead of once at definition time
    if clf is None:
        clf = RandomForestClassifier(random_state=420)
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]
            }

    # fail before the expensive search instead of after it
    if predict and (xtest is None or ytest is None):
        raise ValueError('xtest and ytest are required when predict=True')

    # run name
    clf_name = str(clf).split("(")[0]
    run_name = f"Grid Search - {clf_name}"

    # initialize grid search
    grid = GridSearchCV(clf, param_grid, cv=5, scoring='f1', n_jobs=-1)

    # evaluation mode: fit, predict and report, nothing is logged
    if predict:
        grid.fit(x, y)
        ypred = grid.predict(xtest)
        print(classification_report(ytest, ypred))
        return

    # logging mode: search for an earlier grid search run of this classifier
    previous_runs = mlflow.search_runs(
        experiment_ids=experiment_id,
        filter_string=f"tags.mlflow.runName = '{run_name}'",
    )
    # delete the last run if it exists
    if not previous_runs.empty:
        mlflow.delete_run(previous_runs.iloc[0].run_id)

    # start run (ended automatically when the with-block is left)
    with mlflow.start_run(run_name=run_name):
        # fit grid search
        grid.fit(x, y)
        # log the fitted grid search object
        mlflow.sklearn.log_model(grid, f'GridSearch_{clf_name}')
        # log best parameters
        mlflow.log_param('best_params', grid.best_params_)
        # print message
        print('GridSearchCV: fit')

In [None]:
# logistic regression grid_params
grid_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }

# the default 'lbfgs' solver does not support the 'l1' penalty, so those
# candidates could not be fitted; 'liblinear' supports both 'l1' and 'l2'
grid_search(
    x=xtrain_prepro,
    y=ytrain_prepro,
    xtest=xtest_prepro,
    ytest=ytest,
    clf=LogisticRegression(random_state=420, solver='liblinear'),
    param_grid=grid_params,
    predict=True
    )

In [None]:
# decision tree grid_params
grid_params = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]
    }

grid_search(
    x=xtrain_prepro,
    y=ytrain_prepro,
    xtest=xtest_prepro,
    ytest=ytest,
    clf=DecisionTreeClassifier(random_state=420),
    param_grid=grid_params,
    predict=True
    )

In [None]:
grid_search(
    x=xtrain_prepro,
    y=ytrain_prepro,
    xtest=xtest_prepro,
    ytest=ytest,
    clf=RandomForestClassifier(random_state=420),
    predict=True
    )

## Train Model with GridSearchCV-Parameters and all Training Data

In [None]:
def training(
        x: pd.DataFrame,
        y: pd.DataFrame,
        clf = None,
        ):
    """
    Train the final model for a classifier and log it to MLflow.

    If a "Grid Search - <classifier>" run exists, the best estimator found by
    that grid search is trained on x, y; otherwise `clf` is trained as passed.
    The fitted model is logged as artifact 'model' in a run named
    "Trained Model - <classifier>".

    Input:
      x, y: training features / targets
      clf: classifier that selects the grid search run and serves as fallback
           model; defaults to RandomForestClassifier(random_state=420)
    Output: None
    """
    # create the default per call so that no fitted estimator is shared between calls
    if clf is None:
        clf = RandomForestClassifier(random_state=420)

    # run name
    clf_name = str(clf).split("(")[0]
    run_name = f"Trained Model - {clf_name}"

    # name of the grid search run that belongs to this classifier
    model_grid_search = f"Grid Search - {clf_name}"

    # Search for grid search run
    grid_runs = mlflow.search_runs(
        experiment_ids=experiment_id,
        filter_string=f"tags.mlflow.runName = '{model_grid_search}'",
    )

    # if grid search run does not exist
    if grid_runs.empty:
        print(f'No grid search run found for {clf_name}')
        print('Model logged with default parameters')
        # initialize default model
        model = clf

    # if grid search run exists
    else:
        print(f'Grid search run found for {clf_name}')
        print('Model logged with optimal parameters')
        run_id = grid_runs.iloc[0].run_id
        # load grid search model
        logged_model = mlflow.sklearn.load_model(f'runs:/{run_id}/GridSearch_{clf_name}')
        # The logged artifact is the whole grid search object; calling fit()
        # on it would repeat the complete cross-validated search. Train only
        # the winning estimator (falls back to the loaded object if it does
        # not expose best_estimator_).
        model = getattr(logged_model, 'best_estimator_', logged_model)

    # start run (ended automatically when the with-block is left)
    with mlflow.start_run(run_name=run_name):
        # fit model
        model.fit(x, y)
        # log model
        mlflow.sklearn.log_model(model, 'model')

In [None]:
# combine train and test data
x, y = preprocessing(x=pd.concat([xtrain, xtest], axis=0), y=pd.concat([ytrain, ytest], axis=0), reset=True)

In [None]:
# logistic regression grid_params
grid_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
# the default 'lbfgs' solver does not support the 'l1' penalty;
# 'liblinear' supports both 'l1' and 'l2'
# grid search on full train dataset
grid_search(x=x, y=y, clf=LogisticRegression(random_state=420, solver='liblinear'), param_grid=grid_params)
# train model on full train dataset
training(x=x, y=y, clf=LogisticRegression(random_state=420, solver='liblinear'))

In [None]:
# decision tree grid_params
grid_params = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10]
    }
# grid search on full train dataset
grid_search(x=x, y=y, clf=DecisionTreeClassifier(random_state=420), param_grid=grid_params)
# train model on full train dataset
training(x=x, y=y, clf=DecisionTreeClassifier(random_state=420))

In [None]:
# random forest grid_params
grid_params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]
    }
# grid search on full train dataset
grid_search(x=x, y=y, clf=RandomForestClassifier(random_state=420), param_grid=grid_params)
# train model on full train dataset
training(x=x, y=y, clf=RandomForestClassifier(random_state=420))

## Prediction

In [None]:
def predict_new_data(x, y, clf = None, dataframe_name: str = None):
    """
    Predict new data with the logged "Trained Model - <classifier>" model and
    log accuracy, precision, recall and f1-score to a new MLflow run.

    Input:
      x: preprocessed features of the new data
      y: true targets of the new data
      clf: classifier whose class name selects the trained model run;
           defaults to RandomForestClassifier(random_state=420)
      dataframe_name: label used in the run name and the metric names; when
           None, the name of the global variable that refers to x is used
           ('unnamed_data' if there is none)
    Output: None. If no trained model run exists, a message is printed and
            nothing is logged.
    """
    if clf is None:
        clf = RandomForestClassifier(random_state=420)

    # get dataframe name: first public global that is the very same object as x
    if dataframe_name is None:
        dataframe_name = next(
            (name for name, value in globals().items()
             if value is x and not name.startswith('_')),
            'unnamed_data',
        )

    # run name
    clf_name = str(clf).split("(")[0]
    run_name = f"{clf_name} Prediction - {dataframe_name}"

    # check if classifier already got trained
    trained_model_search = f"Trained Model - {clf_name}"

    # search run
    trained_runs = mlflow.search_runs(
        experiment_ids=experiment_id,
        filter_string=f"tags.mlflow.runName = '{trained_model_search}'",
    )

    # if no trained model found: stop here, there is nothing to predict with
    if trained_runs.empty:
        print(f'No trained model found for {clf_name}')
        return

    print(f'Trained model found for {clf_name}\n\n')
    # load model (optimal)
    run_id = trained_runs.iloc[0].run_id
    model = mlflow.sklearn.load_model(f'runs:/{run_id}/model')

    # start run (ended automatically when the with-block is left)
    with mlflow.start_run(run_name=run_name):
        # predict on new data
        ypred = model.predict(x)
        # log the evaluation metrics, prefixed with the dataframe name
        metrics = {
            'accuracy': accuracy_score(y, ypred),
            'precision': precision_score(y, ypred),
            'recall': recall_score(y, ypred),
            'f1_score': f1_score(y, ypred),
        }
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(f'{dataframe_name}_{metric_name}', metric_value)
        # log model
        mlflow.sklearn.log_model(model, f'{dataframe_name}_model')
        # print classification report
        print(f'Classification Report:\n\n{classification_report(y, ypred)}\n\n')

# Great Expectations

1. Installation

- Open Anaconda Prompt Terminal
- Install the package with `!pip install great_expectations`, then run `great_expectations init` and confirm with 'Y'
- Verify the installation by checking the version with `!great_expectations --version`
- Our version used: version 0.16.13


More information: https://docs.greatexpectations.io/docs/tutorials/quickstart/

Für meine Süßen zum Verstehen: https://www.youtube.com/watch?v=YLy4NsioUVI

*Context*: A context is the main object that manages the overall configuration and execution of the data expectations. It serves as a container for storing and organizing expectations, data sources, and validation results. The context allows us to define, execute, and manage our data expectations.

*Validator*: A validator is responsible for evaluating expectations on a given batch of data. Validators are used to validate data against a set of predefined expectations. They help to assess data quality, perform data validation, and monitor data pipelines.

*Suite*: An Expectation Suite is a collection of expectations that define the desired properties and characteristics of our data. It serves as a set of rules against which your data can be validated. The suite contains a set of expectations that can be applied to one or more batches of data. 

*Batch*: A batch represents a subset of data that we want to evaluate against our expectations. It can be a collection of rows, a partitioned dataset, a file, a table, or any other logical grouping of data. Batches are used as inputs to validation processes and contain the data you want to validate.

*Checkpoint*: A Checkpoint is a way to operationalize data validation using Expectation Suites. It allows you to define a pipeline-like flow for performing data validation on batches of data. It helps automate the validation process by defining the steps to be executed on data batches and tracking the results.

In [None]:
# Testing

import random
# Select a random column index
random_column = random.choice(df2.columns)

# Drop the random column
df2 = df2.drop(random_column, axis=1)
df2.head()

In [None]:
# First step: Split data frame and save each one as a csv-file
df1.to_csv('df_one.csv', index=False)
df2.to_csv('df_two.csv', index=False)
df3.to_csv('df_three.csv', index=False)

In [None]:
# Set up
import great_expectations as gx
from great_expectations.checkpoint import SimpleCheckpoint

# Create data context
context = gx.get_context()

# Connect to data
validator = context.sources.pandas_default.read_csv("Data/smoke_detection.csv")

# # get column names from initial df, here df1
# gx_columns = df1.get_table_columns()
# # get dtypes from initial dataframe
# gx_dtypes = df1.dtypes.apply(lambda x: str(x)).to_dict()

column_names = [f"{column_name}" for column_name in validator.columns()]
print(f"Columns: {', '.join(column_names)}.")
print(validator.head(n_rows=5, fetch_all=False))

# Create expectation suite
suite = context.create_expectation_suite("smoke_detection_expectations",overwrite_existing=True)

Now we use that data source for profiling, validation and documentation. More information regarding expectations can be found here: https://legacy.docs.greatexpectations.io/en/latest/reference/glossary_of_expectations.html

In [None]:
# Use the Validator to create and run an Expectation
validator.expect_table_columns_to_match_ordered_list(column_names)

In [None]:
validator.expect_column_values_to_be_unique("UTC")

In [None]:
validator.expect_column_to_exist("Temperature[C]")

In [None]:
validator.expect_column_to_exist("Humidity[%]")

In [None]:
validator.expect_column_to_exist("TVOC[ppb]")

In [None]:
validator.expect_column_to_exist("eCO2[ppm]")

In [None]:
validator.expect_column_to_exist("Raw H2")

In [None]:
validator.expect_column_to_exist("Raw Ethanol")

In [None]:
validator.expect_column_to_exist("PM1.0")

In [None]:
validator.expect_column_to_exist("PM2.5")

In [None]:
validator.expect_column_to_exist("NC0.5")

In [None]:
validator.expect_column_to_exist("NC1.0")

In [None]:
validator.expect_column_to_exist("NC2.5")

In [None]:
validator.expect_column_values_to_be_unique("CNT")

In [None]:
validator.expect_column_values_to_be_in_set("Fire Alarm",[0,1])

In [None]:
results = validator.validate()
# print(results)

# print(checkpoint_result)

# print(checkpoint)

In [None]:
# Review and save our expectation suite
print(validator.get_expectation_suite(discard_failed_expectations=False))
validator.save_expectation_suite(discard_failed_expectations=False)


# Create a checkpoint
checkpoint = SimpleCheckpoint(
    'smoke_detection_checkpoint',
    context,
    validator=validator,
    batches=[
        {
            "batch_kwargs": {"path": "df_two.csv"},
            "expectation_suite_names": 'smoke_detection_expectations',
        },
        {
            "batch_kwargs": {"path": "df_three.csv"},
            "expectation_suite_names": 'smoke_detection_expectations',
        },
    ],
)

# # Create checkpoint
# checkpoint = SimpleCheckpoint(
#     "smoke-detection",
#     context,
#     validator=validator,
# )

# Run checkpoint to validate data 
checkpoint_result = checkpoint.run()

# View results
context.build_data_docs()

validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=validation_result_identifier)

In [None]:
# Wrap the test dataframes as Great Expectations data objects
test_data_gx = [gx.from_pandas(frame) for frame in (df2, df3)]

# Validate every test dataset against the expectation suite
test_results = []
for test_df in test_data_gx:
    test_results.append(test_df.validate(expectation_suite=suite))

# Stop at the first dataset whose validation did not succeed
for i, result in enumerate(test_results):
    if not result["success"]:
        raise Exception(f"Test {i+1}: Data Quality Tests failed.")
    print(f"Test {i+1}: All Data Quality Tests are green!")

# Print the full validation result of every dataset
for i, result in enumerate(test_results):
    print(f"Test {i+1} Results:")
    print(result)

References:
- https://github.com/datarootsio/tutorial-great-expectations/blob/main/tutorial_great_expectations.ipynb
- https://medium.com/@mostsignificant/python-data-validation-made-easy-with-the-great-expectations-package-8d1be266fd3f
- https://www.linkedin.com/pulse/data-validation-great-expectations-ensure-your-meets-standards-kundi/?trk=pulse-article_more-articles_related-content-card
- https://towardsdatascience.com/great-expectations-automated-testing-for-data-science-and-engineering-teams-1e7c78f1d2d5
- https://towardsdatascience.com/a-great-python-library-great-expectations-6ac6d6fe822e

# Dataset 2

In [None]:
# Great expectation as data unit test tool
# Lukas did: check column to exist/ datatypes, missing values, duplicates, distributions, outliers
# For me to add: Column unique values of UTC and CNT, column value distribution of binary variable

# I will not define the range, as we already look for outliers
# For binary classification, a balanced dataset would ideally have a 50:50 distribution between the two classes
# A threshold here could be between 30/70 and 20/80

def _report_check(label: str, passed: bool) -> None:
    """Print one coloured OK/Failed line for a data quality check."""
    verdict = colored('OK', 'green') if passed else colored('Failed', 'red')
    print(f'{label}: ', verdict)


def great_expectations(check_dataframe, initial_dataframe=df1):
    """
    Run data quality checks on a dataframe, using an initial dataframe as
    reference, and print one OK/Failed line per check.

    Checks: column names and order, data types, missing values, duplicated
    rows, value distribution (KL divergence below 0.6) and value range
    (within min/max of the initial dataframe).

    Input:
      check_dataframe: pandas dataframe to check
      initial_dataframe: reference pandas dataframe (default is bound to the
                         notebook's df1 when this cell is executed)
    Output: None
    """
    initial_df = gx.dataset.PandasDataset(initial_dataframe)
    check_df = gx.dataset.PandasDataset(check_dataframe)

    # get column names from initial dataframe
    gx_columns = initial_df.get_table_columns()
    # get dtypes from initial dataframe
    gx_dtypes = initial_df.dtypes.apply(lambda x: str(x)).to_dict()

    # per-column expectations are only evaluated on columns that exist in the
    # checked dataframe; a missing column makes every per-column check fail
    shared_columns = [col for col in gx_columns if col in check_dataframe.columns]
    all_columns_present = len(shared_columns) == len(gx_columns)

    def _all_columns_pass(expectation) -> bool:
        # evaluate every column (no short-circuit) before combining the results
        outcomes = [expectation(col)['success'] for col in shared_columns]
        return all_columns_present and all(outcomes)

    # check if column are the same in all dataframes
    column_check = check_df.expect_table_columns_to_match_ordered_list(column_list=gx_columns)
    _report_check('Columns Check', column_check['success'])

    # check if dtypes are the same in all dataframes
    _report_check('Datatype Check', _all_columns_pass(
        lambda col: check_df.expect_column_values_to_be_of_type(column=col, type_=gx_dtypes[col])
    ))

    # check if dataframe has missing values
    _report_check('Missing Values Check', _all_columns_pass(
        lambda col: check_df.expect_column_values_to_not_be_null(column=col)
    ))

    # check if dataframe has duplicates: look for fully duplicated rows
    # (comparing the row count with itself could never fail)
    _report_check('Duplicates Check', not check_dataframe.duplicated().any())

    # check if dataframe distribution is the same as in initial dataframe
    def _distribution_check(col):
        # relative frequency of every value in the initial dataframe
        reference = initial_df[col].value_counts(normalize=True)
        return check_df.expect_column_kl_divergence_to_be_less_than(
            column=col,
            partition_object={'values': reference.index.tolist(), 'weights': reference.tolist()},
            threshold=0.6,
        )
    _report_check('Distribution Check', _all_columns_pass(_distribution_check))

    # check if dataframe has outliers (values outside the range of the initial dataframe)
    _report_check('Outlier Check', _all_columns_pass(
        lambda col: check_df.expect_column_values_to_be_between(
            column=col, min_value=initial_df[col].min(), max_value=initial_df[col].max()
        )
    ))

In [None]:
great_expectations(check_dataframe=df2, initial_dataframe=df1)

In [None]:
df2_prepro, df2_y = preprocessing(df2.drop(columns=['Fire Alarm'])), df2['Fire Alarm']

In [None]:
predict_new_data(df2_prepro, df2_y, clf=LogisticRegression(random_state=420))

In [None]:
predict_new_data(df2_prepro, df2_y, clf=DecisionTreeClassifier(random_state=420))

In [None]:
predict_new_data(df2_prepro, df2_y, clf=RandomForestClassifier(random_state=420))

# Dataset 3

In [None]:
great_expectations(check_dataframe=df3, initial_dataframe=df1)

In [None]:
df3_prepro, df3_y = preprocessing(df3.drop(columns=['Fire Alarm'])), df3['Fire Alarm']

In [None]:
predict_new_data(df3_prepro, df3_y, clf=LogisticRegression(random_state=420))

In [None]:
predict_new_data(df3_prepro, df3_y, clf=DecisionTreeClassifier(random_state=420))

In [None]:
predict_new_data(df3_prepro, df3_y, clf=RandomForestClassifier(random_state=420))