## CPE393 Machine Learning Operations Homework: Model Monitoring

### Import Packages

In [2]:
import pandas as pd
from datetime import datetime, timedelta
from sklearn import datasets
from scipy import stats
import numpy as np
from evidently.legacy.test_suite import TestSuite
from evidently.legacy.test_preset import DataDriftTestPreset, DataQualityTestPreset, DataStabilityTestPreset

### Create simulated reference and current datasets

In [3]:
# Load the delayed-flights dataset from the notebook's working directory.
df = pd.read_csv("DelayedFlights.csv")

# Split by row position into a reference window and a current window.
# The reference window starts at row 0; the earlier `df[1:500000]` slice
# silently left the first record out of the reference data.
ref_data = df.iloc[:500000]
curr_data = df.iloc[500000:700000]

In [4]:
# Columns screened by the z-score outlier filter at the end of this cell.
numerical_columns = [
    'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'CRSArrTime',
    'FlightNum', 'CRSElapsedTime', 'AirTime', 'DepDelay',
    'Distance', 'TaxiIn', 'TaxiOut', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
    'SecurityDelay', 'LateAircraftDelay'
]

# Remove columns that are not carried forward.
# NOTE(review): this rebinds `df`, so the cell cannot be re-run without first
# reloading the CSV (a second run raises KeyError because the columns are gone).
# NOTE(review): in the visible notebook `ref_data` / `curr_data` are sliced from
# `df` before this cell runs, so the Evidently suites below operate on the raw,
# un-imputed data and `df_no_outliers` is not consumed -- confirm that is intended.
df = df.drop(['Unnamed: 0', 'Year', 'CancellationCode', 'TailNum', 'Diverted', 'Cancelled', 'ArrTime', 'ActualElapsedTime'], axis=1)
delay_colns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

# Missing values in the delay-cause columns are filled with 0.
df[delay_colns] = df[delay_colns].fillna(0)

# Missing values in these columns are filled with the per-column median.
columns_to_impute = ['AirTime', 'ArrDelay', 'TaxiIn', 'CRSElapsedTime']
df[columns_to_impute] = df[columns_to_impute].fillna(df[columns_to_impute].median())

# One-hot encode the categorical columns; drop_first=True omits the first level
# of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(df, columns=['UniqueCarrier', 'Origin', 'Dest'], drop_first=True)

# Outlier screen: keep a row only if every numerical column lies within
# `z_threshold` standard deviations of that column's mean.
z_threshold = 3
# nan_policy="omit" computes each column's mean/std while ignoring NaN. With the
# default ("propagate") a single NaN in a column turns that entire z-score
# column into NaN, every `<=` comparison below becomes False, and the filter
# returns an empty frame. Several screened columns (e.g. 'TaxiOut') are not
# imputed above, so NaN may still be present -- TODO confirm against the data.
# When no NaN is present the result is identical to the default policy.
z_scores = np.abs(stats.zscore(df[numerical_columns], nan_policy="omit"))
# (row, column) positions of every value beyond the threshold.
outliers = np.where(z_scores > z_threshold)
# Rows that still hold a NaN in a screened column have a NaN z-score there and
# are therefore excluded by the `<=` test as well.
df_no_outliers = df[(z_scores <= z_threshold).all(axis=1)]


### Develop a model quality report using Evidently AI

#### Data Drift

In [6]:
# Run Evidently's data-drift test preset, comparing the current window against
# the reference window.
data_drift_suite = TestSuite(tests=[DataDriftTestPreset()])
data_drift_suite.run(reference_data=ref_data, current_data=curr_data)

# Always write the HTML report. The "All reports link" section links to this
# file unconditionally, so saving only on failure left a dead (or stale) link
# whenever every test passed.
data_drift_suite.save_html("./data_drift_suite.html")

# Show the overall pass/fail flag as the cell output.
data_drift_passed = data_drift_suite.as_dict()['summary']['all_passed']
data_drift_passed


In [7]:
# Run Evidently's data-quality and data-stability test presets on the same
# reference/current windows. Despite the variable name, the presets used here
# are data checks; this cell passes no model predictions or targets.
model_quality_suite = TestSuite(tests=[DataQualityTestPreset(), DataStabilityTestPreset()])
model_quality_suite.run(reference_data=ref_data, current_data=curr_data)

# Always write the HTML report. The "All reports link" section links to this
# file unconditionally, so saving only on failure left a dead (or stale) link
# whenever every test passed.
model_quality_suite.save_html("./model_quality_suite.html")

# Show the overall pass/fail flag as the cell output.
model_quality_passed = model_quality_suite.as_dict()['summary']['all_passed']
model_quality_passed

### All reports link

- The model quality report (data quality and data stability tests) is saved as an HTML file: [model_quality_suite.html](./model_quality_suite.html)
- The data drift report is saved as an HTML file: [data_drift_suite.html](./data_drift_suite.html)