In [3]:
import time
import uuid
import numpy as np
import pandas as pd
import json
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef,
    balanced_accuracy_score,
    jaccard_score,
    cohen_kappa_score,
)

class AIModelProcessor:
    """Evaluate a fitted classifier against its training data and log results.

    Constructing an instance immediately:

    1. splits ``training_data`` into features (``x_train``) and target
       (``y_train``),
    2. runs 5-fold cross-validation with the supplied model,
    3. computes classification metrics on the training set itself (so the
       reported numbers are training-set scores, not held-out scores),
    4. builds an Evidently data-drift report, and
    5. appends all of the above to ``log_file`` as indented JSON entries.

    Args:
        model: A fitted estimator exposing ``predict`` and ``predict_proba``.
        training_data: DataFrame holding the feature columns and the target.
        target_column: Name of the target column inside ``training_data``.
        model_version: Free-form version label stored on the instance.
        model_type: Free-form model label stored on the instance.
        log_file: Path of the file that log entries are appended to.
    """

    def __init__(self, model, training_data, target_column, model_version="1.0", model_type="Model", log_file="model_logs.json"):
        self.model = model
        self.model_type = model_type
        self.model_version = model_version
        self.log_file = log_file
        self.training_data = training_data
        self.target_column = target_column

        self.x_train = None
        self.y_train = None
        # Feature names, in training order; used to rebuild DataFrames from
        # raw arrays / dicts later on.
        self.column_names = list(training_data.columns.drop(target_column))

        self._initialize_data()
        self._process_data()
        self._log_instance("Initialized model processor")

    def _initialize_data(self):
        """Split ``training_data`` into ``x_train`` and ``y_train``."""
        self.y_train = self.training_data[self.target_column]
        self.x_train = self.training_data.drop(columns=[self.target_column])

    def _process_data(self):
        """Run cross-validation, metrics and drift detection, then log them."""
        cross_val_scores = self.cross_validation_metrics()

        # Metrics are computed on the training features themselves.
        y_pred = self.model.predict(self.x_train)
        y_prob = self.model.predict_proba(self.x_train)
        metrics = self.calculate_metrics(self.y_train, y_pred, y_prob)

        # Evidently returns a JSON string; decode it so it nests as an object
        # inside the log entry instead of as an escaped string.
        drift_report = json.loads(self.evidentlyAi(self.x_train))

        self.save_log(
            {
                "Model Metrics": metrics,
                "Cross Validation Scores": cross_val_scores.tolist(),
                "Data Drift Report": drift_report,
            }
        )

    def evidentlyAi(self, test_data):
        """Return an Evidently data-drift report for ``test_data`` as JSON text.

        Args:
            test_data: Current feature data, either a DataFrame or a NumPy
                array whose columns follow ``self.column_names``.

        Returns:
            The report serialized by ``Report.json()``.
        """
        reference_data = self.training_data.drop(columns=[self.target_column])

        # A bare array carries no column labels; attach the training feature
        # names so its columns can be matched against the reference data.
        if isinstance(test_data, np.ndarray):
            test_data = pd.DataFrame(test_data, columns=self.column_names)

        report = Report(metrics=[DataDriftPreset()])
        report.run(current_data=test_data, reference_data=reference_data)
        return report.json()

    def calculate_metrics(self, y_true, y_pred, y_prob):
        """Compute, print, log and return classification metrics.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            y_prob: Class probabilities as returned by ``predict_proba``.

        Returns:
            Dict mapping metric name to score.
        """
        # roc_auc_score wants the positive-class column for binary targets
        # but the full (n_samples, n_classes) matrix for multiclass targets.
        # Always slicing column 1 made this method fail on multiclass data.
        prob = np.asarray(y_prob)
        if prob.ndim == 2 and prob.shape[1] == 2:
            auc_roc = roc_auc_score(y_true, prob[:, 1])
        else:
            auc_roc = roc_auc_score(y_true, prob, multi_class="ovr")

        metrics = {
            "Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred, average="weighted"),
            "Recall": recall_score(y_true, y_pred, average="weighted"),
            "F1 Score": f1_score(y_true, y_pred, average="weighted"),
            "AUC-ROC": auc_roc,
            "Matthews Correlation Coefficient": matthews_corrcoef(y_true, y_pred),
            "Balanced Accuracy": balanced_accuracy_score(y_true, y_pred),
            "Jaccard Index": jaccard_score(y_true, y_pred, average="weighted"),
            "Cohen's Kappa": cohen_kappa_score(y_true, y_pred),
        }
        print("Metrics calculated:", metrics)
        self._log_instance("Metrics calculated", additional_data=metrics)
        return metrics

    def cross_validation_metrics(self):
        """Run 5-fold accuracy cross-validation; print, log and return scores."""
        cross_val_scores = cross_val_score(self.model, self.x_train, self.y_train, cv=5, scoring="accuracy")
        print(f"Cross-validation scores: {cross_val_scores}")
        print(f"Mean cross-validation score: {np.mean(cross_val_scores)}")
        self._log_instance("Cross-validation performed", additional_data={"Cross Validation Scores": cross_val_scores.tolist()})
        return cross_val_scores

    def save_log(self, log_data):
        """Append ``log_data`` to the log file under a timestamp.

        Failures are reported on stdout and never raised, so logging cannot
        abort an evaluation run.
        """
        log_entry = {
            "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "Log Data": log_data,
        }
        self._append_entry(log_entry, "Error saving log to file")

    def process_input(self, input_data):
        """Predict a single sample and log the input together with the result.

        Args:
            input_data: One sample; it is wrapped in a one-row DataFrame whose
                columns are ``self.column_names``.

        Returns:
            Tuple ``(prediction, prediction_proba)`` exactly as returned by
            the model.
        """
        input_df = pd.DataFrame([input_data], columns=self.column_names)
        prediction = self.model.predict(input_df)
        prediction_proba = self.model.predict_proba(input_df)

        log_data = {
            "Input Data": input_data,
            "Prediction": prediction.tolist(),
            "Prediction Probability": prediction_proba.tolist(),
        }

        self._log_instance("Processed input data", additional_data=log_data)
        return prediction, prediction_proba

    def _log_instance(self, message, additional_data=None):
        """Append an event entry to the log file.

        Args:
            message: Short event description stored under ``"Event"``.
            additional_data: Optional payload stored under ``"Details"``;
                omitted from the entry when empty or ``None``.
        """
        log_entry = {
            "Timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "Event": message,
        }
        if additional_data:
            log_entry["Details"] = additional_data
        self._append_entry(log_entry, "Error logging instance event")

    def _append_entry(self, log_entry, error_prefix):
        """Serialize ``log_entry`` and append it to ``self.log_file``.

        The entry is rendered to a string *before* the file is opened, so a
        serialization failure can no longer leave a half-written entry in the
        log. Errors are printed with ``error_prefix`` rather than raised.
        """
        try:
            payload = json.dumps(log_entry, indent=4, default=self._json_default)
            with open(self.log_file, "a", encoding="utf-8") as f:
                # Trailing newline keeps consecutive entries separated.
                f.write(payload + "\n")
        except (OSError, TypeError, ValueError) as e:
            print(f"{error_prefix}: {e}")

    @staticmethod
    def _json_default(value):
        """Convert NumPy scalars and arrays into JSON-serializable objects."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

# Example usage: fit a k-NN classifier on the loan dataset, evaluate it through
# AIModelProcessor, then score a single row.
from sklearn.neighbors import KNeighborsClassifier

file_path = 'loan_data.csv'
target_column = 'loan_status'

df = pd.read_csv(file_path)
y = df[target_column]
X = df.drop(columns=[target_column])

# Replace every text (object-dtype) feature with its integer category code.
object_columns = [col for col in X.columns if X[col].dtype == 'object']
for col in object_columns:
    X[col] = X[col].astype('category').cat.codes

# The processor expects features and target together in one frame.
data = pd.concat([X, y], axis=1)

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X, y)

processor = AIModelProcessor(model, training_data=data, target_column=target_column)

# Process a single input: the first training row, passed as a plain dict.
test_input = X.iloc[0].to_dict()
prediction, prediction_proba = processor.process_input(test_input)


Cross-validation scores: [0.80611111 0.83055556 0.82188889 0.84155556 0.85677778]
Mean cross-validation score: 0.8313777777777778
Metrics calculated: {'Accuracy': 0.8805777777777778, 'Precision': 0.8756209610382447, 'Recall': 0.8805777777777778, 'F1 Score': 0.8748263791880365, 'AUC-ROC': 0.93567315, 'Matthews Correlation Coefficient': 0.6321123516376986, 'Balanced Accuracy': 0.7868357142857143, 'Jaccard Index': 0.7889753726800517, "Cohen's Kappa": 0.6241373950885918}
