# Step 2. Training

## Imports

In [None]:
!pip install xgboost --quiet
!pip install seaborn --quiet

In [None]:
import joblib
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot
import seaborn as sns

import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import time

# Mute warnings
import warnings
warnings.filterwarnings("ignore")

## Connecting to Hopsworks Feature Store

In [None]:
import hopsworks
from ipython_secrets import get_secret

# Fetch the API key through ipython_secrets instead of hard-coding it in the notebook.
KEY = get_secret('HOPSWORKS_API_KEY')

# Log in to the Hopsworks project, then get a handle to its feature store.
project = hopsworks.login(host="c.app.hopsworks.ai", api_key_value=KEY)
fs = project.get_feature_store()

### Feature Selection

In [None]:
# Look up version 1 of both feature groups; each name is suffixed with the project id.
trans_fg = fs.get_feature_group(
    name=f"transactions_fraud_streaming_fg_{project.id}",
    version=1,
)
window_aggs_fg = fs.get_feature_group(
    name=f"transactions_aggs_fraud_streaming_fg_{project.id}",
    version=1,
)

In [None]:
# Build the training query: the label and raw features from the transactions
# feature group, joined with every window-aggregate feature except
# "cc_num" and "date_time".
selected_features = trans_fg.select(
    [
        "fraud_label",
        "category",
        "amount",
        "date_time",
        "age_at_transaction",
        "days_until_card_expires",
    ]
).join(
    window_aggs_fg.select_except(["cc_num", "date_time"])
)

In [None]:
# Optional: read the selected features to inspect them (comment this line out to skip)
selected_features.read()

### Transformation Functions

In [None]:
# Fetch the "label_encoder" transformation function from the feature store.
label_encoder = fs.get_transformation_function(name="label_encoder")

# Apply it to the "category" feature only.
transformation_functions = {"category": label_encoder}

## Feature View Creation

In [None]:
# Get or create version 1 of the 'transactions_view_streaming_fv' feature view.
# It is built from the selected-features query, uses "fraud_label" as the label,
# and attaches the transformation functions mapped above.
feature_view = fs.get_or_create_feature_view(
    name='transactions_view_streaming_fv',
    version=1,
    query=selected_features,
    labels=["fraud_label"],
    transformation_functions=transformation_functions,
)

## Training Dataset Creation

In [None]:
# Fraction of the data held out for the test set
TEST_SIZE = 0.2

# Split the feature view data into train/test features and labels
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=TEST_SIZE)

In [None]:
# Order the training rows by "date_time" (ascending) ...
X_train = X_train.sort_values(by="date_time")

# ... and realign the training labels to that new row order
y_train = y_train.reindex(index=X_train.index)

In [None]:
# Order the test rows by "date_time" (ascending) ...
X_test = X_test.sort_values(by="date_time")

# ... and realign the test labels to that new row order
y_test = y_test.reindex(index=X_test.index)

In [None]:
# "date_time" was only needed for ordering; remove it in place from both feature sets
X_train.drop(columns=["date_time"], inplace=True)
X_test.drop(columns=["date_time"], inplace=True)

In [None]:
# Show one randomly sampled row of the prepared training features
X_train.sample()

In [None]:
# Display the relative frequency of each label value in y_train (class balance)
y_train.value_counts(normalize=True)

---

## Modeling

In [None]:
# Create an XGBClassifier with its default hyperparameters
clf = xgb.XGBClassifier()

# Fit the classifier on the training features (passed as a plain array via .values)
# and the training labels
clf.fit(X_train.values, y_train)

In [None]:
# Predict labels for the training features with the fitted classifier
y_pred_train = clf.predict(X_train.values)

# Predict labels for the held-out test features with the fitted classifier
y_pred_test = clf.predict(X_test.values)

In [None]:
# Compute the macro-averaged F1 score on the test set and collect it in a
# metrics dict (passed to the model registry later in this notebook)
metrics = {
    "f1_score": f1_score(y_test, y_pred_test, average='macro')
}
metrics

In [None]:
# Build the confusion matrix from the true test labels (y_test) and the
# predicted test labels (y_pred_test)
results = confusion_matrix(y_test, y_pred_test)

# Print the raw confusion matrix
print(results)

In [None]:
# Label the confusion-matrix rows (true class) and columns (predicted class)
df_cm = pd.DataFrame(
    results,
    index=['True Normal', 'True Fraud'],
    columns=['Pred Normal', 'Pred Fraud'],
)

# Draw the annotated heatmap. fmt="d" prints the counts as whole numbers;
# seaborn's default ".2g" format would abbreviate larger counts (e.g. 1.2e+03).
cm = sns.heatmap(df_cm, annot=True, fmt="d")

# Get the figure from the heatmap and display it
fig = cm.get_figure()
fig.show()

---

## Model Schema

In [None]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Describe the model input from the training feature array
input_schema = Schema(X_train.values)

# Describe the model output from the training labels
output_schema = Schema(y_train)

# Combine the input and output schemas into a single ModelSchema
model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)

# Show the model schema as a dictionary for inspection
model_schema.to_dict()

## Register Model

In [None]:
# Local directory that collects the model artifacts
model_dir = "fraud_streaming_model"

# Create the directory if needed; exist_ok avoids the separate existence check
os.makedirs(model_dir, exist_ok=True)

# Save the trained XGBoost model using joblib
joblib.dump(clf, os.path.join(model_dir, "xgboost_fraud_streaming_model.pkl"))

# Save the confusion matrix heatmap as an image next to the model file
fig.savefig(os.path.join(model_dir, "confusion_matrix.png"))

In [None]:
# Get the model registry
mr = project.get_model_registry()

# Create a new model entry in the model registry
# NOTE(review): the description says "Batch" while the model and feature view are
# named "streaming" — confirm whether the description should be updated.
fraud_model = mr.python.create_model(
    name="xgboost_fraud_streaming_model",     # Name for the model
    metrics=metrics,                      # Metrics used for evaluation
    model_schema=model_schema,            # Schema defining the model's input and output
    input_example=X_train.sample(),       # Example input data for reference
    description="Fraud Batch Predictor",  # Description of the model
)

# Save the model, passing the local directory that holds the model artifacts
fraud_model.save(model_dir)

## Model Deployment

### Predictor script for Python Models

In [None]:
%%writefile predict_example.py
import os
import numpy as np
import hsfs
import joblib


class Predict(object):
    """Predictor that fetches a card's feature vector and scores it with the saved model."""

    def __init__(self):
        """Connect to the feature store, set up the feature view and load the model."""
        # Feature store handle
        connection = hsfs.connection()
        self.fs = connection.get_feature_store()

        # Version 1 of the feature view, initialised for serving
        self.fv = self.fs.get_feature_view("transactions_view_streaming_fv", 1)
        self.fv.init_serving(1)

        # Trained model file, located under the ARTIFACT_FILES_PATH environment variable
        model_path = os.environ["ARTIFACT_FILES_PATH"] + "/xgboost_fraud_streaming_model.pkl"
        self.model = joblib.load(model_path)
        print("Initialization Complete")

    def predict(self, inputs):
        """Serve a prediction request using the trained model.

        Uses ``inputs[0][0]`` as the ``cc_num`` key to fetch a feature vector
        from the feature view, drops the ``date_time`` column, and returns the
        model's prediction as a plain list.
        """
        serving_key = {"cc_num": inputs[0][0]}
        vector_df = self.fv.get_feature_vector(serving_key, return_type="pandas")
        features = vector_df.drop(["date_time"], axis=1).values
        prediction = self.model.predict(features.reshape(1, -1))
        # NumPy arrays are not JSON serializable, so hand back a list
        return prediction.tolist()

In [None]:
# Get the dataset API for the current project
dataset_api = project.get_dataset_api()

# Local path of the predictor script written by the %%writefile cell above
local_script_path = "predict_example.py"

# Upload the script to "Models", overwriting any existing copy
uploaded_file_path = dataset_api.upload(local_script_path, "Models", overwrite=True)

# Build the script path under /Projects/<project name>/ that is passed to the deployment
predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path)

### Create the deployment

In [None]:
# Create a deployment for the registered fraud model, served by the uploaded predictor script
deployment = fraud_model.deploy(
    name="fraudonlinemodeldeployment",  # Specify a name for the deployment
    script_file=predictor_script_path,  # Provide the path to the Python script for prediction
)

In [None]:
# Show which deployment was created
print(f"Deployment: {deployment.name}")

# Display information about the deployment
deployment.describe()

In [None]:
# Pause for 45 seconds before moving on to the next cell
print("Deployment is warming up...")
time.sleep(45)

#### The deployment has now been registered. However, to start it you need to run the following command:

In [None]:
# Start the deployment, waiting up to 300 seconds for it to reach the running state
deployment.start(await_running=300)

In [None]:
# Fetch the deployment's current state and display its description
deployment.get_state().describe()

In [None]:
# For troubleshooting, fetch the logs of the deployment's 'predictor' component with `get_logs()`
deployment.get_logs(component='predictor')

### Stop Deployment

In [None]:
# Stop the deployment, waiting up to 180 seconds for it to reach the stopped state
deployment.stop(await_stopped=180)