# Quickstart for Classification Models

This notebook provides a quick introduction to documenting a model using the ValidMind developer framework. We will use sample datasets provided by the library and train a simple classification model.

## Initialize ValidMind

In [1]:
# Load the dotenv IPython extension and read variables from the local `.env` file.
# NOTE(review): presumably `.env` supplies the ValidMind API credentials that
# `vm.init()` relies on — confirm.
%load_ext dotenv
%dotenv .env
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import validmind as vm
import xgboost as xgb

# Connect this notebook session to the ValidMind tracking API for the demo
# project; the host points at a locally running ValidMind instance.
vm.init(
    api_host="http://localhost:3000/api/v1/tracking",
    project="clhqvlen0005b0i8h6plie0yo",
)

Connected to ValidMind. Project: [Demo] 2 (clhqvlen0005b0i8h6plie0yo)


## Load the Demo Dataset

In [2]:
# Use the Taiwan credit demo dataset. To run the notebook on the customer churn
# dataset instead, comment out this import and enable the alternative below:
from validmind.datasets.classification import taiwan_credit as demo_dataset
# from validmind.datasets.classification import customer_churn as demo_dataset

# Load the selected demo dataset; `df` is reused by the cells that follow.
df = demo_dataset.load_data()

In [3]:
# Wrap the loaded data in a ValidMind dataset object, taking the target column
# and class labels from the demo dataset module.
vm_dataset = vm.init_dataset(
    dataset=df,
    target_column=demo_dataset.target_column,
    class_labels=demo_dataset.class_labels
)

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


## Run the Data Validation Test Suite

In [4]:
# Run the "tabular_dataset" test suite against the initialized dataset and keep
# the returned suite object for later inspection.
tabular_suite = vm.run_test_suite("tabular_dataset", dataset=vm_dataset)

HBox(children=(Label(value='Running test suite...'), IntProgress(value=0, max=24)))

VBox(children=(HTML(value='<h2>Test Suite Results: <i style="color: #DE257E">Tabular Dataset</i></h2><hr>'), H…

In [5]:
import plotly.figure_factory as ff

In [6]:
# Pairwise correlations between the numeric columns of the loaded dataset.
corr_matrix = df.corr(numeric_only=True)

In [7]:
# Two-stop colour scale running from white (0) to the pink #DE257E (1).
# NOTE(review): in the visible cells it is only referenced by the disabled
# figure_factory heatmap below — confirm whether it is still needed.
colorscale = [[0, '#FFFFFF'], [1, '#DE257E']]

# Disabled alternative: annotated correlation heatmap built with
# plotly.figure_factory (`ff`). Kept for reference only; it is not executed.
# fig = ff.create_annotated_heatmap(
#     z=corr_matrix.values,
#     x=list(corr_matrix.columns),
#     y=list(corr_matrix.index),
#     annotation_text=corr_matrix.round(2).values,
#     colorscale=colorscale,
#     showscale=True)

# fig.update_layout(title_text='Correlation Matrix',
#                   title_x=0.5,
#                   titlefont=dict(size=24))

In [8]:
import plotly.graph_objects as go

# Pairwise correlation of the numeric columns only. `numeric_only=True` keeps
# this consistent with the earlier correlation cell and avoids the error that
# pandas >= 2.0 raises when `corr()` encounters a non-numeric column.
corr_matrix = df.corr(numeric_only=True)

# Heatmap trace: one cell per (row, column) pair of the correlation matrix.
heatmap = go.Heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
)

# One text annotation per cell showing the coefficient rounded to two decimals.
annotations = [
    go.layout.Annotation(
        text=str(round(value, 2)),
        x=corr_matrix.columns[j],
        y=corr_matrix.index[i],
        showarrow=False,
        font=dict(color='white'),
    )
    for i, row in enumerate(corr_matrix.values)
    for j, value in enumerate(row)
]

# Square 1024x1024 figure with column labels on top and a centred title.
# `title_font` replaces the deprecated `titlefont` layout attribute.
layout = go.Layout(
    annotations=annotations,
    xaxis=dict(side='top'),
    yaxis=dict(scaleanchor="x", scaleratio=1),  # lock the y scale to x so cells are square
    autosize=False,
    width=1024,
    height=1024,
    title_text='Correlation Matrix',
    title_x=0.5,
    title_font=dict(size=24),
)

# Assemble and display the figure; `fig` is reused by the export cell.
fig = go.Figure(data=[heatmap], layout=layout)

fig.show()

In [9]:
# Export the correlation heatmap to the working directory in three formats.
# NOTE(review): Plotly static image export presumably needs an export engine
# such as kaleido installed in the kernel environment — confirm.
for image_path in (
    "correlation_matrix.svg",
    "correlation_matrix.jpg",
    "correlation_matrix.png",
):
    fig.write_image(image_path)

## Run the Model Validation Test Suite

We will need to preprocess the dataset and produce the training, test and validation splits first.

### Preprocess the Raw Dataset

In [None]:
# Produce the training, validation and test splits using the demo dataset
# module's own preprocessing routine.
train_df, validation_df, test_df = demo_dataset.preprocess(df)

In [None]:
# Separate the features from the target for the training and validation splits.
target_col = demo_dataset.target_column
x_train, y_train = train_df.drop(target_col, axis=1), train_df[target_col]
x_val, y_val = validation_df.drop(target_col, axis=1), validation_df[target_col]

# Configure the classifier in one place: early stopping after 10 rounds, with
# error, logloss and AUC tracked as evaluation metrics.
model = xgb.XGBClassifier(
    early_stopping_rounds=10,
    eval_metric=["error", "logloss", "auc"],
)

# Fit on the training split; the validation split is the evaluation set.
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False)

In [None]:
# Both splits are registered with the same dataset type and target column, so
# the shared keyword arguments are defined once.
split_kwargs = dict(type="generic", target_column=demo_dataset.target_column)

vm_train_ds = vm.init_dataset(dataset=train_df, **split_kwargs)
vm_test_ds = vm.init_dataset(dataset=test_df, **split_kwargs)

# Register the trained classifier together with its train and test datasets.
vm_model = vm.init_model(model, train_ds=vm_train_ds, test_ds=vm_test_ds)

### Run the Binary Classification Test Suite

In [None]:
# Run the "binary_classifier_model_validation" test suite against the
# registered model and keep the returned suite object.
model_suite = vm.run_test_suite("binary_classifier_model_validation", model=vm_model)