# Creating a Training Set

This example notebook takes the dataset that was used to train the model and registers it as a training set in your Domino Project.

Once a dataset is registered, Domino Model Monitoring can use it as a reference for data drift detection.

In [1]:
import domino
from domino.training_sets import TrainingSetClient, model
import pandas as pd
import os

In [9]:
# Location of the CSV file holding the model's original training dataset
path = '/mnt/code/Test&TrainData/churnTrainingData.csv'

# Read the CSV into a DataFrame, then preview its first five rows
training_df = pd.read_csv(filepath_or_buffer=path)
training_df.head(n=5)

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
0,844336,0.016364,550,28,89.2,45,0
1,146041,0.018349,545,33,54.2,43,0
2,847745,0.018519,378,41,55.3,41,0
3,285565,0.014493,552,32,66.8,31,0
4,754611,0.012132,577,4,87.2,43,0


In [22]:
# Subset of feature names that will be used in Model Monitoring.
# You may not need to monitor every single column in the original dataset for drift.

feature_names = [
    "custid",
    "dropperc",
    "mins",
    "consecmonths",
    "income",
    "age",
    "churn_Y",
]

# Keep only the listed columns; .copy() makes the subset independent of
# training_df so that later column assignments do not touch the original frame.
training_df_subset = training_df.loc[:, feature_names].copy()

In [11]:
# Overwrite the custid column with (row index + 1). The frame was read by
# pd.read_csv without an index column, so this yields 1, 2, 3, ...
# custid is the column passed as the key column when the training set is registered.
# NOTE(review): presumably this swaps the real customer IDs for a synthetic
# sequential key - confirm that discarding the original IDs is intended.
training_df_subset["custid"] = training_df_subset.index + 1

In [12]:
# Preview the first five rows of the subset that will be registered
training_df_subset.head(n=5)

Unnamed: 0,custid,dropperc,mins,consecmonths,income,age,churn_Y
0,1,0.016364,550,28,89.2,45,0
1,2,0.018349,545,33,54.2,43,0
2,3,0.018519,378,41,55.3,41,0
3,4,0.014493,552,32,66.8,31,0
4,5,0.012132,577,4,87.2,43,0


In [13]:
# Register the TrainingSet version using the TrainingSetClient.
#
# The owning project is identified from the DOMINO_PROJECT_OWNER and
# DOMINO_PROJECT_NAME environment variables. Indexing os.environ directly
# (instead of .get) raises a KeyError naming the missing variable, rather than
# an opaque TypeError from concatenating None.
project_owner = os.environ["DOMINO_PROJECT_OWNER"]
project_name = os.environ["DOMINO_PROJECT_NAME"]

# Build the project identifier as "owner/name". The previous code concatenated
# the two values with no separator, which produces a string that is neither
# the owner nor the project name.
# NOTE(review): confirm against the python-domino TrainingSetClient docs that
# "owner/name" is the expected form for project_name.
qualified_project_name = f"{project_owner}/{project_name}"

tsv = TrainingSetClient.create_training_set_version(
    training_set_name="customer-churn-DMM-demo-",
    df=training_df_subset,
    key_columns=["custid"],        # row identifier column
    target_columns=["churn_Y"],    # prediction target column
    exclude_columns=[],
    meta={"DMM_meta_data": "0.1"},
    # No categorical, timestamp or ordinal columns are declared for monitoring.
    monitoring_meta=model.MonitoringMeta(
        categorical_columns=[],
        timestamp_columns=[],
        ordinal_columns=[],
    ),
    project_name=qualified_project_name,
)

# Report the name and version number of the TrainingSetVersion just created
print(f"TrainingSetVersion {tsv.training_set_name}:{tsv.number}")

TrainingSetVersion customer-churn-DMM-demo-:1


In [31]:
# Check the TrainingSet columns and data types

# Column names recorded on the registered TrainingSetVersion
print("Columns: ", tsv.all_columns, sep=" ")

# Load the registered data back as a pandas DataFrame and show each column's dtype
print(
    "\nData Types:\n\n",
    tsv.load_raw_pandas().dtypes,
    sep=" ",
)

Columns:  ['custid', 'dropperc', 'mins', 'consecmonths', 'income', 'age', 'churn_Y']

Data Types:

 custid            int64
dropperc        float64
mins              int64
consecmonths      int64
income          float64
age               int64
churn_Y           int64
dtype: object


In [33]:
# Review existing TrainingSets in your Project

# List the versions of the named training set whose metadata matches the
# tag that was supplied at registration time.
versions = TrainingSetClient.list_training_set_versions(
    meta={"DMM_meta_data": "0.1"},
    training_set_name="customer-churn-DMM-demo-",
)

print(versions)

[TrainingSetVersion(training_set_name='customer-churn-DMM-demo-', number=1, description=<training_set_api_client.types.Unset object at 0x7fd788c5bfd0>, key_columns=['custid'], target_columns=['churn_Y'], exclude_columns=[], all_columns=['custid', 'dropperc', 'mins', 'consecmonths', 'income', 'age', 'churn_Y'], monitoring_meta=MonitoringMeta(timestamp_columns=[], categorical_columns=[], ordinal_columns=[]), meta={'DMM_meta_data': '0.1'}, path='/trainingset/64d53362612d3a70a8060b0a/64d54028b5c13e02c7ddc5c4/64d54028b5c13e02c7ddc5c5', container_path='64d54028b5c13e02c7ddc5c4/64d54028b5c13e02c7ddc5c5', pending=False)]
