# Creating a Training Set

This example notebook registers the dataset that was used to train the model as a Training Set in your Domino Project.

Once a dataset is registered, Domino Model Monitoring can use it as a reference for data drift detection.

In [47]:
from domino.training_sets import TrainingSetClient, model
import pandas as pd
import numpy as np
import os
import random

In [48]:
# Column names for the raw data file; they are supplied here via `names=`.
cols_car = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
            'acceleration', 'model year', 'origin']

# Read the training data file using pandas.
# "?" is parsed as a missing value, and anything after a tab character on a
# line is treated as a comment and not parsed.
df = pd.read_csv('./data/data.csv', names=cols_car, na_values="?",
                 comment='\t',
                 sep=" ",
                 skipinitialspace=True)

# Subset of feature names that will be used in Model Monitoring.
# You may not need to monitor every single column in the original dataset for drift.
df = df[["mpg", "displacement", "horsepower", "weight", "origin"]]

# Expose the row index as an explicit 'customer_id' column (used later as the key column).
df = df.reset_index().rename(columns={'index': 'customer_id'})

# Make a copy of the dataframe
data = df.copy()

# Impute missing horsepower values with the median (due to presence of outliers).
# The column is addressed by name rather than by position so that editing the
# feature list above cannot silently redirect the imputation to another column.
median = data["horsepower"].median()
data["horsepower"] = data["horsepower"].fillna(median)
data.info()

# NOTE: training_df is the same object as `data`, so the cast below is visible through both names.
training_df = data

training_df['origin'] = training_df['origin'].astype(float)

training_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   customer_id   398 non-null    int64  
 1   mpg           398 non-null    float64
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    float64
 4   weight        398 non-null    float64
 5   origin        398 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 18.8 KB


Unnamed: 0,customer_id,mpg,displacement,horsepower,weight,origin
0,0,18.0,307.0,130.0,3504.0,1.0
1,1,15.0,350.0,165.0,3693.0,1.0
2,2,18.0,318.0,150.0,3436.0,1.0
3,3,16.0,304.0,150.0,3433.0,1.0
4,4,17.0,302.0,140.0,3449.0,1.0


In [49]:
# The training set name embeds the project owner and project ID read from the environment.
training_set_name = "auto-prediction-training-data-{}-{}".format(
    os.environ['DOMINO_PROJECT_OWNER'],
    os.environ['DOMINO_PROJECT_ID'],
)

# No columns are flagged as categorical, timestamp, or ordinal.
monitoring_meta = model.MonitoringMeta(
    categorical_columns=[],
    timestamp_columns=[],
    ordinal_columns=[],
)

# Register the prepared dataframe as a new version of the training set:
# 'customer_id' is the key column and 'mpg' is the target column.
tsv = TrainingSetClient.create_training_set_version(
    training_set_name=training_set_name,
    df=training_df,
    key_columns=['customer_id'],
    target_columns=["mpg"],
    exclude_columns=[],
    meta={"meta_data": "1"},
    monitoring_meta=monitoring_meta,
    project_name=os.environ['DOMINO_PROJECT_NAME'],
)

print(f"TrainingSetVersion {tsv.training_set_name}:{tsv.number}")

TrainingSetVersion auto-prediction-training-data-dave_heinicke-64d53362612d3a70a8060b0a:3


In [50]:
# Inspect the registered TrainingSet version: its column names, then the
# dtypes of the data loaded back as a pandas DataFrame.
registered_columns = tsv.all_columns
print("Columns: ", registered_columns)

registered_dtypes = tsv.load_raw_pandas().dtypes
print("\nData Types:\n\n", registered_dtypes)

Columns:  ['customer_id', 'mpg', 'displacement', 'horsepower', 'weight', 'origin']

Data Types:

 customer_id       int64
mpg             float64
displacement    float64
horsepower      float64
weight          float64
origin          float64
dtype: object


In [54]:
# Review the existing TrainingSet versions in your Project.
# The name is built from the project owner and project ID read from the environment.
training_set_name = "auto-prediction-training-data-{}-{}".format(
    os.environ['DOMINO_PROJECT_OWNER'],
    os.environ['DOMINO_PROJECT_ID'],
)

# NOTE(review): `meta` presumably restricts the listing to versions whose
# metadata matches - confirm against the Domino Training Sets API.
versions = TrainingSetClient.list_training_set_versions(
    training_set_name=training_set_name,
    meta={"meta_data": "1"},
)

print(versions)

[TrainingSetVersion(training_set_name='auto-prediction-training-data-dave_heinicke-64d53362612d3a70a8060b0a', number=1, description=<training_set_api_client.types.Unset object at 0x7fd788c5bfd0>, key_columns=['customer_id'], target_columns=['mpg'], exclude_columns=[], all_columns=['customer_id', 'mpg', 'displacement', 'horsepower', 'weight', 'origin'], monitoring_meta=MonitoringMeta(timestamp_columns=[], categorical_columns=[], ordinal_columns=[]), meta={'meta_data': '1'}, path='/trainingset/64d53362612d3a70a8060b0a/64d55dcc612d3a70a8060bed/64d55dcc612d3a70a8060bee', container_path='64d55dcc612d3a70a8060bed/64d55dcc612d3a70a8060bee', pending=False), TrainingSetVersion(training_set_name='auto-prediction-training-data-dave_heinicke-64d53362612d3a70a8060b0a', number=2, description=<training_set_api_client.types.Unset object at 0x7fd788c5bfd0>, key_columns=['customer_id'], target_columns=['mpg'], exclude_columns=[], all_columns=['customer_id', 'mpg', 'displacement', 'horsepower', 'weight',