In [1]:
import domino
from domino.training_sets import TrainingSetClient, model
import pandas as pd
import os

In [2]:
# Locate the wine-quality CSV inside the dataset folder named after the
# current Domino project, then load it and preview the first rows.
project_name = os.environ.get('DOMINO_PROJECT_NAME')
path = '/domino/datasets/local/{}/WineQualityData.csv'.format(project_name)
training_df = pd.read_csv(path)
training_df.head()

Unnamed: 0,id,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red
0,0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,5.58,0
1,1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,5.04,0
2,2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,5.34,0
3,3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,4.92,0
4,4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,5.16,0


In [3]:
# Feature selection: keep the `id` key, the target `quality`, and every other
# numeric column whose Pearson correlation with `quality` is strong enough.

# Minimum absolute Pearson correlation with `quality` for a feature to be kept.
MIN_ABS_CORR = 0.08

# Correlate only numeric/boolean columns: the frame still contains the string
# column `type`, and newer pandas releases no longer silently skip non-numeric
# columns in DataFrame.corr().
numeric_df = training_df.select_dtypes(include=['number', 'bool'])

# Pearson correlation of each candidate with quality, sorted ascending.
# The target itself and the `id` key are not candidates; `id` is added back
# explicitly below, so leaving it here could select it twice.
corr_values = (
    numeric_df.corr()['quality']
    .sort_values()
    .drop(['quality', 'id'], errors='ignore')
)

# Keep every variable whose absolute correlation exceeds the threshold.
important_feats = corr_values[corr_values.abs() > MIN_ABS_CORR]

# Drop rows with a missing value in any column. This runs before the column
# selection, so a gap in a column that is not kept still removes the row.
training_df = training_df.dropna(how='any', axis=0)

# Put the pieces together (key, selected features, target) and rename
# `volatile acidity` to `volatile_acidity`. rename() returns a new frame, so
# the sliced frame is not mutated in place.
training_df = (
    training_df[['id'] + list(important_feats.index) + ['quality']]
    .rename(columns={'volatile acidity': 'volatile_acidity'})
)

In [4]:
# Display the prepared frame: `id`, the selected features, and `quality`.
training_df

Unnamed: 0,id,density,volatile_acidity,chlorides,is_red,alcohol,quality
0,0,1.00100,0.270,0.045,0,8.8,5.58
1,1,0.99400,0.300,0.049,0,9.5,5.04
2,2,0.99510,0.280,0.050,0,10.1,5.34
3,3,0.99560,0.230,0.058,0,9.9,4.92
4,4,0.99560,0.230,0.058,0,9.9,5.16
...,...,...,...,...,...,...,...
6491,6491,0.99651,0.620,0.068,1,9.5,5.10
6492,6492,0.99490,0.600,0.090,1,10.5,5.60
6494,6494,0.99574,0.510,0.076,1,11.0,6.18
6495,6495,0.99547,0.645,0.075,1,10.2,5.65


In [6]:
# Show which Domino user owns the current project (None when the variable is unset).
os.getenv('DOMINO_PROJECT_OWNER')

'integration-test'

In [7]:
# Register the prepared frame as a new version of the training set, keyed by
# `id` with `quality` as the target column.

# Read the project coordinates up front. Indexing os.environ raises a KeyError
# naming the missing variable, instead of the TypeError that `None + str`
# produced when either variable was unset.
project_owner = os.environ['DOMINO_PROJECT_OWNER']
project_name = os.environ['DOMINO_PROJECT_NAME']

tsv = TrainingSetClient.create_training_set_version(
    training_set_name="winequality-training-",
    df=training_df,
    key_columns=["id"],
    target_columns=["quality"],
    exclude_columns=[],
    meta={"elliotts_meta_data": "0.1"},
    monitoring_meta=model.MonitoringMeta(
        categorical_columns=[],
        timestamp_columns=[],
        ordinal_columns=[],
    ),
    # Owner and project name are joined with "/" rather than concatenated
    # with no separator, which produced an ambiguous identifier.
    # NOTE(review): "<owner>/<name>" is the expected project_name format as
    # far as the Domino training-sets docs indicate - confirm against the
    # installed python-domino version.
    project_name=f"{project_owner}/{project_name}",
)

print(f"TrainingSetVersion {tsv.training_set_name}:{tsv.number}")

TrainingSetVersion winequality-training-:1


In [24]:
# List the columns recorded on the training set version created above.
tsv.all_columns

['id',
 'density',
 'volatile_acidity',
 'chlorides',
 'is_red',
 'alcohol',
 'quality']