## Initialize Fiddler Client


In [1]:
import fiddler as fdl

client = fdl.FiddlerApi(
    url='http://host.docker.internal:4100', 
    org_id='onebox', 
    auth_token='Sxcg0eCz8aJKUXnrAMOB6EQ8FUuS-hgZvqlQXg-0R6c'
)


## Load dataset


In [3]:
import pandas as pd
df = pd.read_csv('/app/fiddler_samples/samples/datasets/winequality/train.csv')
df_schema = fdl.DatasetInfo.from_dataframe(df, max_inferred_cardinality=1000)


In [4]:
df_schema

DatasetInfo:
  display_name: 
  files: []
  columns:
                      column    dtype count(possible_values) is_nullable  \
    0                 row_id  INTEGER                              False   
    1          fixed acidity    FLOAT                              False   
    2       volatile acidity    FLOAT                              False   
    3            citric acid    FLOAT                              False   
    4         residual sugar    FLOAT                              False   
    5              chlorides    FLOAT                              False   
    6    free sulfur dioxide    FLOAT                              False   
    7   total sulfur dioxide    FLOAT                              False   
    8                density    FLOAT                              False   
    9                     pH    FLOAT                              False   
    10             sulphates    FLOAT                              False   
    11               alcohol    FLO

In [5]:
upload_result = client.upload_dataset(
    dataset={'train': df}, 
    dataset_id='wine_quality')
upload_result

Heads up! We are inferring the details of your dataset from the dataframe(s) provided. Please take a second to check our work.

If the following DatasetInfo is an incorrect representation of your data, you can construct a DatasetInfo with the DatasetInfo.from_dataframe() method and modify that object to reflect the correct details of your dataset.

After constructing a corrected DatasetInfo, please re-upload your dataset with that DatasetInfo object explicitly passed via the `info` parameter of FiddlerApi.upload_dataset().

You may need to delete the initially uploaded versionvia FiddlerApi.delete_dataset('wine_quality').

Inferred DatasetInfo to check:
  DatasetInfo:
    display_name: 
    files: []
    columns:
                        column    dtype count(possible_values) is_nullable  \
      0                 row_id  INTEGER                              False   
      1          fixed acidity    FLOAT                              False   
      2       volatile acidity    FLOAT    

{'row_count': 1119,
 'col_count': 13,
 'log': ['Importing dataset wine_quality',
  'Creating table for wine_quality',
  'Importing data file: train.csv']}

In [7]:
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid',
                   'residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide',
                   'density','pH','sulphates','alcohol']
target = 'quality'
model_info = fdl.ModelInfo.from_dataset_info(
    dataset_info=client.get_dataset_info('wine_quality'),
    target=target, 
    features=feature_columns,
    display_name='sklearn model',
    description='this is a sklearn model from tutorial'
)
model_info

ModelInfo:
  display_name: sklearn model
  description: this is a sklearn model from tutorial
  input_type: ModelInputType.TABULAR
  model_task: ModelTask.REGRESSION
  inputs:
                      column  dtype count(possible_values) is_nullable  \
    0          fixed acidity  FLOAT                              False   
    1       volatile acidity  FLOAT                              False   
    2            citric acid  FLOAT                              False   
    3         residual sugar  FLOAT                              False   
    4              chlorides  FLOAT                              False   
    5    free sulfur dioxide  FLOAT                              False   
    6   total sulfur dioxide  FLOAT                              False   
    7                density  FLOAT                              False   
    8                     pH  FLOAT                              False   
    9              sulphates  FLOAT                              False   
    10    

In [15]:
import sklearn.linear_model
import sklearn.pipeline
import sklearn.preprocessing

train_input = df.drop(columns=['row_id', 'quality'])
train_target = df['quality']

regressor = sklearn.linear_model.LinearRegression()

full_model = sklearn.pipeline.Pipeline(steps=[
        ('standard_scaling', sklearn.preprocessing.StandardScaler()),
        ('model_name', regressor),
    ])

full_model.fit(train_input, train_target)
full_model.predict(train_input)


array([6.50506782, 6.65427237, 5.35208865, ..., 5.27678525, 6.36560789,
       4.44488096])

In [34]:
import pathlib
import shutil
import pickle
import yaml

model_dir = pathlib.Path('wine_quality_model')
shutil.rmtree(model_dir, ignore_errors=True)
model_dir.mkdir()

with open(model_dir / 'model.pkl', 'wb') as pkl_file:
    pickle.dump(full_model, pkl_file)

with open(model_dir / 'model.yaml', 'w') as yaml_file:
    yaml.dump({'model': model_info.to_dict()}, yaml_file)


In [36]:
%%writefile wine_quality_model/package.py

import pickle
from pathlib import Path
import pandas as pd

PACKAGE_PATH = Path(__file__).parent

class SklearnModelPackage:
    is_classifier = False
    output_columns = ['predicted_quality']

    def __init__(self):
        with open(PACKAGE_PATH / 'model.pkl', 'rb') as infile:
            self.model = pickle.load(infile)

    def predict(self, input_df):
        f = self.model.predict if not self.is_classifier else self.model.predict_proba
        return pd.DataFrame(f(input_df), columns=self.output_columns)
    
def get_model():
    return SklearnModelPackage()



Writing wine_quality_model/package.py


In [37]:
project_id = 'wine_quality'
model_id = 'linear_regression'
client.upload_model_package(model_dir, project_id, model_id)