In [1]:
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error

def rmse(y_true,y_pred):
    return mean_squared_error(y_true,y_pred,squared=False)

# Experimentation cycle

### Load

In [107]:
diabetes = load_diabetes(as_frame=True)
X = diabetes.data
y = diabetes.target

X_train = X.iloc[:300]
X_test = X.iloc[300:]

y_train = y.iloc[:300]
y_test = y.iloc[300:]


### Preprocess

In [108]:
def preprocess_data(df_input,scaler):
    df = df_input.copy()
    df.loc[(df_input['sex'] == -0.044642),['sex']] = 1
    df.loc[df_input['sex'] != -0.044642,['sex']] = 0
    df = scaler.transform(df)    
    return df

scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_features = preprocess_data(X_train,scaler=scaler)
X_test_features = preprocess_data(X_test,scaler=scaler)

### Train

In [109]:
model = LinearRegression()
model.fit(X_train_features,y_train)

LinearRegression()

### Predict

In [110]:
y_pred = model.predict(X_test_features)

### Evaluate

In [111]:
mean_squared_error(y_pred=y_pred,y_true=y_test,squared=False)

53.531572293853515

# Putting code into production format

## Defining Contract Classes

In [2]:
from abc import ABC, abstractmethod
from typing import Callable,Union

class DataProcessor(ABC):

    @abstractmethod
    def load_data(self):
        '''Method to load the data into memory from a given path. Data should be stored in this class using the data property.'''
        pass

    @abstractmethod
    def preprocess(self,data:pd.DataFrame)-> pd.DataFrame:
        '''Method to preprocess the data. Data should be stored in this class using the data property.'''
        pass

    @property
    def raw_data(self):
        return self._raw_data

    @raw_data.setter
    def raw_data(self,raw_data):
        self._raw_data = raw_data

    @property
    def processed_data(self):
        return self._processed_data

    @processed_data.setter
    def processed_data(self,processed_data):
        self._processed_data = processed_data

class FeatureGenerator(ABC):

    @abstractmethod
    def generate_features(self,data:pd.DataFrame)-> pd.DataFrame:
        '''Method to generate feature data. Data should be stored in this class using the data property.'''
        pass

    @abstractmethod
    def generate_target(self,data:pd.DataFrame):
        pass

    @property
    def features(self):
        return self._features
    
    @features.setter
    def features(self,features):
        self._features = features
        
    @property
    def target(self):
        return self._target
    
    @target.setter
    def target(self,target):
        self._target = target

class Model(ABC):

    @abstractmethod
    def fit(self,X,y):
        '''Method to train the machine learning model.'''
        pass

    @abstractmethod
    def predict(self,X):
        '''Method to make predictions with the machine learning model.'''
        pass
    
    @property
    def parameters(self):
        return self._parameters

    @parameters.setter
    def parameters(self,params):
        self._parameters = params
    
    @property
    def model(self):
        return self._model

    @model.setter
    def model(self,model):
        self._model = model

class ModelEvaluator:

    def __init__(self,model):
        self.model = model

    def evaluate(self,X,y,metrics:Union[Callable,list[Callable]]):

        y_pred = self.model.predict(X) #Improvement: some metrics might not work with prediction, but with prediction_proba.

        metric_values = {}

        for metric in metrics:
            metric_name = metric.__name__
            metric_values[metric_name] = metric(y_pred,y) 
        self.metrics = metric_values
        
        return self.metrics
        
    @property
    def metrics(self):
        return self._metrics

    @metrics.setter
    def metrics(self,metrics:dict):
        self._metrics = metrics

## Transforming experimentation into production code

In [3]:
from sklearn.model_selection import train_test_split

class DiabetesDataProcessor(DataProcessor):
    
    def __init__(self,loader) -> None:
        self._loader = loader
    
    def load_data(self) -> pd.DataFrame:
        diabetes = self._loader(as_frame=True)
        df = diabetes.data
        df['target'] = diabetes.target
        self.data = df
        return df

    def preprocess(self,df:pd.DataFrame) -> pd.DataFrame:
        df = self._encode_sex_feature(df)
        self.data = df
        return df

    def _encode_sex_feature(self,df:pd.DataFrame):
        df_encoded = df.copy()
        df_encoded.loc[(df['sex'] == -0.044642),['sex']] = 1
        df_encoded.loc[df['sex'] != -0.044642,['sex']] = 0
        return df_encoded

class DiabetesFeatureGenerator(FeatureGenerator):
    
    def generate_features(self, data: pd.DataFrame) -> pd.DataFrame:
        if 'target' in data.columns:
            self.features = data.drop(columns=['target'])
        else:
            self.features = data
        
    def generate_target(self, data: pd.DataFrame) -> pd.DataFrame:
        try:
            self.target = data['target']
        except Exception as e:
            print(e)

    def train_test_split(self) -> tuple[pd.DataFrame]:
        X_train,X_test,y_train,y_test = train_test_split(self.features,self.target,train_size=0.8)

        return X_train,X_test,y_train,y_test

class DiabetesModel(Model):
    
    def __init__(self, model_class, scaler, model_params) -> None:
        self.model_class = model_class
        self.scaler = scaler
        self.parameters = model_params

    def fit(self,X:pd.DataFrame,y:pd.DataFrame):
        self.scaler.fit(X)
        X_norm = self.scaler.transform(X)
        self.model = self.model_class(**self.parameters)
        self.model.fit(X_norm,y)
        
    def predict(self, X):
        X_norm = self.scaler.transform(X)
        return self.model.predict(X_norm)

### Training

In [193]:
data_processor = DiabetesDataProcessor(loader=load_diabetes)
raw_data = data_processor.load_data()
processed_data = data_processor.preprocess(df=raw_data)

feature_generator = DiabetesFeatureGenerator()
feature_generator.generate_features(processed_data)
feature_generator.generate_target(processed_data)
X_train,X_test,y_train,y_test = feature_generator.train_test_split()

model = DiabetesModel(model_class=LinearRegression,scaler=MinMaxScaler(),model_params={'fit_intercept':True})
model.fit(X_train,y_train)

evaluator = ModelEvaluator(model=model)
results_train = evaluator.evaluate(X=X_train,y=y_train,metrics=[rmse,mean_squared_error,mean_absolute_error])
results = evaluator.evaluate(X=X_test,y=y_test,metrics=[rmse,mean_squared_error,mean_absolute_error])

print("Train results:",results_train)
print("\nTest results:",results)

Train results: {'rmse': 53.099240188856264, 'mean_squared_error': 2819.5293086338484, 'mean_absolute_error': 43.310675381138445}

Test results: {'rmse': 60.17943582802767, 'mean_squared_error': 3621.5644965797005, 'mean_absolute_error': 50.0644484106903}


### Inference

In [None]:
data_processor = DiabetesDataProcessor(loader=load_diabetes)
raw_data = data_processor.load_data()
processed_data = data_processor.preprocess(df=raw_data)

feature_generator = DiabetesFeatureGenerator()
feature_generator.generate_features(processed_data)

y_pred = model.predict(X=feature_generator.features)
y_pred[:10]

array([218.97021087,  62.52911359, 190.21717535, 152.73986456,
       117.66767049, 102.84502549,  90.57947603, 128.04868383,
       171.77995468, 205.53512636])

In [None]:

# Inputs do Matheus:
# Requisito: MLFlow expects the folowing: model (sklearn models for now), metrics (dict), parameters(dict), tags (dict) optional.
# Resolucão do problema do sklearn: I can inherit the Model class from the sklearn BaseEstimator.
# Sugestão: Params is an yaml.
# Sugestão: herdar a classe workflow e criar um MachineLearningWorkflow para treinar ou para orquestrar o pipeline do experimento (feature generation + model training)

## Train - Using Workflow and Tasks paradigms

In [4]:
from cd4ml.task import Task
from cd4ml.workflow import Workflow

In [None]:
data_processor = DiabetesDataProcessor(loader=load_diabetes)
raw_data = data_processor.load_data()
processed_data = data_processor.preprocess(df=raw_data)

feature_generator = DiabetesFeatureGenerator()
feature_generator.generate_features(processed_data)
feature_generator.generate_target(processed_data)
X_train,X_test,y_train,y_test = feature_generator.train_test_split()

model = DiabetesModel(model_class=LinearRegression,scaler=MinMaxScaler(),model_params={'fit_intercept':True})
model.fit(X_train,y_train)

evaluator = ModelEvaluator(model=model)
results_train = evaluator.evaluate(X=X_train,y=y_train,metrics=[rmse,mean_squared_error,mean_absolute_error])
results = evaluator.evaluate(X=X_test,y=y_test,metrics=[rmse,mean_squared_error,mean_absolute_error])

print("Train results:",results_train)
print("\nTest results:",results)

In [13]:
def dummy_function():#TODO: How to make this callable task run a method of a class?
    pass
load_data = Task(name='load_data', task=dummy_function)
preprocess_data = Task(name='preprocess_data', task=dummy_function)
generate_features = Task(name='generate_features', task=dummy_function)
generate_target = Task(name='generate_target', task=dummy_function)
train_test_split = Task(name='train_test_split', task=dummy_function)
train_model = Task(name='train_model', task=dummy_function)
evaluate_model_on_train = Task(name='evaluate_model_on_train', task=dummy_function)
evaluate_model_on_test = Task(name='evaluate_model_on_test', task=dummy_function)


w = Workflow()
w.add_task(load_data)
w.add_task(preprocess_data,dependency=['load_data'])
w.add_task(generate_features,dependency=['preprocess_data'])
w.add_task(generate_target,dependency=['preprocess_data'])
w.add_task(train_test_split,dependency=['generate_features','generate_target']) #Question: There is a dependency but no input parameters is passed. How does it work?
w.add_task(train_model,dependency=['train_test_split']) #Similar case here, but only a portion of inputs parameters is passed. X_train and y_train.
w.add_task(evaluate_model_on_train,dependency=['train_model'])
w.add_task(evaluate_model_on_test,dependency=['train_model'])


In [8]:
run_config = {
    # "download_folha": {
    #     'params': {'url': "https://feeds.folha.uol.com.br/emcimadahora/rss091.xml"},
    #     'output': 'download_folha'
    # },
    # "download_g1": {
    #     'params': {'url': "https://g1.globo.com/rss/g1/"},
    #     'output': 'download_g1'
    # },
    # "download_g1_brasil": {
    #     'params': {'url': "https://g1.globo.com/rss/g1/brasil"},
    #     'output': 'download_g1_brasil'
    # },
    # "feeds_aggregate": {
    #     'params': None,
    #     'output': 'feeds_aggregate'
    # },
    # "preprocess": {
    #     'params': None,
    #     'output': 'preprocess'
    # }
}

output = w.run(run_config=run_config, executor='local')