In [1]:
import inspect
from pprint import pprint
from IPython.display import display, Markdown

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import get_scorer, confusion_matrix
from sklearn.metrics import f1_score, precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.inspection import permutation_importance

In [2]:
pd.options.display.float_format = '{:.4f}'.format
random_state = 123

In [3]:
def title(title, n=3):
    """Return `title` as a Markdown heading.

    The heading level is ``n % 4`` with a floor of 1, so the result always
    has between one and three leading '#' characters.
    """
    level = max(n % 4, 1)
    hashes = '#' * level
    return Markdown(f'{hashes} {title}')

In [4]:
# filename = 'KaggleV2-May-2016.csv'
filename = 'https://github.com/dm6801/noshow_dataset/raw/master/KaggleV2-May-2016.csv'

In [5]:
# !wc -l "$filename"
# !head -n2 "$filename"

#### Variables' description:

    PatientId - identification of a patient;
    AppointmentID - identification of each appointment;
    Gender - male or female;
    ScheduledDay - day when a patient registered for an appointment;
    AppointmentDay - day of the actual appointment;
    Age - patient age;
    Neighbourhood - where the appointment takes place (hospital location);
    Scholarship - 1 for True, 0 for False. For more details read the article (the link is here: https://bit.ly/3AYv4GF);
    Hipertension - 1 for True, 0 for False;
    Diabetes - 1 for True, 0 for False;
    Alcoholism - 1 for True, 0 for False;
    Handcap - number of disabilities a patient has;
    SMS_received - 1 if one or more messages were sent to the patient;
    No-show - 'No' if the patient showed up to their appointment, 'Yes' if they did not show up.

In [6]:
df = pd.read_csv(filename)

In [7]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 44.3 MB


In [8]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [9]:
# Encode the target as 0 (showed up) / 1 (no-show).  The identity entries keep
# the cell idempotent: re-running it on already-encoded values would otherwise
# turn the whole column into NaN.
df['No-show'] = df['No-show'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1})

In [10]:
# Encode gender as 0 (F) / 1 (M).  The identity entries keep the cell
# idempotent: re-running it on already-encoded values would otherwise turn
# the whole column into NaN.
df['Gender'] = df['Gender'].map({'F': 0, 'M': 1, 0: 0, 1: 1})

In [11]:
# Calendar-day gap between booking and appointment.  Time zone and time of
# day are stripped first so the subtraction stays a vectorised datetime64
# operation instead of arithmetic on Python `date` objects.
days_until_appointment = (
    pd.to_datetime(df['AppointmentDay']).dt.tz_localize(None).dt.normalize()
    - pd.to_datetime(df['ScheduledDay']).dt.tz_localize(None).dt.normalize()
).dt.days

# A negative gap means the appointment is dated before it was scheduled.
bad_dates = days_until_appointment[days_until_appointment < 0]
bad_dates

27033   -1
55226   -1
64175   -1
71533   -6
72362   -1
dtype: int64

In [12]:
# Remove the rows with impossible dates.  errors='ignore' keeps the cell
# re-runnable: once the rows are gone a plain drop would raise KeyError.
df = df.drop(index=bad_dates.index, errors='ignore')

In [13]:
# Separate the target column from the predictor columns.
y = df['No-show']
X = df.drop(columns='No-show')

In [14]:
def cyclicalEncode(x, end, start=0, names=None):
    """Encode a periodic quantity as a (sine, cosine) pair.

    The value is mapped onto the unit circle with
    ``angle = 2 * pi * (x + start) / end``.

    Parameters
    ----------
    x : pandas.Series
        Values to encode (e.g. month, weekday, hour).
    end : number
        Length of one full period (12 for months, 7 for weekdays, ...).
    start : number, default 0
        Offset added to `x` before scaling.
    names : dict, optional
        Optional new Series names under the keys 'sin' and / or 'cos'.
        Components without an entry keep the name inherited from `x`.

    Returns
    -------
    tuple
        ``(sin, cos)`` with the same index as `x`.
    """
    angle = 2 * np.pi * (x + float(start)) / end
    sin, cos = np.sin(angle), np.cos(angle)

    # `names` defaults to None rather than a shared mutable dict, and the
    # rename returns a new Series instead of mutating one in place.
    if names:
        if 'sin' in names:
            sin = sin.rename(names['sin'])
        if 'cos' in names:
            cos = cos.rename(names['cos'])

    return sin, cos

In [15]:
class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that reports the data flowing through it.

    `fit` and `transform` always print one line with the shapes they were
    given; `transform` returns `X` unchanged.

    Parameters
    ----------
    name : str, default ''
        Label used in the printed lines; falls back to the class name.
    verbose : list of str, optional
        When it contains 'transform', `transform` additionally displays the
        data and its `describe()` summary.
    **kwargs
        Only ``col_tf`` is read: an object exposing
        ``get_feature_names_out()`` whose names are used as column labels
        for the displayed frame.
        NOTE(review): because ``col_tf`` arrives through ``**kwargs`` it is
        presumably not reported by ``get_params()``, so a cloned copy of
        this step would have ``col_tf=None`` - confirm.
    """

    def __init__(self, name='', verbose=None, **kwargs):
        self.name = name if name else self.__class__.__name__
        self.tag = f'{self.name:<30}'
        # Stored untouched (no shared mutable default); None means "quiet".
        self.verbose = verbose
        self.col_tf = kwargs.get('col_tf')

    def fit(self, X, y=None):
        """Print the shapes seen at fit time and return self."""
        print(self.tag, 'fit',
              '\t\tX shape:', X.shape,
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)
        return self

    def transform(self, X, y=None):
        """Print (and optionally display) the data, then return `X` as is."""
        print(self.tag, 'transform',
              '\tX shape:', X.shape,
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)

        if 'transform' in (self.verbose or ()):
            # Identity checks throughout: `!= None` on an array of feature
            # names or on an array-like y is evaluated element-wise and
            # cannot be used as a condition.
            columns = (self.col_tf.get_feature_names_out()
                       if self.col_tf is not None else None)

            if not isinstance(X, pd.DataFrame):
                frame = pd.DataFrame(X, columns=columns)
            elif columns is not None:
                frame = X.copy()
                frame.columns = columns
            else:
                frame = X

            target = None
            if y is not None:
                target = y if isinstance(y, pd.Series) else pd.Series(y)

            display(frame)
            if target is not None:
                display(target)

            display(frame.describe().T)
            if target is not None:
                display(target.describe())

            print()

        return X

In [16]:
class FeaturerEngineering(BaseEstimator, TransformerMixin):
    """Add engineered columns to the raw appointment frame.

    Always added by `transform`: 'days_diff', 'scheduled_day',
    'scheduled_hour' and 'appointment_day'.

    Added only when named in `features`:

    * 'neighbourhood' - per-neighbourhood ratio of target==1 to target==0
      rows, learned in `fit`;
    * 'previous_no_show' - the same ratio per 'PatientId';
    * 'previous_appointments' - number of distinct 'AppointmentID' values per
      'PatientId' seen in `fit`;
    * any name ending in 'sin' or 'cos' switches on all cyclical encodings.

    Parameters
    ----------
    name : str, default ''
        Label used in log lines; falls back to the class name.
    features : iterable of str, optional
        Requested feature names. None requests no optional features.
    verbose : list of str, optional
        May contain 'fit' and / or 'transform' to print a line per call.
    """

    def __init__(self, name='', features=None, verbose=None):
        self.name = name if name else self.__class__.__name__
        self.tag = f'{self.name:<30}'
        self.verbose = verbose
        self.features = features

    def _wanted(self):
        """Requested feature names; None is treated as an empty request."""
        return self.features if self.features is not None else ()

    def _log(self, stage):
        """Print a one-line trace for `stage` when it is listed in `verbose`."""
        if stage in (self.verbose or ()):
            print(self.tag, stage)

    @staticmethod
    def _no_show_ratio(X, y, key):
        """Ratio of target==1 to target==0 row counts for each value of `key`."""
        counts = X.join(y).pivot_table(index=key,
                                       columns=y.name,
                                       values='Age',
                                       aggfunc='count')
        # Column-wise division gives the same values as a row-by-row apply.
        return counts[1] / counts[0]

    @staticmethod
    def _calendar_day(timestamps):
        """Strip time zone and time of day, keeping the local calendar date."""
        return timestamps.dt.tz_localize(None).dt.normalize()

    def fit(self, X, y=None):
        """Learn the lookup tables needed by the requested features.

        Raises
        ------
        ValueError
            If a target-based feature is requested but `y` is None.
        """
        self._log('fit')
        wanted = self._wanted()

        target_based = [f for f in ('neighbourhood', 'previous_no_show') if f in wanted]
        if target_based and y is None:
            raise ValueError(f'features {target_based} are computed from the target; '
                             'fit() needs y')

        if 'neighbourhood' in wanted:
            self.neighbourhood_ratio = self._no_show_ratio(X, y, 'Neighbourhood')

        if 'previous_no_show' in wanted:
            # NOTE(review): this is derived from the labels of the very rows
            # being fitted (target leakage), and a key with rows in only one
            # class presumably yields NaN, which transform() turns into 0 -
            # confirm before enabling this feature.
            self.previous_no_show_ratio = self._no_show_ratio(X, y, 'PatientId')

        if 'previous_appointments' in wanted:
            self.previous_appointments = X.groupby('PatientId')['AppointmentID'].nunique()

        return self

    def transform(self, X, y=None):
        """Return `X` joined with the engineered feature columns."""
        self._log('transform')
        wanted = self._wanted()
        features = []

        if 'neighbourhood' in wanted:
            # Neighbourhoods without a learned ratio get the median ratio.
            features.append(X['Neighbourhood']
                            .map(self.neighbourhood_ratio)
                            .fillna(self.neighbourhood_ratio.median())
                            .rename('neighbourhood'))

        if 'previous_no_show' in wanted:
            features.append(X['PatientId']
                            .map(self.previous_no_show_ratio)
                            .fillna(0)
                            .rename('previous_no_show'))

        if 'previous_appointments' in wanted:
            # Patients not seen during fit get 0.
            features.append(X['PatientId']
                            .map(self.previous_appointments)
                            .fillna(0)
                            .rename('previous_appointments'))

        scheduled_datetime = pd.to_datetime(X['ScheduledDay'])
        appointmt_datetime = pd.to_datetime(X['AppointmentDay'])

        # Whole calendar days between booking and appointment, computed on
        # datetime64 values rather than on Python `date` objects.
        features.append((self._calendar_day(appointmt_datetime)
                         - self._calendar_day(scheduled_datetime))
                        .dt.days.rename('days_diff'))

        features.append(scheduled_datetime.dt.dayofweek.rename('scheduled_day'))
        features.append(scheduled_datetime.dt.hour.rename('scheduled_hour'))
        features.append(appointmt_datetime.dt.dayofweek.rename('appointment_day'))

        if any(f.endswith('sin') or f.endswith('cos') for f in wanted):
            cyclical = [
                (scheduled_datetime.dt.month, 12, 'scheduled_month'),
                (scheduled_datetime.dt.dayofweek, 7, 'scheduled_day'),
                (scheduled_datetime.dt.hour, 24, 'scheduled_hour'),
                (appointmt_datetime.dt.month, 12, 'appointmt_month'),
                (appointmt_datetime.dt.dayofweek, 7, 'appointmt_day'),
            ]
            for values, period, prefix in cyclical:
                features.extend(cyclicalEncode(values, period,
                                               names={'sin': f'{prefix}_sin',
                                                      'cos': f'{prefix}_cos'}))

        return X.join(features)

In [17]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show'],
      dtype='object')

In [18]:
# Declarative spec consumed by Predictor.create_pipeline /
# Predictor._create_column_transformer:
#   key   -> 'passthrough' / 'drop', or the name of a transformer class that
#            is looked up in globals() (e.g. 'StandardScaler');
#   value -> list of column names, optionally preceded by a
#            ('kwargs', {...}) tuple with constructor arguments for that
#            transformer.
# The listed names cover both raw columns and columns added by
# FeaturerEngineering; the flattened list is also passed to it as `features`,
# which is what switches 'neighbourhood' and 'previous_appointments' on.
column_transformers = {
    'StandardScaler': [
        'Age',
        'Handcap',
        'neighbourhood',
#         'previous_no_show',
        'previous_appointments',
        'days_diff',
        'scheduled_day',
        'scheduled_hour',
        'appointment_day',
    ],
#     'MinMaxScaler': [

#     ],
#     'OneHotEncoder': [
#         ('kwargs', dict(drop='first')),
#         'Gender'
#     ],
    'passthrough': [
        'Gender',
        'SMS_received',
        'Scholarship',
        'Hipertension',
        'Diabetes',
        'Alcoholism',
#         'scheduled_month_sin',
#         'scheduled_month_cos',
#         'scheduled_day_sin',
#         'scheduled_day_cos',
#         'scheduled_hour_sin',
#         'scheduled_hour_cos',
#         'appointmt_month_sin',
#         'appointmt_month_cos',
#         'appointmt_day_sin', 
#         'appointmt_day_cos',
    ],
}

In [19]:
class Predictor:
    """
    Run one classifier through the notebook's modelling workflow.

    A Predictor owns a model, wraps it in a pipeline
    (feature engineering -> column transformer -> classifier), splits the
    data, optionally cross-validates or tunes hyper-parameters, fits,
    predicts and reports scores.  `execute` runs all of that in one call.

    Parameters
    ----------
    model : estimator
        Classifier placed as the final 'clf' step of the pipeline.
    n_jobs : int, default -1
        Parallelism forwarded to the column transformer, cross validation,
        the hyper-parameter search and permutation importance.
    random_state : int, default 123
        Seed forwarded to the train/test split, the CV splitter, the search
        and permutation importance.  It is not applied to `model`.
    verbose : int, default 0
        0 is silent; any non-zero value adds DebugTransformer steps and
        section titles; 2 additionally dumps the intermediate frames.
    """

    # Fitted-model attributes that already hold one importance per feature.
    _IMPORTANCE_ATTRS = ('feature_importances_', 'coef_')

    # plumbing

    def __init__(self, model, n_jobs=-1, random_state=123, verbose=0):
        self.tag = self.__class__.__name__
        self.verbose = verbose
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.model = model
        self.pipeline = None
        self.X = None
        self.X_train = None
        self.X_test = None
        self.y = None
        self.y_train = None
        self.y_test = None
        # Set by fit_predict(); initialised so scores() can test for None.
        self.y_pred = None
        self.y_proba = None

    def __repr__(self):
        return f'{self.tag}(\n  {self.model}\n)'

    def _title(self, title, n=3):
        """Return `title` as a Markdown heading with 1-3 leading '#'."""
        return Markdown('{} {}'.format('#'*(max(n%4, 1)), title))

    def _print(self, title, *args, **kwargs):
        """Display a heading followed by the given values.

        A single map / dict / list argument is pretty-printed; anything else
        goes through `print`.
        """
        display(self._title(title))
        # `args` itself is always a tuple, so the container check has to look
        # at the one value that was passed.
        if len(args) == 1 and isinstance(args[0], (map, dict, list)):
            pprint(args[0], width=1, sort_dicts=False, **kwargs)
        else:
            print(*args, **kwargs)
        print()

    def _verbose(self, title, func, *args, **kwargs):
        """Call `func` (or print it when it is not callable) under a heading.

        The heading and the printing only happen when `self.verbose` is set;
        a callable `func` is always called and its result returned.
        """
        out = None

        if self.verbose:
            display(self._title(title))

        if callable(func):
            out = func(*args, **kwargs)
        elif self.verbose:
            if isinstance(func, (map, dict, list)):
                pprint(func, width=1, sort_dicts=False)
            else:
                print(func)

        if self.verbose:
            print()

        return out

    def _update_kwargs(self, kwargs, keys=()):
        """Inject this instance's settings into `kwargs` in place.

        'random_state' always overrides; 'n_jobs' is only filled in when the
        caller did not supply one.
        """
        if 'random_state' in keys:
            kwargs['random_state'] = self.random_state

        if 'n_jobs' in keys and 'n_jobs' not in kwargs:
            kwargs['n_jobs'] = self.n_jobs

    # pipeline

    @staticmethod
    def _split_spec(spec):
        """Split one transformer spec into (constructor kwargs, columns).

        The spec is copied, so the caller's list is left untouched and the
        same spec dict can be reused for several Predictors.
        """
        spec = list(spec)
        head = spec[0] if spec else None
        if isinstance(head, tuple) and len(head) == 2 and head[0] == 'kwargs':
            return dict(head[1]), spec[1:]
        return {}, spec

    @classmethod
    def _requested_features(cls, column_transformer):
        """Flatten the column names of a spec dict or a ColumnTransformer."""
        if isinstance(column_transformer, dict):
            groups = [cls._split_spec(spec)[1] for spec in column_transformer.values()]
        else:
            groups = [columns for _, _, columns in column_transformer.transformers]

        names = []
        for columns in groups:
            if isinstance(columns, str):
                names.append(columns)
            elif isinstance(columns, (list, tuple)):
                # Non-string selectors (ints, masks) carry no feature name.
                names.extend(c for c in columns if isinstance(c, str))
        return names

    def _create_column_transformer(self, transformers, remainder='drop'):
        """Build a ColumnTransformer from a ``{class name: columns}`` spec.

        Keys 'passthrough' and 'drop' are used verbatim; any other key is
        resolved to a class through `globals()` and instantiated with the
        optional ``('kwargs', {...})`` entry that may lead the column list.
        """
        pairs = []

        for class_name, spec in transformers.items():
            kwargs, columns = self._split_spec(spec)

            if class_name in ('passthrough', 'drop'):
                transformer = class_name
            else:
                try:
                    transformer_cls = globals()[class_name]
                except KeyError:
                    raise KeyError(f'unknown transformer class {class_name!r}') from None
                transformer = transformer_cls(**kwargs)

            pairs.append((transformer, columns))

        return make_column_transformer(*pairs, remainder=remainder, n_jobs=self.n_jobs)

    def create_pipeline(self, *steps, column_transformer=None):
        """Assemble `self.pipeline`, ending with ``('clf', self.model)``.

        Either pass explicit pipeline `steps`, or a `column_transformer`
        (a spec dict or a ready ColumnTransformer) to get the default
        feature-engineering pipeline.

        Raises
        ------
        ValueError
            If neither `steps` nor `column_transformer` is given.
        """
        if steps:
            # *steps arrives as a tuple, which cannot be appended to.
            steps = list(steps)
        else:
            if column_transformer is None:
                raise ValueError('create_pipeline needs either explicit steps '
                                 'or a column_transformer')

            features = self._requested_features(column_transformer)
            if not isinstance(column_transformer, ColumnTransformer):
                column_transformer = self._create_column_transformer(column_transformer)

            if self.verbose:
                steps = [
                    ('before',
                         DebugTransformer('before')),
                    ('feature_engineering',
                         FeaturerEngineering(features=features,
                                             verbose=['fit', 'transform'] if self.verbose>0 else [])),
                    ('after_feature_eng',
                         DebugTransformer('after FeaturerEngineering',
                                          ['transform'] if self.verbose==2 else [])),
                    ('column_transformer',
                         column_transformer),
                    ('after_column_transformer',
                         DebugTransformer('after ColumnTransformer',
                                          ['transform'] if self.verbose==2 else [],
                                          col_tf=column_transformer)),
                ]
            else:
                steps = [
                    ('feature_engineering', FeaturerEngineering(features=features)),
                    ('column_transformer', column_transformer),
                ]

        steps.append(('clf', self.model))
        self.pipeline = Pipeline(steps)

        self._verbose('pipeline', self.pipeline)

    # data

    def _exists(self, data):
        """True when `data` is a pandas DataFrame or Series."""
        return isinstance(data, (pd.DataFrame, pd.Series))

    @staticmethod
    def _is_empty(data):
        """True for None or for data with zero rows."""
        if data is None:
            return True
        shape = getattr(data, 'shape', None)
        return (shape[0] if shape else len(data)) == 0

    def set_data(self, X, y):
        """Store the full feature frame and target."""
        self.X = X
        self.y = y

    def train_test_split(self, *args, **kwargs):
        """Split into train / test parts, seeded with `self.random_state`.

        Positional arrays are split when given; otherwise the data stored by
        `set_data` is used (never notebook globals).

        Raises
        ------
        ValueError
            If no arrays are passed and `set_data` has not been called.
        """
        self._update_kwargs(kwargs, ['random_state'])

        if not args:
            if self.X is None or self.y is None:
                raise ValueError('no data to split: call set_data(X, y) first')
            args = (self.X, self.y)

        (self.X_train, self.X_test,
         self.y_train, self.y_test) = train_test_split(*args, **kwargs)

    @staticmethod
    def _per_feature(values):
        """Reduce a model's importance attribute to one value per feature.

        1-D input is returned as is; a single-row 2-D array is unwrapped;
        a multi-row array (one row per class) is reduced to the mean
        absolute value per column.
        """
        values = np.asarray(values)
        if values.ndim > 1:
            if values.shape[0] > 1:
                return np.abs(values).mean(axis=0)
            return values[0]
        return values

    def _compute_feature_importance(self, X=None, y=None, scoring=None, repeats=30):
        """Return per-feature importances of the final estimator as a Series.

        Uses `feature_importances_` or `coef_` when the fitted model has one;
        otherwise runs permutation importance of the model on `X`, `y`, where
        `X` must already be transformed by the preprocessing steps.  Without
        data an empty Series is returned.
        """
        model = self.pipeline.steps[-1][1]

        for attr in self._IMPORTANCE_ATTRS:
            if hasattr(model, attr):
                return pd.Series(self._per_feature(getattr(model, attr)), name=attr)

        if self._is_empty(X) or self._is_empty(y):
            return pd.Series(dtype=float, name='permutation_importance')

        result = permutation_importance(model, X, y, scoring=scoring, n_repeats=repeats,
                                        n_jobs=self.n_jobs, random_state=self.random_state)

        return pd.Series(result['importances_mean'], name='permutation_importance')

    def feature_importance(self, scoring=None):
        """Print the importance of every feature reaching the classifier."""
        model = self.pipeline.steps[-1][1]
        transformer = self.pipeline.named_steps.get('column_transformer')
        features = transformer.get_feature_names_out() if transformer is not None else None

        X_model = y_model = None
        has_builtin = any(hasattr(model, attr) for attr in self._IMPORTANCE_ATTRS)
        if not has_builtin and self._exists(self.X_train) and self._exists(self.y_train):
            # Permute the columns the classifier actually sees, so the
            # importances line up with the transformed feature names.
            X_model = self.pipeline[:-1].transform(self.X_train)
            y_model = self.y_train

        importance = self._compute_feature_importance(X_model, y_model, scoring=scoring)

        if features is not None:
            if importance.empty:
                importance = pd.Series(np.nan, index=range(len(features)),
                                       name=importance.name)
            importance.index = features

        self._print('feature importance', importance)

    # model operations

    def cross_validate(self, X=None, y=None, **kwargs):
        """Cross-validate the pipeline (training split by default)."""
        self._update_kwargs(kwargs, ['n_jobs'])
        # `X or ...` cannot be used: the truth value of a DataFrame is ambiguous.
        X = self.X_train if X is None else X
        y = self.y_train if y is None else y
        cv = cross_validate(self.pipeline, X, y, **kwargs)
        display(self._title('cross validate'))
        pprint(cv, width=1, sort_dicts=False)
        return cv

    def fit_predict(self, estimator=None):
        """Fit on the training split and predict the test split.

        Stores `y_pred`, and `y_proba` (probability of class 1) when the
        estimator offers `predict_proba`; otherwise `y_proba` is None.
        """
        if estimator is None:
            estimator = self.pipeline

        self._verbose('fitting', estimator.fit, self.X_train, self.y_train)
        self.y_pred = self._verbose('predicting', estimator.predict, self.X_test)

        # Reset first so probabilities of an earlier estimator never linger.
        self.y_proba = None
        if hasattr(estimator, 'predict_proba'):
            self.y_proba = self._verbose('predicting probabilities',
                                         estimator.predict_proba, self.X_test)[:, 1]

    def tune_hyperparams(self, params, scoring, **kwargs):
        """Randomised search over `params`; keeps the best refit pipeline.

        Extra keyword arguments go to RandomizedSearchCV; `cv` defaults to a
        shuffled StratifiedKFold seeded with `self.random_state`.
        """
        self._update_kwargs(kwargs, ['random_state', 'n_jobs'])
        kwargs.setdefault('cv', StratifiedKFold(shuffle=True, random_state=self.random_state))
        search = RandomizedSearchCV(self.pipeline, params, scoring=scoring, **kwargs)

        self.fit_predict(search)

        display(self._title('Best parameters'))
        pprint(search.best_params_, width=1, sort_dicts=False)

        self.pipeline = search.best_estimator_

    def scores(self, scoring, y_test=None, y_pred=None, y_proba=None):
        """Print the confusion matrices and return ``{metric name: score}``.

        `scoring` is one scorer name or a list of them.  Metrics whose
        function takes a score / probability argument receive `y_proba`;
        all others receive `y_pred`.

        Raises
        ------
        ValueError
            If a probability-based metric is requested but no `y_proba` is
            available.
        """
        if y_test is None:
            y_test = self.y_test

        if y_pred is None:
            y_pred = self.y_pred

        if y_proba is None:
            y_proba = self.y_proba

        if isinstance(scoring, str):
            scoring = [scoring]

        # confusion matrix
        display(self._title('confusion matrix'))
        pprint(confusion_matrix(y_test, y_pred), width=1)
        print()
        pprint(confusion_matrix(y_test, y_pred, normalize='true').round(2), width=1)
        print()

        # scores
        display(self._title('scores'))
        scores = {}
        for name in scoring:
            scorer = get_scorer(name)
            score_func = scorer._score_func
            # inspect.signature follows decorator wrappers, unlike
            # __code__.co_varnames, which describes the wrapper itself.
            parameters = inspect.signature(score_func).parameters
            extra = dict(getattr(scorer, '_kwargs', None) or {})

            if {'y_score', 'y_prob', 'y_proba'} & set(parameters):
                if y_proba is None:
                    raise ValueError(f'{name!r} needs predicted probabilities, '
                                     'but none are available')
                scores[score_func.__name__] = score_func(y_test, y_proba, **extra)
            else:
                scores[score_func.__name__] = score_func(y_test, y_pred, **extra)

        pprint({k: round(v, 3) for k, v in scores.items()}, width=1, sort_dicts=False)
        print()
        return scores

    def execute(self, X=None, y=None,
                scoring='accuracy',
                train_cv=False,
                feature_importance=False,
                column_transformer=None,
                hypertune=None):
        """Run the whole workflow and return the test scores.

        Parameters
        ----------
        X, y : DataFrame / Series, optional
            New data; when both are given they replace the stored data.
        scoring : str or list of str
            Scorer name(s) used for cross validation and the final report.
        train_cv : bool
            Cross-validate on the training split before fitting.
        feature_importance : bool
            Print feature importances after fitting (also on when verbose).
        column_transformer : dict or ColumnTransformer, optional
            Used to build the pipeline when none exists yet.
        hypertune : tuple, optional
            ``(scoring, param_distributions)`` for a randomised search;
            when omitted the pipeline is fitted as is.
        """
        display(self._title(self.model.__class__.__name__))

        if self.pipeline is None:
            self.create_pipeline(column_transformer=column_transformer)

        if self._exists(X) and self._exists(y):
            self.set_data(X, y)

        if self._exists(X) or not self._exists(self.X_train):
            self.train_test_split(test_size=.25, shuffle=True, stratify=self.y)

        if train_cv:
            self.cross_validate(scoring=scoring)

        if hypertune:
            self.tune_hyperparams(params=hypertune[1], scoring=hypertune[0])
        else:
            self.fit_predict()

        if feature_importance or self.verbose:
            self.feature_importance()

        return self.scores(scoring=scoring)

In [20]:
# _test = Predictor(RandomForestClassifier(class_weight='balanced'), verbose=2)
# _test._create_column_transformer()
# _test.create_pipeline()
# _test.set_data(X, y)
# _test.train_test_split(test_size=.25, shuffle=True, stratify=y)
# _test.cross_validate(scoring='f1')
# _test.execute(X, y, scoring=['precision', 'f1', 'roc_auc'])

# (Predictor(RandomForestClassifier(class_weight='balanced'))
#  .execute(X, y, scoring=['precision', 'f1', 'roc_auc'], feature_importance=True));

In [21]:
%%time
# Seed the forest itself as well: Predictor only seeds the split and the
# search, so an unseeded model makes every run give different scores.
(Predictor(RandomForestClassifier(class_weight='balanced', random_state=random_state))
 .execute(X, y,
          scoring=['precision', 'f1', 'roc_auc'],
          column_transformer=column_transformers,
          feature_importance=True, 
          hypertune=('f1', {
              'clf__n_estimators': [100, 120, 130,],
              'clf__max_depth': [None, 5, 10, 15],
          }),
));

### RandomForestClassifier

### Best parameters

{'clf__n_estimators': 120,
 'clf__max_depth': 10}


### feature importance

standardscaler__Age                     0.0889
standardscaler__Handcap                 0.0050
standardscaler__neighbourhood           0.0596
standardscaler__previous_appointments   0.0604
standardscaler__days_diff               0.5988
standardscaler__scheduled_day           0.0288
standardscaler__scheduled_hour          0.0436
standardscaler__appointment_day         0.0247
passthrough__Gender                     0.0086
passthrough__SMS_received               0.0578
passthrough__Scholarship                0.0065
passthrough__Hipertension               0.0081
passthrough__Diabetes                   0.0040
passthrough__Alcoholism                 0.0051
Name: feature_importances_, dtype: float64



### confusion matrix

array([[12201,  9851],
       [ 1062,  4517]])

array([[0.55, 0.45],
       [0.19, 0.81]])



### scores

{'precision_score': 0.314,
 'f1_score': 0.453,
 'roc_auc_score': 0.737}

CPU times: user 9.02 s, sys: 768 ms, total: 9.79 s
Wall time: 1min 9s


{'precision_score': 0.3143791759465479,
 'f1_score': 0.4529001854915526,
 'roc_auc_score': 0.7368453475688661}

In [22]:
%%time
# Same run with full tracing (verbose=2) and cross validation.  The forest is
# seeded so the traced run is reproducible.
(Predictor(RandomForestClassifier(class_weight='balanced', random_state=random_state), verbose=2)
 .execute(X, y,
          scoring=['precision', 'f1', 'roc_auc'],
          column_transformer=column_transformers,
          feature_importance=True, 
          train_cv=True,
          hypertune=('f1', {
              'clf__n_estimators': [100, 120, 130,],
              'clf__max_depth': [None, 5, 10, 15],
          }),
));

### RandomForestClassifier

### pipeline

Pipeline(steps=[('before', DebugTransformer(name='before')),
                ('feature_engineering',
                 FeaturerEngineering(features=['Age', 'Handcap',
                                               'neighbourhood',
                                               'previous_appointments',
                                               'days_diff', 'scheduled_day',
                                               'scheduled_hour',
                                               'appointment_day', 'Gender',
                                               'SMS_received', 'Scholarship',
                                               'Hipertension', 'Diabetes',
                                               'Alcoholism'],
                                     name='FeaturerEngineering',
                                     verbose=['fit'...
                                                   'previous_appointments',
                                                   'days_diff', 'sche

### cross validate

{'fit_time': array([7.71996212, 7.76129723, 7.72310662, 7.80634594, 7.73441267]),
 'score_time': array([1.36715508, 1.35869074, 1.34940124, 1.32613206, 1.32695317]),
 'test_precision': array([0.47711089, 0.5075846 , 0.50434783, 0.49728556, 0.50110132]),
 'test_f1': array([0.21662818, 0.20694577, 0.21748301, 0.21462043, 0.21386604]),
 'test_roc_auc': array([0.73243405, 0.75172614, 0.73698105, 0.73529309, 0.74421439])}


### fitting

before                         fit 		X shape: (82891, 13) 	y shape: (82891,)
before                         transform 	X shape: (82891, 13) 	y shape: None
FeaturerEngineering            fit
FeaturerEngineering            transform
after FeaturerEngineering      fit 		X shape: (82891, 19) 	y shape: (82891,)
after FeaturerEngineering      transform 	X shape: (82891, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,neighbourhood,previous_appointments,days_diff,scheduled_day,scheduled_hour,appointment_day
77402,76512955655478.0000,5662085,0,2016-05-05T08:05:34Z,2016-05-12T00:00:00Z,46,SANTO ANTÔNIO,0,0,0,0,0,0,0.1991,1,7,3,8,3
18814,87594932516651.0000,5690619,0,2016-05-12T10:22:54Z,2016-05-19T00:00:00Z,12,FORTE SÃO JOÃO,0,0,0,0,0,0,0.2276,5,7,3,10,3
11412,3527476296983.0000,5679822,0,2016-05-10T10:33:03Z,2016-05-10T00:00:00Z,31,MONTE BELO,0,0,0,0,0,0,0.2446,2,0,1,10,1
85094,28732115641876.0000,5750021,0,2016-05-31T08:26:45Z,2016-06-01T00:00:00Z,43,NAZARETH,0,0,0,0,0,0,0.2500,1,1,1,8,2
51885,957553927947.0000,5686092,0,2016-05-11T12:01:22Z,2016-05-11T00:00:00Z,44,BARRO VERMELHO,0,0,0,0,0,0,0.2720,1,0,2,12,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29409,72229735539839.0000,5613104,0,2016-04-25T09:02:10Z,2016-05-10T00:00:00Z,43,MARUÍPE,0,0,0,0,0,1,0.3034,3,15,0,9,1
73603,96244428927879.0000,5719482,1,2016-05-19T11:26:28Z,2016-05-19T00:00:00Z,16,MARIA ORTIZ,0,0,0,0,0,0,0.2681,1,0,3,11,3
90007,9982166128811.0000,5719209,0,2016-05-19T10:45:53Z,2016-06-03T00:00:00Z,66,SÃO PEDRO,0,0,0,1,0,1,0.2713,1,15,3,10,4
96202,9845148296311.0000,5677190,0,2016-05-09T18:43:20Z,2016-06-07T00:00:00Z,45,JARDIM DA PENHA,0,1,0,0,0,1,0.1848,1,29,0,18,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,82891.0,147565308114695.88,256190264198510.44,93779.5293,4155037299997.0,31747567447961.0,94393811856983.02,999981631772427.0
AppointmentID,82891.0,5675580.54,71294.5785,5030230.0,5640548.5,5680797.0,5725857.5,5790484.0
Gender,82891.0,0.3516,0.4775,0.0,0.0,0.0,1.0,1.0
Age,82891.0,37.094,23.1049,-1.0,18.0,37.0,55.0,115.0
Scholarship,82891.0,0.098,0.2973,0.0,0.0,0.0,0.0,1.0
Hipertension,82891.0,0.1976,0.3982,0.0,0.0,0.0,0.0,1.0
Diabetes,82891.0,0.0717,0.2579,0.0,0.0,0.0,0.0,1.0
Alcoholism,82891.0,0.0309,0.1731,0.0,0.0,0.0,0.0,1.0
Handcap,82891.0,0.0222,0.1616,0.0,0.0,0.0,0.0,4.0
SMS_received,82891.0,0.3206,0.4667,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        fit 		X shape: (82891, 14) 	y shape: (82891,)
after ColumnTransformer        transform 	X shape: (82891, 14) 	y shape: None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.3855,-0.1375,-1.2240,-0.3789,-0.2072,0.8338,-0.8608,0.8337,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,-1.0861,-0.1375,-0.5948,0.4151,-0.2072,0.8338,-0.2395,0.8337,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,-0.2638,-0.1375,-0.2191,-0.1804,-0.6660,-0.6187,-0.2395,-0.6267,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.2556,-0.1375,-0.1004,-0.3789,-0.6004,-0.6187,-0.8608,0.1035,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.2989,-0.1375,0.3843,-0.3789,-0.6660,0.1075,0.3818,0.1035,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82886,0.2556,-0.1375,1.0783,0.0181,0.3171,-1.3450,-0.5501,-0.6267,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000
82887,-0.9130,-0.1375,0.2986,-0.3789,-0.6660,0.8338,0.0712,0.8337,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000
82888,1.2511,-0.1375,0.3691,-0.3789,0.3171,0.8338,-0.2395,1.5640,0.0000,1.0000,0.0000,0.0000,0.0000,1.0000
82889,0.3422,-0.1375,-1.5393,-0.3789,1.2346,-1.3450,2.2458,-0.6267,0.0000,1.0000,0.0000,1.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,82891.0,-0.0,1.0,-1.6488,-0.8264,-0.0041,0.775,3.3719
1,82891.0,0.0,1.0,-0.1375,-0.1375,-0.1375,-0.1375,24.6182
2,82891.0,-0.0,1.0,-2.4644,-0.7185,-0.11,0.3691,3.2414
3,82891.0,0.0,1.0,-0.3789,-0.3789,-0.1804,0.0181,13.3165
4,82891.0,0.0,1.0,-0.666,-0.666,-0.4038,0.3171,11.065
5,82891.0,-0.0,1.0,-1.345,-0.6187,0.1075,0.8338,2.2864
6,82891.0,0.0,1.0,-1.4821,-0.8608,-0.2395,0.6925,3.1778
7,82891.0,0.0,1.0,-1.357,-0.6267,0.1035,0.8337,2.2942
8,82891.0,0.3516,0.4775,0.0,0.0,0.0,1.0,1.0
9,82891.0,0.3206,0.4667,0.0,0.0,0.0,1.0,1.0






### predicting

before                         transform 	X shape: (27631, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (27631, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,neighbourhood,previous_appointments,days_diff,scheduled_day,scheduled_hour,appointment_day
47409,7538475193565.0000,5667225,0,2016-05-06T07:31:55Z,2016-05-20T00:00:00Z,60,SANTOS DUMONT,0,1,1,0,0,0,0.4015,0.0000,14,4,7,4
87002,3663395134.0000,5537911,0,2016-04-01T10:49:14Z,2016-06-06T00:00:00Z,23,JARDIM CAMBURI,0,0,0,0,0,1,0.2382,1.0000,66,4,10,0
83733,26767454519622.0000,5617514,0,2016-04-25T16:24:42Z,2016-05-19T00:00:00Z,62,JARDIM CAMBURI,0,0,0,0,0,0,0.2382,5.0000,24,0,16,3
15279,4763898746127.0000,5560394,0,2016-04-08T08:54:31Z,2016-05-09T00:00:00Z,49,JESUS DE NAZARETH,0,0,0,0,0,0,0.3214,2.0000,31,4,8,0
10678,12351448425546.0000,5732102,0,2016-05-24T10:59:14Z,2016-05-31T00:00:00Z,46,FORTE SÃO JOÃO,0,0,0,0,0,1,0.2276,2.0000,7,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53007,21332472571367.0000,5718669,1,2016-05-19T09:53:28Z,2016-05-19T00:00:00Z,6,SANTO ANDRÉ,0,0,0,0,0,0,0.2502,2.0000,0,3,9,3
95466,53634133724686.0000,5707048,0,2016-05-17T09:07:40Z,2016-06-07T00:00:00Z,29,SANTO ANDRÉ,1,0,0,0,0,1,0.2502,2.0000,21,1,9,1
24852,6149611287128.0000,5695892,1,2016-05-13T11:12:09Z,2016-05-20T00:00:00Z,11,SÃO JOSÉ,0,0,0,0,0,0,0.2692,0.0000,7,4,11,4
51828,8264217284619.0000,5731608,1,2016-05-24T10:16:51Z,2016-05-24T00:00:00Z,31,PARQUE MOSCOSO,0,0,0,0,0,0,0.2424,0.0000,0,1,10,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,27631.0,147278122338998.72,255779897079567.6,39217.8444,4244389252741.5,31665863684154.0,94372228965726.0,999934989273974.1
AppointmentID,27631.0,5674477.9404,71300.0402,5134249.0,5639633.0,5679770.0,5724478.5,5790377.0
Gender,27631.0,0.3453,0.4755,0.0,0.0,0.0,1.0,1.0
Age,27631.0,37.0741,23.1261,0.0,18.0,37.0,55.0,115.0
Scholarship,27631.0,0.0992,0.2989,0.0,0.0,0.0,0.0,1.0
Hipertension,27631.0,0.1962,0.3972,0.0,0.0,0.0,0.0,1.0
Diabetes,27631.0,0.0725,0.2593,0.0,0.0,0.0,0.0,1.0
Alcoholism,27631.0,0.0288,0.1674,0.0,0.0,0.0,0.0,1.0
Handcap,27631.0,0.0223,0.1612,0.0,0.0,0.0,0.0,4.0
SMS_received,27631.0,0.3225,0.4674,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (27631, 14) 	y shape: None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.9914,-0.1375,3.2414,-0.5773,0.2515,1.5601,-1.1715,1.5640,0.0000,0.0000,0.0000,1.0000,1.0000,0.0000
1,-0.6100,-0.1375,-0.3615,-0.3789,3.6594,1.5601,-0.2395,-1.3570,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000
2,1.0780,-0.1375,-0.3615,0.4151,0.9069,-1.3450,1.6245,0.8337,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.5153,-0.1375,1.4756,-0.1804,1.3656,1.5601,-0.8608,-1.3570,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.3855,-0.1375,-0.5948,-0.1804,-0.2072,-0.6187,-0.2395,-0.6267,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27626,-1.3458,-0.1375,-0.0968,-0.1804,-0.6660,0.8338,-0.5501,0.8337,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27627,-0.3503,-0.1375,-0.0968,-0.1804,0.7103,-0.6187,-0.5501,-0.6267,0.0000,1.0000,1.0000,0.0000,0.0000,0.0000
27628,-1.1294,-0.1375,0.3239,-0.5773,-0.2072,1.5601,0.0712,1.5640,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27629,-0.2638,-0.1375,-0.2676,-0.5773,-0.6660,-0.6187,-0.2395,-0.6267,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,27631.0,-0.0009,1.0009,-1.6055,-0.8264,-0.0041,0.775,3.3719
1,27631.0,0.0005,0.9979,-0.1375,-0.1375,-0.1375,-0.1375,24.6182
2,27631.0,-0.0102,0.9949,-2.4644,-0.7185,-0.11,0.3691,3.2414
3,27631.0,-0.2006,0.9796,-0.5773,-0.5773,-0.3789,-0.1804,13.3165
4,27631.0,0.0058,0.999,-0.666,-0.666,-0.4038,0.3171,11.065
5,27631.0,0.0001,1.0048,-1.345,-0.6187,0.1075,0.8338,2.2864
6,27631.0,0.0045,0.9966,-1.4821,-0.8608,-0.2395,1.0032,2.8671
7,27631.0,-0.0,1.0066,-1.357,-0.6267,0.1035,0.8337,2.2942
8,27631.0,0.3453,0.4755,0.0,0.0,0.0,1.0,1.0
9,27631.0,0.3225,0.4674,0.0,0.0,0.0,1.0,1.0






### predicting probabilities

before                         transform 	X shape: (27631, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (27631, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,neighbourhood,previous_appointments,days_diff,scheduled_day,scheduled_hour,appointment_day
47409,7538475193565.0000,5667225,0,2016-05-06T07:31:55Z,2016-05-20T00:00:00Z,60,SANTOS DUMONT,0,1,1,0,0,0,0.4015,0.0000,14,4,7,4
87002,3663395134.0000,5537911,0,2016-04-01T10:49:14Z,2016-06-06T00:00:00Z,23,JARDIM CAMBURI,0,0,0,0,0,1,0.2382,1.0000,66,4,10,0
83733,26767454519622.0000,5617514,0,2016-04-25T16:24:42Z,2016-05-19T00:00:00Z,62,JARDIM CAMBURI,0,0,0,0,0,0,0.2382,5.0000,24,0,16,3
15279,4763898746127.0000,5560394,0,2016-04-08T08:54:31Z,2016-05-09T00:00:00Z,49,JESUS DE NAZARETH,0,0,0,0,0,0,0.3214,2.0000,31,4,8,0
10678,12351448425546.0000,5732102,0,2016-05-24T10:59:14Z,2016-05-31T00:00:00Z,46,FORTE SÃO JOÃO,0,0,0,0,0,1,0.2276,2.0000,7,1,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53007,21332472571367.0000,5718669,1,2016-05-19T09:53:28Z,2016-05-19T00:00:00Z,6,SANTO ANDRÉ,0,0,0,0,0,0,0.2502,2.0000,0,3,9,3
95466,53634133724686.0000,5707048,0,2016-05-17T09:07:40Z,2016-06-07T00:00:00Z,29,SANTO ANDRÉ,1,0,0,0,0,1,0.2502,2.0000,21,1,9,1
24852,6149611287128.0000,5695892,1,2016-05-13T11:12:09Z,2016-05-20T00:00:00Z,11,SÃO JOSÉ,0,0,0,0,0,0,0.2692,0.0000,7,4,11,4
51828,8264217284619.0000,5731608,1,2016-05-24T10:16:51Z,2016-05-24T00:00:00Z,31,PARQUE MOSCOSO,0,0,0,0,0,0,0.2424,0.0000,0,1,10,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,27631.0,147278122338998.72,255779897079567.6,39217.8444,4244389252741.5,31665863684154.0,94372228965726.0,999934989273974.1
AppointmentID,27631.0,5674477.9404,71300.0402,5134249.0,5639633.0,5679770.0,5724478.5,5790377.0
Gender,27631.0,0.3453,0.4755,0.0,0.0,0.0,1.0,1.0
Age,27631.0,37.0741,23.1261,0.0,18.0,37.0,55.0,115.0
Scholarship,27631.0,0.0992,0.2989,0.0,0.0,0.0,0.0,1.0
Hipertension,27631.0,0.1962,0.3972,0.0,0.0,0.0,0.0,1.0
Diabetes,27631.0,0.0725,0.2593,0.0,0.0,0.0,0.0,1.0
Alcoholism,27631.0,0.0288,0.1674,0.0,0.0,0.0,0.0,1.0
Handcap,27631.0,0.0223,0.1612,0.0,0.0,0.0,0.0,4.0
SMS_received,27631.0,0.3225,0.4674,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (27631, 14) 	y shape: None


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.9914,-0.1375,3.2414,-0.5773,0.2515,1.5601,-1.1715,1.5640,0.0000,0.0000,0.0000,1.0000,1.0000,0.0000
1,-0.6100,-0.1375,-0.3615,-0.3789,3.6594,1.5601,-0.2395,-1.3570,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000
2,1.0780,-0.1375,-0.3615,0.4151,0.9069,-1.3450,1.6245,0.8337,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.5153,-0.1375,1.4756,-0.1804,1.3656,1.5601,-0.8608,-1.3570,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.3855,-0.1375,-0.5948,-0.1804,-0.2072,-0.6187,-0.2395,-0.6267,0.0000,1.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27626,-1.3458,-0.1375,-0.0968,-0.1804,-0.6660,0.8338,-0.5501,0.8337,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27627,-0.3503,-0.1375,-0.0968,-0.1804,0.7103,-0.6187,-0.5501,-0.6267,0.0000,1.0000,1.0000,0.0000,0.0000,0.0000
27628,-1.1294,-0.1375,0.3239,-0.5773,-0.2072,1.5601,0.0712,1.5640,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000
27629,-0.2638,-0.1375,-0.2676,-0.5773,-0.6660,-0.6187,-0.2395,-0.6267,1.0000,0.0000,0.0000,0.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,27631.0,-0.0009,1.0009,-1.6055,-0.8264,-0.0041,0.775,3.3719
1,27631.0,0.0005,0.9979,-0.1375,-0.1375,-0.1375,-0.1375,24.6182
2,27631.0,-0.0102,0.9949,-2.4644,-0.7185,-0.11,0.3691,3.2414
3,27631.0,-0.2006,0.9796,-0.5773,-0.5773,-0.3789,-0.1804,13.3165
4,27631.0,0.0058,0.999,-0.666,-0.666,-0.4038,0.3171,11.065
5,27631.0,0.0001,1.0048,-1.345,-0.6187,0.1075,0.8338,2.2864
6,27631.0,0.0045,0.9966,-1.4821,-0.8608,-0.2395,1.0032,2.8671
7,27631.0,-0.0,1.0066,-1.357,-0.6267,0.1035,0.8337,2.2942
8,27631.0,0.3453,0.4755,0.0,0.0,0.0,1.0,1.0
9,27631.0,0.3225,0.4674,0.0,0.0,0.0,1.0,1.0






### Best parameters

{'clf__n_estimators': 100,
 'clf__max_depth': 10}


### feature importance

standardscaler__Age                     0.0922
standardscaler__Handcap                 0.0048
standardscaler__neighbourhood           0.0589
standardscaler__previous_appointments   0.0626
standardscaler__days_diff               0.6081
standardscaler__scheduled_day           0.0221
standardscaler__scheduled_hour          0.0410
standardscaler__appointment_day         0.0246
passthrough__Gender                     0.0091
passthrough__SMS_received               0.0531
passthrough__Scholarship                0.0067
passthrough__Hipertension               0.0079
passthrough__Diabetes                   0.0041
passthrough__Alcoholism                 0.0048
Name: feature_importances_, dtype: float64



### confusion matrix

array([[12388,  9664],
       [ 1092,  4487]])

array([[0.56, 0.44],
       [0.2 , 0.8 ]])



### scores

{'precision_score': 0.317,
 'f1_score': 0.455,
 'roc_auc_score': 0.738}

CPU times: user 10.6 s, sys: 679 ms, total: 11.3 s
Wall time: 1min 23s


{'precision_score': 0.3170800650130733,
 'f1_score': 0.4548403446528129,
 'roc_auc_score': 0.7381670414699055}