In [1]:
from pprint import pprint
from IPython.display import display, Markdown

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import get_scorer, confusion_matrix
from sklearn.metrics import f1_score, precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
pd.options.display.float_format = '{:.4f}'.format

In [3]:
random_state = 100

In [4]:
def title(title, n=3):
    return Markdown('{} {}'.format('#'*(max(n%4, 1)), title))

In [5]:
# !ls -lah

In [6]:
# filename = 'KaggleV2-May-2016.csv'
filename = 'https://github.com/dm6801/noshow_dataset/raw/master/KaggleV2-May-2016.csv'

In [7]:
# !wc -l "$filename"
# !head -n2 "$filename"

#### Variables' description:

    PatientId - identification of a patient;
    AppointmentID - identification of each appointment;
    Gender - male or female;
    ScheduledDay - day when a patient registered for an appointment;
    AppointmentDay - day of actuall appointment;
    Age - patient age;
    Neighbourhood - where the appointment takes place (hospital location);
    Scholarship - 1 for True, 0 for False. For more details read the article (the link is here: https://bit.ly/3AYv4GF);
    Hipertension - 1 for True, 0 for False;
    Diabetes - 1 for True, 0 for False;
    Alcoholism - 1 for True, 0 for False;
    Handcap - number of disabilities a patient has;
    SMS_received - 1 if one or more messages were sent to the patient;
    No-show - 'No' if the patient showed up to their appointment, 'Yes' if they did not show up.

In [8]:
df = pd.read_csv(filename)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [10]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [11]:
df['No-show'] = df['No-show'].map(dict(No=0, Yes=1))

In [12]:
df['Gender'] = df['Gender'].map(dict(F=0, M=1))

In [13]:
(lambda x:x[x.dt.days<0].dt.days
)(pd.to_datetime(df['AppointmentDay']).dt.date - pd.to_datetime(df['ScheduledDay']).dt.date)

27033   -1
55226   -1
64175   -1
71533   -6
72362   -1
dtype: int64

In [14]:
df.drop((lambda x:x[x.dt.days<0].dt.days
)(pd.to_datetime(df['AppointmentDay']).dt.date - pd.to_datetime(df['ScheduledDay']).dt.date).index, inplace=True)

In [15]:
X = df.drop('No-show', axis=1)
y = df['No-show']

In [16]:
def cyclicalEncode(x, end, start=0, names={}):
    
    sin = np.sin(2 * np.pi * (x+float(start))/end)
    if 'sin' in names:
        sin.rename(names['sin'], inplace=True)
        
    cos = np.cos(2 * np.pi * (x+float(start))/end)
    if 'cos' in names:
        cos.rename(names['cos'], inplace=True)
        
    return sin, cos

In [17]:
class DebugTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, name='', verbose=[], **kwargs):
        self.name = name if name else self.__class__.__name__
        self.tag = f'{self.name:<30}'
        self.verbose = verbose
        self.col_tf = kwargs.get('col_tf')
        
        
    def fit(self, X, y=None):
        print(self.tag, 'fit',
              '\t\tX shape:', X.shape, 
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)
        
        # if 'fit' in self.verbose:
        #     display(X if isinstance(X, pd.DataFrame) else pd.DataFrame(X),
        #             y if isinstance(y, pd.Series) else pd.Series(y) if y != None else None)
        #     print()
        
        return self
    

    def transform(self, X, y=None):
        print(self.tag, 'transform', 
              '\tX shape:', X.shape, 
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)
        
        if 'transform' in self.verbose:
            columns = self.col_tf.get_feature_names_out() if self.col_tf != None else None
            
            if not isinstance(X, pd.DataFrame):
                _X = pd.DataFrame(X, columns=columns)
            elif columns != None:
                _X = X.copy()
                _X.columns = columns
            else:
                _X = X
                
            display(_X,
                    y if isinstance(y, pd.Series) else pd.Series(y) if y != None else None)
            
            display(_X.describe().T)
            if isinstance(y, pd.Series):
                display(y.describe())
            elif y != None:
                display(pd.Series(y).describe())
                
            print()
        
        return X

In [18]:
class FeaturerEngineering(BaseEstimator, TransformerMixin):
    
    def __init__(self, name='', verbose=[]):
        self.name = name if name else self.__class__.__name__
        self.tag = f'{self.name:<30}'
        self.verbose = verbose
        
        
    def fit(self, X, y=None):
        # print(f'{self.name:<30}', 'fit',
        #       '\t\tX shape:', X.shape, 
        #       '\ty shape:', y.shape if isinstance(y, pd.Series) else '')
        if 'fit' in self.verbose:
            print(self.tag, 'fit')
        
        self.neighbourhood_ratio = (X.join(y)
                                    .pivot_table(
                                        index='Neighbourhood',
                                        columns=y.name,
                                        values='Age',
                                        aggfunc='count')
                                    .apply(lambda x:x[1]/x[0], axis=1))
        
        self.previous_no_show_ratio = (X.join(y)
                                       .pivot_table(
                                           index='PatientId',
                                           columns=y.name,
                                           values='Age',
                                           aggfunc='count')
                                       .apply(lambda x:x[1]/x[0], axis=1))
        
        return self
    

    def transform(self, X, y=None):
        # print(f'{self.name:<30}', 'transform', 
        #       '\tX shape:', X.shape, 
        #       '\ty shape:', y.shape if isinstance(y, pd.Series) else '')
        if 'transform' in self.verbose:
            print(self.tag, 'transform')
        
        neighbourhood = (X['Neighbourhood']
                         .map(self.neighbourhood_ratio)
                         .fillna(self.neighbourhood_ratio.median())
                         .rename('neighbourhood'))
        
        previous_no_show = (X['PatientId']
                            .map(self.previous_no_show_ratio)
                            .fillna(self.previous_no_show_ratio.median())
                            .rename('previous_no_show'))
        
        scheduled_datetime = pd.to_datetime(X['ScheduledDay'])
        appointmt_datetime = pd.to_datetime(X['AppointmentDay'])
        
        days_diff = (appointmt_datetime.dt.date
                     - scheduled_datetime.dt.date).dt.days.rename('days_diff')
        
        scheduled_day_of_week = scheduled_datetime.dt.dayofweek.rename('scheduled_day')
        scheduled_hour = scheduled_datetime.dt.hour.rename('scheduled_hour')
        
        appointmt_day_of_week = appointmt_datetime.dt.dayofweek.rename('appointment_day')
        
#         scheduled_month_sin, \
#         scheduled_month_cos = cyclicalEncode(scheduled_datetime.dt.month, 12, 
#                                              names={'sin': 'scheduled_month_sin',
#                                                     'cos': 'scheduled_month_cos'})
        
#         scheduled_day_sin, \
#         scheduled_day_cos = cyclicalEncode(scheduled_datetime.dt.dayofweek, 7, 
#                                            names={'sin': 'scheduled_day_sin',
#                                                   'cos': 'scheduled_day_cos'})
        
#         scheduled_hour_sin, \
#         scheduled_hour_cos = cyclicalEncode(scheduled_datetime.dt.hour, 24, 
#                                             names={'sin': 'scheduled_hour_sin',
#                                                    'cos': 'scheduled_hour_cos'})
        
#         appointmt_month_sin, \
#         appointmt_month_cos = cyclicalEncode(appointmt_datetime.dt.month, 12, 
#                                              names={'sin': 'appointmt_month_sin',
#                                                     'cos': 'appointmt_month_cos'})
        
#         appointmt_day_sin, \
#         appointmt_day_cos = cyclicalEncode(appointmt_datetime.dt.dayofweek, 7, 
#                                            names={'sin': 'appointmt_day_sin',
#                                                   'cos': 'appointmt_day_cos'})
        
        return X.join([
            days_diff,
            neighbourhood,
            previous_no_show,
            scheduled_day_of_week,
            scheduled_hour,
            appointmt_day_of_week,
#             scheduled_month_sin,
#             scheduled_month_cos,
#             scheduled_day_sin,
#             scheduled_day_cos,
#             scheduled_hour_sin,
#             scheduled_hour_cos,
#             appointmt_month_sin,
#             appointmt_month_cos,
#             appointmt_day_sin, 
#             appointmt_day_cos,
        ])

In [19]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show'],
      dtype='object')

In [20]:
def my_pipeline(estimator, remainder='drop', verbose=0):
    
    column_transformer = make_column_transformer(
            (StandardScaler(), [
                'Age',
                'Handcap',
                'previous_no_show',
                'neighbourhood',
                'days_diff',
                'scheduled_day',
                'scheduled_hour',
                'appointment_day',
            ]),
#             (MinMaxScaler(), [
#                 'scheduled_month_sin',
#                 'scheduled_month_cos',
#                 'scheduled_day_sin',
#                 'scheduled_day_cos',
#                 'scheduled_hour_sin',
#                 'scheduled_hour_cos',
#                 'appointmt_month_sin',
#                 'appointmt_month_cos',
#                 'appointmt_day_sin', 
#                 'appointmt_day_cos',
#             ]),
#             (OneHotEncoder(drop='first'), [
#                 'Gender',
#             ]),
            ('passthrough', [
                'SMS_received',
                'Scholarship',
                'Hipertension',
                'Diabetes',
                'Alcoholism',
            ]),
            remainder=remainder,
            n_jobs=-1)
    
    if verbose:
        steps = [
            DebugTransformer('before'),
            FeaturerEngineering(verbose=['fit', 'transform'] if verbose>0 else []),
            DebugTransformer('after FeaturerEngineering', ['transform'] if verbose==2 else []),
            column_transformer,
            DebugTransformer('after ColumnTransformer', ['transform'] if verbose==2 else [],
                             col_tf=column_transformer),
        ]
    else:
        steps = [
            FeaturerEngineering(),
            column_transformer,
        ]
    
    return make_pipeline(*steps, estimator)

In [21]:
def scores(y_test, y_pred, y_proba, scoring):
    
    # confusion matrix
    display(title('confusion matrix'))
    pprint(confusion_matrix(y_test, y_pred), width=1)
    print()
    pprint(confusion_matrix(y_test, y_pred, normalize='true').round(2), width=1)
    print()
    
    
    # scores
    display(title('scores'))
    if isinstance(scoring, str):
        score_func = get_scorer(scoring)._score_func
        print(score_func.__name__ , round(score_func(y_test, y_pred), 3))
    else:
        scores = {f.__name__: 
                  f(y_test, y_proba) if 'y_score' in f.__code__.co_varnames else f(y_test, y_pred)
                  for f in map(lambda x:get_scorer(x)._score_func, scoring)}
        pprint({k: round(v, 3) for k, v in scores.items()}, width=1)

In [22]:
def execute(estimator, X, y, scoring=None, remainder='drop', verbose=0, *args, **kwargs):
    
    
    # verbose
    def _print(_title, func, *args, **kwargs):
        if verbose: display(title(_title))
        out = func(*args, **kwargs)
        if verbose: print()
        return out
    
    
    # create pipeline
    if isinstance(estimator, Pipeline):
        pipeline = estimator
    else:
        pipeline = my_pipeline(estimator, remainder, verbose)
    display(title(pipeline.steps[-1][1].__class__.__name__, 2))
    pprint([s[1] for s in pipeline.steps], width=1)
    
    
    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,
                                                        test_size=.2, shuffle=True, stratify=y)
    
    
    # cross validate
    cross = cross_validate(pipeline, X_train, y_train, scoring=scoring, n_jobs=-1,
                           cv=StratifiedKFold(shuffle=True, random_state=random_state))
    display(title('cross_validate'))
    pprint({k: v.round(3) for k, v in cross.items()}, width=1)
    print()

    
    # fit, predict
    _print('fitting', pipeline.fit, X_train, y_train)
    y_pred = _print('predicting', pipeline.predict, X_test)
    y_proba = _print('predicting probabilities', pipeline.predict_proba, X_test)[:, 1]
    
    
    # scores
    scores(y_test, y_pred, y_proba, scoring)

In [23]:
# explicit pipeline

execute(make_pipeline(StandardScaler(),
                       RandomForestClassifier(random_state=random_state, class_weight='balanced',
                                              n_estimators=100, max_depth=13)),
         X.drop(['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', 'Neighbourhood'], axis=1),
         y,
         scoring=['precision', 'f1', 'roc_auc'],
)

## RandomForestClassifier

[StandardScaler(),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': array([3.112, 3.096, 3.086, 3.068, 3.055]),
 'score_time': array([0.347, 0.344, 0.347, 0.347, 0.348]),
 'test_f1': array([0.346, 0.325, 0.338, 0.349, 0.334]),
 'test_precision': array([0.281, 0.262, 0.274, 0.279, 0.272]),
 'test_roc_auc': array([0.609, 0.589, 0.596, 0.605, 0.592])}



### confusion matrix

array([[12456,  5186],
       [ 2434,  2029]])

array([[0.71, 0.29],
       [0.55, 0.45]])



### scores

{'f1_score': 0.347,
 'precision_score': 0.281,
 'roc_auc_score': 0.604}


In [24]:
# custom implicit pipeline

execute(RandomForestClassifier(random_state=random_state, class_weight='balanced',
                                n_estimators=100, max_depth=13),
         X,
         y,
         scoring=['precision', 'f1', 'roc_auc'],
         verbose=1,
        )

## RandomForestClassifier

[DebugTransformer(name='before'),
 FeaturerEngineering(name='FeaturerEngineering', verbose=['fit', 'transform']),
 DebugTransformer(name='after FeaturerEngineering'),
 ColumnTransformer(n_jobs=-1,
                  transformers=[('standardscaler', StandardScaler(),
                                 ['Age', 'Handcap', 'previous_no_show',
                                  'neighbourhood', 'days_diff', 'scheduled_day',
                                  'scheduled_hour', 'appointment_day']),
                                ('passthrough', 'passthrough',
                                 ['SMS_received', 'Scholarship', 'Hipertension',
                                  'Diabetes', 'Alcoholism'])]),
 DebugTransformer(name='after ColumnTransformer'),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': array([5.883, 5.894, 5.806, 5.76 , 5.832]),
 'score_time': array([0.539, 0.533, 0.534, 0.704, 0.524]),
 'test_f1': array([0.453, 0.451, 0.455, 0.442, 0.45 ]),
 'test_precision': array([0.335, 0.335, 0.337, 0.329, 0.334]),
 'test_roc_auc': array([0.744, 0.745, 0.745, 0.737, 0.74 ])}



### fitting

before                         fit 		X shape: (88417, 13) 	y shape: (88417,)
before                         transform 	X shape: (88417, 13) 	y shape: None
FeaturerEngineering            fit
FeaturerEngineering            transform
after FeaturerEngineering      fit 		X shape: (88417, 19) 	y shape: (88417,)
after FeaturerEngineering      transform 	X shape: (88417, 19) 	y shape: None
after ColumnTransformer        fit 		X shape: (88417, 13) 	y shape: (88417,)
after ColumnTransformer        transform 	X shape: (88417, 13) 	y shape: None



### predicting

before                         transform 	X shape: (22105, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None
after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None



### predicting probabilities

before                         transform 	X shape: (22105, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None
after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None



### confusion matrix

array([[11462,  6180],
       [ 1360,  3103]])

array([[0.65, 0.35],
       [0.3 , 0.7 ]])



### scores

{'f1_score': 0.451,
 'precision_score': 0.334,
 'roc_auc_score': 0.743}


In [25]:
# custom implicit pipeline, verbose=2

execute(RandomForestClassifier(random_state=random_state, class_weight='balanced',
                                n_estimators=100, max_depth=13),
         X,
         y,
         scoring=['precision', 'f1', 'roc_auc'],
         # remainder='passthrough',
         verbose=2,
        )

## RandomForestClassifier

[DebugTransformer(name='before'),
 FeaturerEngineering(name='FeaturerEngineering', verbose=['fit', 'transform']),
 DebugTransformer(name='after FeaturerEngineering', verbose=['transform']),
 ColumnTransformer(n_jobs=-1,
                  transformers=[('standardscaler', StandardScaler(),
                                 ['Age', 'Handcap', 'previous_no_show',
                                  'neighbourhood', 'days_diff', 'scheduled_day',
                                  'scheduled_hour', 'appointment_day']),
                                ('passthrough', 'passthrough',
                                 ['SMS_received', 'Scholarship', 'Hipertension',
                                  'Diabetes', 'Alcoholism'])]),
 DebugTransformer(name='after ColumnTransformer', verbose=['transform']),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': array([6.066, 6.121, 6.136, 6.141, 6.178]),
 'score_time': array([0.876, 0.845, 0.739, 0.741, 0.738]),
 'test_f1': array([0.453, 0.451, 0.455, 0.442, 0.45 ]),
 'test_precision': array([0.335, 0.335, 0.337, 0.329, 0.334]),
 'test_roc_auc': array([0.744, 0.745, 0.745, 0.737, 0.74 ])}



### fitting

before                         fit 		X shape: (88417, 13) 	y shape: (88417,)
before                         transform 	X shape: (88417, 13) 	y shape: None
FeaturerEngineering            fit
FeaturerEngineering            transform
after FeaturerEngineering      fit 		X shape: (88417, 19) 	y shape: (88417,)
after FeaturerEngineering      transform 	X shape: (88417, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
71399,52943776752179.0000,5646394,0,2016-05-02T11:18:43Z,2016-05-16T00:00:00Z,82,CARATOÍRA,0,1,1,0,0,0,14,0.2854,1.0000,0,11,0
29617,929877688574.0000,5736715,0,2016-05-25T08:43:49Z,2016-05-25T00:00:00Z,3,REDENÇÃO,0,0,0,0,0,0,0,0.2218,1.0000,2,8,2
102468,93794655229715.0000,5782066,1,2016-06-07T10:31:32Z,2016-06-07T00:00:00Z,75,FORTE SÃO JOÃO,0,0,0,0,0,0,0,0.2302,1.0000,1,10,1
84631,119338344775698.0000,5663200,0,2016-05-05T09:39:35Z,2016-06-01T00:00:00Z,50,SOLON BORGES,0,0,0,0,0,1,27,0.1789,1.0000,3,9,2
4690,8424931735794.0000,5636669,0,2016-04-28T15:49:33Z,2016-05-02T00:00:00Z,55,BOA VISTA,0,0,0,0,0,1,4,0.2063,1.0000,3,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96719,34597828157767.0000,5762309,1,2016-06-02T07:33:44Z,2016-06-06T00:00:00Z,55,JOANA D´ARC,0,1,0,0,0,1,4,0.2179,1.0000,3,7,0
18710,7943512389497.0000,5674964,0,2016-05-09T12:35:32Z,2016-05-17T00:00:00Z,34,FORTE SÃO JOÃO,0,0,0,0,0,0,8,0.2302,0.2000,0,12,1
9953,846927776733873.0000,5661140,0,2016-05-05T07:14:31Z,2016-05-11T00:00:00Z,62,FORTE SÃO JOÃO,0,0,0,0,0,0,6,0.2302,1.0000,3,7,2
50003,27865249452328.0000,5691747,1,2016-05-12T13:36:31Z,2016-05-16T00:00:00Z,39,MATA DA PRAIA,0,0,0,0,0,0,4,0.1958,1.0000,3,13,0


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,88417.0,148038588366760.6,256470763049342.8,39217.8444,4186886999161.0,31762294475468.0,94555657279956.98,999968578354865.9
AppointmentID,88417.0,5675372.9417,71398.6179,5030230.0,5640383.0,5680666.0,5725559.0,5790484.0
Gender,88417.0,0.3504,0.4771,0.0,0.0,0.0,1.0,1.0
Age,88417.0,37.02,23.0907,0.0,18.0,37.0,55.0,115.0
Scholarship,88417.0,0.0989,0.2986,0.0,0.0,0.0,0.0,1.0
Hipertension,88417.0,0.1967,0.3975,0.0,0.0,0.0,0.0,1.0
Diabetes,88417.0,0.0719,0.2582,0.0,0.0,0.0,0.0,1.0
Alcoholism,88417.0,0.0299,0.1704,0.0,0.0,0.0,0.0,1.0
Handcap,88417.0,0.0225,0.1623,0.0,0.0,0.0,0.0,4.0
SMS_received,88417.0,0.3203,0.4666,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        fit 		X shape: (88417, 13) 	y shape: (88417,)
after ColumnTransformer        transform 	X shape: (88417, 13) 	y shape: None


Unnamed: 0,standardscaler__Age,standardscaler__Handcap,standardscaler__previous_no_show,standardscaler__neighbourhood,standardscaler__days_diff,standardscaler__scheduled_day,standardscaler__scheduled_hour,standardscaler__appointment_day,passthrough__SMS_received,passthrough__Scholarship,passthrough__Hipertension,passthrough__Diabetes,passthrough__Alcoholism
0,1.9480,-0.1383,0.1285,0.7199,0.2504,-1.3443,0.0716,-1.3582,0.0000,0.0000,1.0000,1.0000,0.0000
1,-1.4733,-0.1383,0.1285,-0.7581,-0.6650,0.1060,-0.8606,0.0998,0.0000,0.0000,0.0000,0.0000,0.0000
2,1.6448,-0.1383,0.1285,-0.5624,-0.6650,-0.6192,-0.2392,-0.6292,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.5621,-0.1383,0.1285,-1.7553,1.1004,0.8311,-0.5499,0.0998,1.0000,0.0000,0.0000,0.0000,0.0000
4,0.7787,-0.1383,0.1285,-1.1189,-0.4035,0.8311,1.3144,-1.3582,1.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88412,0.7787,-0.1383,0.1285,-0.8495,-0.4035,0.8311,-1.1713,-1.3582,1.0000,0.0000,1.0000,0.0000,0.0000
88413,-0.1308,-0.1383,-1.9659,-0.5624,-0.1419,-1.3443,0.3823,-0.6292,0.0000,0.0000,0.0000,0.0000,0.0000
88414,1.0818,-0.1383,0.1285,-0.5624,-0.2727,0.8311,-1.1713,0.0998,0.0000,0.0000,0.0000,0.0000,0.0000
88415,0.0857,-0.1383,0.1285,-1.3625,-0.4035,0.8311,0.6930,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
standardscaler__Age,88417.0,0.0,1.0,-1.6033,-0.8237,-0.0009,0.7787,3.3771
standardscaler__Handcap,88417.0,-0.0,1.0,-0.1383,-0.1383,-0.1383,-0.1383,24.5105
standardscaler__previous_no_show,88417.0,0.0,1.0,-2.4521,0.1285,0.1285,0.1285,23.6908
standardscaler__neighbourhood,88417.0,-0.0,1.0,-3.5101,-0.5796,-0.0732,0.5455,3.152
standardscaler__days_diff,88417.0,0.0,1.0,-0.665,-0.665,-0.4035,0.3158,11.0387
standardscaler__scheduled_day,88417.0,-0.0,1.0,-1.3443,-0.6192,0.106,0.8311,2.2814
standardscaler__scheduled_hour,88417.0,-0.0,1.0,-1.4821,-0.8606,-0.2392,0.693,3.1788
standardscaler__appointment_day,88417.0,0.0,1.0,-1.3582,-0.6292,0.0998,0.8287,2.2866
passthrough__SMS_received,88417.0,0.3203,0.4666,0.0,0.0,0.0,1.0,1.0
passthrough__Scholarship,88417.0,0.0989,0.2986,0.0,0.0,0.0,0.0,1.0






### predicting

before                         transform 	X shape: (22105, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
15332,2216934948346.0000,5354988,1,2016-02-16T14:32:50Z,2016-05-03T00:00:00Z,1,JESUS DE NAZARETH,0,0,0,0,0,1,77,0.3220,1.0000,1,14,1
93674,61598855925449.0000,5775761,0,2016-06-06T09:54:25Z,2016-06-06T00:00:00Z,62,BARRO VERMELHO,0,0,0,0,0,0,0,0.2510,1.0000,0,9,0
35232,6385853358899.0000,5609132,0,2016-04-20T15:13:24Z,2016-05-02T00:00:00Z,53,SANTA CECÍLIA,0,0,0,0,0,1,12,0.3778,1.0000,2,15,0
57781,54263839445181.0000,5657234,1,2016-05-04T09:05:57Z,2016-05-04T00:00:00Z,0,NOVA PALESTINA,0,0,0,0,0,0,0,0.2026,1.0000,2,9,2
72867,8454696544714.0000,5657045,0,2016-05-04T08:51:33Z,2016-05-04T00:00:00Z,46,ANDORINHAS,0,0,1,0,0,0,0,0.3001,1.0000,2,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54574,9671627622513.0000,5677407,0,2016-05-10T07:11:52Z,2016-05-16T00:00:00Z,31,ILHA DO PRÍNCIPE,0,0,0,0,0,0,6,0.2978,0.5000,1,7,0
59715,489676858689.0000,5648021,0,2016-05-02T14:54:15Z,2016-05-02T00:00:00Z,64,ILHA DE SANTA MARIA,0,0,0,0,0,0,0,0.2512,1.0000,0,14,0
81984,7216116453835.0000,5751342,0,2016-05-31T09:59:12Z,2016-05-31T00:00:00Z,78,ITARARÉ,0,1,1,0,1,0,0,0.3634,1.0000,1,9,1
39342,98214546148899.0000,5632553,0,2016-04-28T08:09:20Z,2016-05-17T00:00:00Z,55,ITARARÉ,0,1,0,0,0,0,19,0.3634,2.0000,3,8,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,22105.0,145313272366444.53,254538334482094.47,54223998.0,3989174191769.0,31627943533386.0,93284795988219.02,999981631772427.0
AppointmentID,22105.0,5675032.668,70891.1468,5139848.0,5639888.0,5680256.0,5725427.0,5790464.0
Gender,22105.0,0.3486,0.4765,0.0,0.0,0.0,1.0,1.0
Age,22105.0,37.365,23.1858,-1.0,18.0,37.0,56.0,102.0
Scholarship,22105.0,0.0957,0.2942,0.0,0.0,0.0,0.0,1.0
Hipertension,22105.0,0.1995,0.3996,0.0,0.0,0.0,0.0,1.0
Diabetes,22105.0,0.0719,0.2584,0.0,0.0,0.0,0.0,1.0
Alcoholism,22105.0,0.0322,0.1766,0.0,0.0,0.0,0.0,1.0
Handcap,22105.0,0.0214,0.1583,0.0,0.0,0.0,0.0,4.0
SMS_received,22105.0,0.3239,0.468,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None


Unnamed: 0,standardscaler__Age,standardscaler__Handcap,standardscaler__previous_no_show,standardscaler__neighbourhood,standardscaler__days_diff,standardscaler__scheduled_day,standardscaler__scheduled_hour,standardscaler__appointment_day,passthrough__SMS_received,passthrough__Scholarship,passthrough__Hipertension,passthrough__Diabetes,passthrough__Alcoholism
0,-1.5599,-0.1383,0.1285,1.5726,4.3695,-0.6192,1.0037,-0.6292,1.0000,0.0000,0.0000,0.0000,0.0000
1,1.0818,-0.1383,0.1285,-0.0801,-0.6650,-1.3443,-0.5499,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.6921,-0.1383,0.1285,2.8690,0.1196,0.1060,1.3144,-1.3582,1.0000,0.0000,0.0000,0.0000,0.0000
3,-1.6033,-0.1383,0.1285,-1.2041,-0.6650,0.1060,-0.5499,0.0998,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.3889,-0.1383,0.1285,1.0638,-0.6650,0.1060,-0.8606,0.0998,0.0000,0.0000,0.0000,1.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100,-0.2607,-0.1383,-1.1805,1.0099,-0.2727,-0.6192,-1.1713,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
22101,1.1684,-0.1383,0.1285,-0.0732,-0.6650,-1.3443,1.0037,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
22102,1.7747,6.0239,0.1285,2.5350,-0.6650,-0.6192,-0.5499,-0.6292,0.0000,0.0000,1.0000,1.0000,0.0000
22103,0.7787,-0.1383,2.7466,2.5350,0.5773,0.8311,-0.8606,-0.6292,0.0000,0.0000,1.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
standardscaler__Age,22105.0,0.0149,1.0041,-1.6466,-0.8237,-0.0009,0.822,2.8141
standardscaler__Handcap,22105.0,-0.0068,0.9755,-0.1383,-0.1383,-0.1383,-0.1383,24.5105
standardscaler__previous_no_show,22105.0,0.0396,0.8009,-2.4521,0.1285,0.1285,0.1285,23.6908
standardscaler__neighbourhood,22105.0,0.0009,0.9961,-3.5101,-0.5796,-0.0732,0.5455,3.152
standardscaler__days_diff,22105.0,0.0045,0.9871,-0.665,-0.665,-0.4035,0.3158,10.8425
standardscaler__scheduled_day,22105.0,-0.007,0.9981,-1.3443,-0.6192,0.106,0.8311,2.2814
standardscaler__scheduled_hour,22105.0,0.0074,0.9967,-1.4821,-0.8606,-0.2392,1.0037,3.1788
standardscaler__appointment_day,22105.0,-0.0178,0.9994,-1.3582,-0.6292,0.0998,0.8287,2.2866
passthrough__SMS_received,22105.0,0.3239,0.468,0.0,0.0,0.0,1.0,1.0
passthrough__Scholarship,22105.0,0.0957,0.2942,0.0,0.0,0.0,0.0,1.0






### predicting probabilities

before                         transform 	X shape: (22105, 13) 	y shape: None
FeaturerEngineering            transform
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
15332,2216934948346.0000,5354988,1,2016-02-16T14:32:50Z,2016-05-03T00:00:00Z,1,JESUS DE NAZARETH,0,0,0,0,0,1,77,0.3220,1.0000,1,14,1
93674,61598855925449.0000,5775761,0,2016-06-06T09:54:25Z,2016-06-06T00:00:00Z,62,BARRO VERMELHO,0,0,0,0,0,0,0,0.2510,1.0000,0,9,0
35232,6385853358899.0000,5609132,0,2016-04-20T15:13:24Z,2016-05-02T00:00:00Z,53,SANTA CECÍLIA,0,0,0,0,0,1,12,0.3778,1.0000,2,15,0
57781,54263839445181.0000,5657234,1,2016-05-04T09:05:57Z,2016-05-04T00:00:00Z,0,NOVA PALESTINA,0,0,0,0,0,0,0,0.2026,1.0000,2,9,2
72867,8454696544714.0000,5657045,0,2016-05-04T08:51:33Z,2016-05-04T00:00:00Z,46,ANDORINHAS,0,0,1,0,0,0,0,0.3001,1.0000,2,8,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54574,9671627622513.0000,5677407,0,2016-05-10T07:11:52Z,2016-05-16T00:00:00Z,31,ILHA DO PRÍNCIPE,0,0,0,0,0,0,6,0.2978,0.5000,1,7,0
59715,489676858689.0000,5648021,0,2016-05-02T14:54:15Z,2016-05-02T00:00:00Z,64,ILHA DE SANTA MARIA,0,0,0,0,0,0,0,0.2512,1.0000,0,14,0
81984,7216116453835.0000,5751342,0,2016-05-31T09:59:12Z,2016-05-31T00:00:00Z,78,ITARARÉ,0,1,1,0,1,0,0,0.3634,1.0000,1,9,1
39342,98214546148899.0000,5632553,0,2016-04-28T08:09:20Z,2016-05-17T00:00:00Z,55,ITARARÉ,0,1,0,0,0,0,19,0.3634,2.0000,3,8,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,22105.0,145313272366444.53,254538334482094.47,54223998.0,3989174191769.0,31627943533386.0,93284795988219.02,999981631772427.0
AppointmentID,22105.0,5675032.668,70891.1468,5139848.0,5639888.0,5680256.0,5725427.0,5790464.0
Gender,22105.0,0.3486,0.4765,0.0,0.0,0.0,1.0,1.0
Age,22105.0,37.365,23.1858,-1.0,18.0,37.0,56.0,102.0
Scholarship,22105.0,0.0957,0.2942,0.0,0.0,0.0,0.0,1.0
Hipertension,22105.0,0.1995,0.3996,0.0,0.0,0.0,0.0,1.0
Diabetes,22105.0,0.0719,0.2584,0.0,0.0,0.0,0.0,1.0
Alcoholism,22105.0,0.0322,0.1766,0.0,0.0,0.0,0.0,1.0
Handcap,22105.0,0.0214,0.1583,0.0,0.0,0.0,0.0,4.0
SMS_received,22105.0,0.3239,0.468,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None


Unnamed: 0,standardscaler__Age,standardscaler__Handcap,standardscaler__previous_no_show,standardscaler__neighbourhood,standardscaler__days_diff,standardscaler__scheduled_day,standardscaler__scheduled_hour,standardscaler__appointment_day,passthrough__SMS_received,passthrough__Scholarship,passthrough__Hipertension,passthrough__Diabetes,passthrough__Alcoholism
0,-1.5599,-0.1383,0.1285,1.5726,4.3695,-0.6192,1.0037,-0.6292,1.0000,0.0000,0.0000,0.0000,0.0000
1,1.0818,-0.1383,0.1285,-0.0801,-0.6650,-1.3443,-0.5499,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.6921,-0.1383,0.1285,2.8690,0.1196,0.1060,1.3144,-1.3582,1.0000,0.0000,0.0000,0.0000,0.0000
3,-1.6033,-0.1383,0.1285,-1.2041,-0.6650,0.1060,-0.5499,0.0998,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.3889,-0.1383,0.1285,1.0638,-0.6650,0.1060,-0.8606,0.0998,0.0000,0.0000,0.0000,1.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100,-0.2607,-0.1383,-1.1805,1.0099,-0.2727,-0.6192,-1.1713,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
22101,1.1684,-0.1383,0.1285,-0.0732,-0.6650,-1.3443,1.0037,-1.3582,0.0000,0.0000,0.0000,0.0000,0.0000
22102,1.7747,6.0239,0.1285,2.5350,-0.6650,-0.6192,-0.5499,-0.6292,0.0000,0.0000,1.0000,1.0000,0.0000
22103,0.7787,-0.1383,2.7466,2.5350,0.5773,0.8311,-0.8606,-0.6292,0.0000,0.0000,1.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
standardscaler__Age,22105.0,0.0149,1.0041,-1.6466,-0.8237,-0.0009,0.822,2.8141
standardscaler__Handcap,22105.0,-0.0068,0.9755,-0.1383,-0.1383,-0.1383,-0.1383,24.5105
standardscaler__previous_no_show,22105.0,0.0396,0.8009,-2.4521,0.1285,0.1285,0.1285,23.6908
standardscaler__neighbourhood,22105.0,0.0009,0.9961,-3.5101,-0.5796,-0.0732,0.5455,3.152
standardscaler__days_diff,22105.0,0.0045,0.9871,-0.665,-0.665,-0.4035,0.3158,10.8425
standardscaler__scheduled_day,22105.0,-0.007,0.9981,-1.3443,-0.6192,0.106,0.8311,2.2814
standardscaler__scheduled_hour,22105.0,0.0074,0.9967,-1.4821,-0.8606,-0.2392,1.0037,3.1788
standardscaler__appointment_day,22105.0,-0.0178,0.9994,-1.3582,-0.6292,0.0998,0.8287,2.2866
passthrough__SMS_received,22105.0,0.3239,0.468,0.0,0.0,0.0,1.0,1.0
passthrough__Scholarship,22105.0,0.0957,0.2942,0.0,0.0,0.0,0.0,1.0






### confusion matrix

array([[11462,  6180],
       [ 1360,  3103]])

array([[0.65, 0.35],
       [0.3 , 0.7 ]])



### scores

{'f1_score': 0.451,
 'precision_score': 0.334,
 'roc_auc_score': 0.743}


<br>

# Tune hyperparameters

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,
                                                    test_size=.2, shuffle=True, stratify=y)

In [27]:
search_params = RandomizedSearchCV(
    estimator=my_pipeline(RandomForestClassifier(random_state=random_state, class_weight='balanced')),
    param_distributions={
        'randomforestclassifier__n_estimators': [100, 110, 120, 130],
        'randomforestclassifier__max_depth': [None, 5, 10, 15]
    },
    n_jobs=-1,
    random_state=random_state)

In [28]:
%%time
search_params.fit(X_train, y_train)
search_params.best_params_

CPU times: user 12 s, sys: 603 ms, total: 12.6 s
Wall time: 1min 14s


{'randomforestclassifier__n_estimators': 120,
 'randomforestclassifier__max_depth': None}

In [29]:
scores(y_test,
       search_params.best_estimator_.predict(X_test),
       search_params.best_estimator_.predict_proba(X_test)[:, 1],
       scoring=['precision', 'f1', 'roc_auc'])

### confusion matrix

array([[16772,   870],
       [ 3708,   755]])

array([[0.95, 0.05],
       [0.83, 0.17]])



### scores

{'f1_score': 0.248,
 'precision_score': 0.465,
 'roc_auc_score': 0.733}
