In [1]:
import inspect
from pprint import pprint

from IPython.display import display, Markdown

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import get_scorer, confusion_matrix
from sklearn.metrics import f1_score, precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [2]:
# Render every float in DataFrame/Series output with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

In [3]:
def title(title, n=3):
    """Build a Markdown heading to be shown with ``display``.

    Parameters
    ----------
    title : str
        Heading text.
    n : int, default 3
        Requested heading level. The level actually used is ``n % 4`` with a
        floor of 1, so 1-3 map to themselves while 0 and 4 both give level 1.

    Returns
    -------
    IPython.display.Markdown
        The heading, e.g. ``### scores`` for ``title('scores')``.
    """
    level = max(n % 4, 1)
    return Markdown(f"{'#' * level} {title}")

In [4]:
# !ls -lah

In [5]:
# Location of the appointments CSV passed to `pd.read_csv` below.
# Alternative kept for reference: a local copy of the same file.
# filename = 'KaggleV2-May-2016.csv'
filename = 'https://github.com/dm6801/noshow_dataset/raw/master/KaggleV2-May-2016.csv'

In [6]:
# !wc -l "$filename"
# !head -n2 "$filename"

#### Variables' description:

    PatientId - identification of a patient;
    AppointmentID - identification of each appointment;
    Gender - male or female;
    ScheduledDay - day when a patient registered for an appointment;
    AppointmentDay - day of the actual appointment;
    Age - patient age;
    Neighbourhood - where the appointment takes place (hospital location);
    Scholarship - 1 for True, 0 for False. For more details read the article (the link is here: https://bit.ly/3AYv4GF);
    Hipertension - 1 for True, 0 for False;
    Diabetes - 1 for True, 0 for False;
    Alcoholism - 1 for True, 0 for False;
    Handcap - number of disabilities a patient has;
    SMS_received - 1 if one or more messages were sent to the patient, 0 otherwise;
    No-show - 'No' if the patient showed up to their appointment, 'Yes' if they did not show up.

In [7]:
# Read the appointment records from `filename` into a DataFrame.
df = pd.read_csv(filename)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110527 entries, 0 to 110526
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PatientId       110527 non-null  float64
 1   AppointmentID   110527 non-null  int64  
 2   Gender          110527 non-null  object 
 3   ScheduledDay    110527 non-null  object 
 4   AppointmentDay  110527 non-null  object 
 5   Age             110527 non-null  int64  
 6   Neighbourhood   110527 non-null  object 
 7   Scholarship     110527 non-null  int64  
 8   Hipertension    110527 non-null  int64  
 9   Diabetes        110527 non-null  int64  
 10  Alcoholism      110527 non-null  int64  
 11  Handcap         110527 non-null  int64  
 12  SMS_received    110527 non-null  int64  
 13  No-show         110527 non-null  object 
dtypes: float64(1), int64(8), object(5)
memory usage: 11.8+ MB


In [9]:
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872499824296.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997776694438.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962299951.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951213174.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186448183.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [10]:
# Encode the target as integers: 'No' (patient showed up) -> 0, 'Yes' (no-show) -> 1.
# The dtype guard keeps the cell idempotent: mapping an already encoded column a
# second time would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['No-show']):
    df['No-show'] = df['No-show'].map({'No': 0, 'Yes': 1})

In [11]:
# Encode gender as integers: 'F' -> 0, 'M' -> 1.
# The dtype guard keeps the cell idempotent: mapping an already encoded column a
# second time would find no matching keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Gender']):
    df['Gender'] = df['Gender'].map({'F': 0, 'M': 1})

In [12]:
# Inspect appointments dated before the day they were scheduled: the difference
# in calendar days (appointment - scheduling) is negative for those rows.
(pd.to_datetime(df['AppointmentDay']).dt.date
 .sub(pd.to_datetime(df['ScheduledDay']).dt.date)
 .dt.days
 .loc[lambda days: days < 0])

27033   -1
55226   -1
64175   -1
71533   -6
72362   -1
dtype: int64

In [13]:
# Remove, in place, the rows whose appointment date precedes the scheduling date
# (negative difference in calendar days).
df.drop(
    (pd.to_datetime(df['AppointmentDay']).dt.date
     .sub(pd.to_datetime(df['ScheduledDay']).dt.date)
     .dt.days
     .loc[lambda days: days < 0]
     .index),
    inplace=True)

In [14]:
# Separate the target column from the feature columns.
y = df['No-show']
X = df.drop(columns=['No-show'])

In [15]:
def cyclicalEncode(x, end, start=0, names=None):
    """Encode a periodic quantity as a (sine, cosine) pair.

    Every value is turned into the angle ``2 * pi * (x + start) / end`` and the
    sine and cosine of that angle are returned.

    Parameters
    ----------
    x : scalar, numpy.ndarray or pandas.Series
        Values to encode; anything that supports numpy arithmetic.
    end : number
        Divisor of the angle, i.e. the length of one full cycle.
    start : number, default 0
        Offset added to ``x`` before scaling.
    names : dict, optional
        May contain the keys ``'sin'`` and/or ``'cos'``; the corresponding
        values become the names of the returned Series. Names are applied only
        when the result is a pandas Series; other result types (scalars,
        arrays) are returned unchanged instead of raising.

    Returns
    -------
    tuple
        ``(sin, cos)``, each of the same kind as ``x``.
    """
    # ``None`` instead of a shared mutable ``{}`` default.
    names = names or {}

    angle = 2 * np.pi * (x + float(start)) / end
    sin, cos = np.sin(angle), np.cos(angle)

    # ``rename`` returns a new Series; no in-place mutation needed.
    if 'sin' in names and isinstance(sin, pd.Series):
        sin = sin.rename(names['sin'])
    if 'cos' in names and isinstance(cos, pd.Series):
        cos = cos.rename(names['cos'])

    return sin, cos

In [16]:
class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that reports the data flowing through it.

    ``fit`` and ``transform`` print one line with the shapes of ``X`` and ``y``
    and return without touching the data. When ``'transform'`` is listed in
    ``verbose``, ``transform`` additionally displays the data and its
    ``describe()`` summary.

    Parameters
    ----------
    name : str, default ''
        Label printed in front of every message; the class name is used when
        empty.
    verbose : list of str, default []
        Extra output to produce. Only ``'transform'`` is recognised.
    **kwargs
        ``col_tf`` : optional fitted transformer whose
        ``get_feature_names_out()`` supplies column names for the displayed
        frame. Other keys are ignored.
    """

    def __init__(self, name='', verbose=[], **kwargs):
        # `verbose` is stored as given and never mutated by this class, so the
        # list default is not modified between instances.
        self.name = name if name else self.__class__.__name__
        self.tag = f'{self.name:<30}'
        self.verbose = verbose
        # NOTE(review): `col_tf` arrives through **kwargs rather than as a named
        # parameter, so it is presumably not carried over by `sklearn.base.clone`
        # (e.g. inside `cross_validate`) -- confirm if column names are needed there.
        self.col_tf = kwargs.get('col_tf')


    def fit(self, X, y=None):
        """Print the shapes of ``X`` and ``y`` and return ``self`` unchanged."""
        print(self.tag, 'fit',
              '\t\tX shape:', X.shape,
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)

        return self


    def transform(self, X, y=None):
        """Print the shapes, optionally display the data, and return ``X`` as is."""
        print(self.tag, 'transform',
              '\tX shape:', X.shape,
              '\ty shape:', y.shape if isinstance(y, pd.Series) else None)

        if 'transform' in self.verbose:
            # Identity checks (`is None`) throughout: `!= None` on an ndarray is
            # evaluated element-wise and cannot be used as a condition.
            columns = (self.col_tf.get_feature_names_out()
                       if self.col_tf is not None else None)

            if not isinstance(X, pd.DataFrame):
                _X = pd.DataFrame(X, columns=columns)
            elif columns is not None:
                # Relabel a copy so the caller's frame keeps its own column names.
                _X = X.copy()
                _X.columns = columns
            else:
                _X = X

            if y is None or isinstance(y, pd.Series):
                _y = y
            else:
                _y = pd.Series(y)

            display(_X, _y)

            display(_X.describe().T)
            if _y is not None:
                display(_y.describe())

            print()

        return X

In [17]:
class FeaturerEngineering(BaseEstimator, TransformerMixin):
    """Add target-derived and date-derived features to the appointments frame.

    ``fit`` learns, from the training data, the ratio of no-show to show-up
    counts per neighbourhood and per patient. ``transform`` returns ``X`` with
    six extra columns:

    - ``days_diff``: calendar days between scheduling and appointment
    - ``neighbourhood``: learned ratio for the row's ``Neighbourhood``
    - ``previous_no_show``: learned ratio for the row's ``PatientId``
    - ``scheduled_day``: day of week of ``ScheduledDay`` (``dt.dayofweek``)
    - ``scheduled_hour``: hour of ``ScheduledDay``
    - ``appointment_day``: day of week of ``AppointmentDay``

    Expects ``X`` to contain the columns ``Neighbourhood``, ``PatientId``,
    ``ScheduledDay`` and ``AppointmentDay``, and ``y`` to be encoded as 0/1.

    NOTE(review): a ratio is NaN for any group that has only one of the two
    outcomes in the training data (e.g. a patient with a single appointment);
    such groups, like unseen ones, receive the median ratio in ``transform``.
    If a plain no-show rate was intended, this needs a modelling decision.

    NOTE(review): the ratios are computed from the same rows they are later
    mapped onto during training, so each training row's feature includes its
    own label (target leakage) -- consider out-of-fold encoding.

    Parameters
    ----------
    name : str, default ''
        Label for the step; the class name is used when empty.
    """

    def __init__(self, name=''):
        self.name = name if name else self.__class__.__name__


    @staticmethod
    def _no_show_ratio(keys, target):
        """Return, per distinct key, (# rows with target 1) / (# rows with target 0)."""
        # `size().unstack()` leaves NaN where a key never had that outcome,
        # so the ratio is NaN (not inf or 0) for one-sided groups.
        counts = target.groupby([keys, target]).size().unstack()
        return counts[1] / counts[0]


    @staticmethod
    def _calendar_day(timestamps):
        """Truncate timestamps to midnight of their own calendar day, without tz."""
        return timestamps.dt.tz_localize(None).dt.normalize()


    def fit(self, X, y=None):
        """Learn the per-neighbourhood and per-patient no-show ratios.

        Raises
        ------
        ValueError
            If ``y`` is not given; the ratios cannot be learned without it.
        """
        if y is None:
            raise ValueError('FeaturerEngineering.fit requires the target y')

        # Accept plain arrays and unnamed Series as well as named Series.
        target = y if isinstance(y, pd.Series) else pd.Series(np.asarray(y), index=X.index)

        self.neighbourhood_ratio = self._no_show_ratio(X['Neighbourhood'], target)
        self.previous_no_show_ratio = self._no_show_ratio(X['PatientId'], target)

        return self


    def transform(self, X, y=None):
        """Return a copy of ``X`` extended with the engineered feature columns."""
        scheduled_datetime = pd.to_datetime(X['ScheduledDay'])
        appointmt_datetime = pd.to_datetime(X['AppointmentDay'])

        # Whole-day difference computed on native datetimes instead of Python
        # `date` objects.
        days_diff = (self._calendar_day(appointmt_datetime)
                     - self._calendar_day(scheduled_datetime)).dt.days

        # Groups unseen in `fit`, or with a NaN ratio, fall back to the median ratio.
        neighbourhood = (X['Neighbourhood']
                         .map(self.neighbourhood_ratio)
                         .fillna(self.neighbourhood_ratio.median()))
        previous_no_show = (X['PatientId']
                            .map(self.previous_no_show_ratio)
                            .fillna(self.previous_no_show_ratio.median()))

        # `assign` keeps the original columns and their order, appends the new
        # ones, and overwrites them if the transform is applied twice.
        return X.assign(
            days_diff=days_diff,
            neighbourhood=neighbourhood,
            previous_no_show=previous_no_show,
            scheduled_day=scheduled_datetime.dt.dayofweek,
            scheduled_hour=scheduled_datetime.dt.hour,
            appointment_day=appointmt_datetime.dt.dayofweek,
        )

In [18]:
df.columns

Index(['PatientId', 'AppointmentID', 'Gender', 'ScheduledDay',
       'AppointmentDay', 'Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
       'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show'],
      dtype='object')

In [19]:
def my_pipeline(estimator, remainder='drop', verbose=0):
    """Assemble the feature-engineering + scaling pipeline around ``estimator``.

    Steps: ``FeaturerEngineering`` -> ``ColumnTransformer`` (min-max scaling of
    the selected columns) -> ``estimator``. With a truthy ``verbose``,
    ``DebugTransformer`` steps are inserted before and after each stage.

    Parameters
    ----------
    estimator : estimator
        Final step of the pipeline.
    remainder : default 'drop'
        Forwarded to ``make_column_transformer``; decides what happens to the
        columns that are not scaled.
    verbose : int, default 0
        0: no debug steps; any truthy value: debug steps that print shapes;
        2: the debug steps after each stage also display the data.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Columns rescaled to [0, 1]; with the default remainder every other column
    # (including 'Gender') is dropped.
    scaled_columns = [
        'Handcap',
        'previous_no_show',
        'Age',
        'days_diff',
        'neighbourhood',
        'scheduled_day',
        'scheduled_hour',
        'appointment_day',
        'SMS_received',
        'Scholarship',
        'Hipertension',
        'Diabetes',
        'Alcoholism',
    ]

    column_transformer = make_column_transformer(
        (MinMaxScaler(), scaled_columns),
        remainder=remainder,
        n_jobs=-1)

    if not verbose:
        return make_pipeline(FeaturerEngineering(), column_transformer, estimator)

    def shown():
        # A fresh list per debug step, so the steps never share one object.
        return ['transform'] if verbose == 2 else []

    return make_pipeline(
        DebugTransformer('before'),
        FeaturerEngineering(),
        DebugTransformer('after FeaturerEngineering', shown()),
        column_transformer,
        DebugTransformer('after ColumnTransformer', shown(),
                         col_tf=column_transformer),
        estimator)

In [20]:
def _holdout_score(scoring_name, y_true, y_pred, y_proba):
    """Return ``(metric_name, value)`` for one scoring name on hold-out predictions."""
    scorer = get_scorer(scoring_name)
    # `_score_func` and `_kwargs` are private scorer attributes: the underlying
    # metric function and the keyword arguments the scorer binds to it
    # (e.g. ``average='macro'``).
    score_func = scorer._score_func
    score_kwargs = getattr(scorer, '_kwargs', {})

    # `inspect.signature` looks through decorators that set `__wrapped__`;
    # `__code__.co_varnames` would describe the wrapper instead of the metric.
    wants_scores = 'y_score' in inspect.signature(score_func).parameters
    if wants_scores and y_proba is None:
        raise ValueError(
            f'{scoring_name!r} needs probability estimates, '
            'but the estimator has no predict_proba')

    y_hat = y_proba if wants_scores else y_pred
    return score_func.__name__, score_func(y_true, y_hat, **score_kwargs)


def estimate(estimator, X, y, scoring=None, remainder='drop', verbose=0, *args, **kwargs):
    """Cross-validate, fit and evaluate a classifier, displaying the results.

    The data is split 80/20 (stratified, ``random_state=123``). The pipeline is
    cross-validated on the training part, then fitted on it and evaluated on the
    held-out part: confusion matrix plus the requested scores.

    Parameters
    ----------
    estimator : estimator or sklearn.pipeline.Pipeline
        A ready pipeline is used as is; anything else is wrapped by
        ``my_pipeline``.
    X, y : DataFrame, Series
        Features and target.
    scoring : None, str or iterable of str, default None
        Scoring name(s) understood by ``sklearn.metrics.get_scorer``. With
        ``None`` the estimator's own ``score`` method is used.
    remainder, verbose
        Forwarded to ``my_pipeline`` (ignored when a Pipeline is passed).
        A truthy ``verbose`` also prints a heading for each fit/predict stage.
    *args, **kwargs
        Accepted for backward compatibility; unused.

    Returns
    -------
    None
        Results are displayed, not returned.
    """

    # Show a heading (only when verbose), run `func`, and hand back its result.
    def _print(_title, func, *args, **kwargs):
        if verbose: display(title(_title))
        out = func(*args, **kwargs)
        if verbose: print()
        return out


    # create pipeline
    if isinstance(estimator, Pipeline):
        pipeline = estimator
    else:
        pipeline = my_pipeline(estimator, remainder, verbose)
    display(title(pipeline.steps[-1][1].__class__.__name__, 2))
    pprint([s[1] for s in pipeline.steps], width=1)


    # split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123,
                                                        test_size=.2, shuffle=True, stratify=y)


    # cross validate
    cross = cross_validate(pipeline, X_train, y_train, scoring=scoring, n_jobs=-1)
    display(title('cross_validate'))
    pprint({k: round(v.mean(), 3) for k, v in cross.items()}, width=1)
    print()


    # fit, predict
    _print('fitting', pipeline.fit, X_train, y_train)
    y_pred = _print('predeicting', pipeline.predict, X_test)
    # Probabilities of the positive class, when the final estimator offers them.
    if hasattr(pipeline, 'predict_proba'):
        y_proba = _print('predicting probabilities', pipeline.predict_proba, X_test)[:, 1]
    else:
        y_proba = None


    # confusion matrix
    display(title('confusion matrix'))
    conf_mtx = confusion_matrix(y_test, y_pred)
    pprint(conf_mtx, width=1)
    print()


    # scores
    display(title('scores'))
    if scoring is None:
        # Same default as `cross_validate(scoring=None)`: the estimator's own score.
        print('score', round(pipeline.score(X_test, y_test), 3))
    elif isinstance(scoring, str):
        metric, value = _holdout_score(scoring, y_test, y_pred, y_proba)
        print(metric, round(value, 3))
    else:
        scores = dict(_holdout_score(name, y_test, y_pred, y_proba) for name in scoring)
        pprint({k: round(v, 3) for k, v in scores.items()}, width=1)

In [21]:
# Baseline with an explicit pipeline (standard scaling + random forest) on the
# raw columns, without the identifier, timestamp and neighbourhood columns.

estimate(
    make_pipeline(
        StandardScaler(),
        RandomForestClassifier(n_estimators=100, max_depth=13,
                               class_weight='balanced', random_state=100),
    ),
    X.drop(columns=['PatientId', 'AppointmentID', 'ScheduledDay',
                    'AppointmentDay', 'Neighbourhood']),
    y,
    scoring=['precision', 'f1', 'roc_auc'],
)

## RandomForestClassifier

[StandardScaler(),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': 3.186,
 'score_time': 0.454,
 'test_f1': 0.344,
 'test_precision': 0.275,
 'test_roc_auc': 0.6}



### confusion matrix

array([[12178,  5464],
       [ 2419,  2044]])



### scores

{'f1_score': 0.341,
 'precision_score': 0.272,
 'roc_auc_score': 0.598}


In [22]:
# Same random forest, wrapped by `estimate` in the custom pipeline
# (feature engineering + scaling); verbose=1 prints the shapes at each step.

estimate(
    RandomForestClassifier(n_estimators=100, max_depth=13,
                           class_weight='balanced', random_state=100),
    X,
    y,
    scoring=['precision', 'f1', 'roc_auc'],
    verbose=1,
)

## RandomForestClassifier

[DebugTransformer(name='before'),
 FeaturerEngineering(name='FeaturerEngineering'),
 DebugTransformer(name='after FeaturerEngineering'),
 ColumnTransformer(n_jobs=-1,
                  transformers=[('minmaxscaler', MinMaxScaler(),
                                 ['Handcap', 'previous_no_show', 'Age',
                                  'days_diff', 'neighbourhood', 'scheduled_day',
                                  'scheduled_hour', 'appointment_day',
                                  'SMS_received', 'Scholarship', 'Hipertension',
                                  'Diabetes', 'Alcoholism'])]),
 DebugTransformer(name='after ColumnTransformer'),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': 6.435,
 'score_time': 0.632,
 'test_f1': 0.451,
 'test_precision': 0.334,
 'test_roc_auc': 0.741}



### fitting

before                         fit 		X shape: (88417, 13) 	y shape: (88417,)
before                         transform 	X shape: (88417, 13) 	y shape: None
after FeaturerEngineering      fit 		X shape: (88417, 19) 	y shape: (88417,)
after FeaturerEngineering      transform 	X shape: (88417, 19) 	y shape: None
after ColumnTransformer        fit 		X shape: (88417, 13) 	y shape: (88417,)
after ColumnTransformer        transform 	X shape: (88417, 13) 	y shape: None



### predeicting

before                         transform 	X shape: (22105, 13) 	y shape: None
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None
after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None



### predicting probabilities

before                         transform 	X shape: (22105, 13) 	y shape: None
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None
after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None



### confusion matrix

array([[11389,  6253],
       [ 1338,  3125]])



### scores

{'f1_score': 0.452,
 'precision_score': 0.333,
 'roc_auc_score': 0.74}


In [23]:
# Custom pipeline again with verbose=2: the debug steps also display the data
# and its summary after each stage.
# (remainder='passthrough' can be passed to keep the unscaled columns.)

estimate(
    RandomForestClassifier(n_estimators=100, max_depth=13,
                           class_weight='balanced', random_state=100),
    X,
    y,
    scoring=['precision', 'f1', 'roc_auc'],
    verbose=2,
)

## RandomForestClassifier

[DebugTransformer(name='before'),
 FeaturerEngineering(name='FeaturerEngineering'),
 DebugTransformer(name='after FeaturerEngineering', verbose=['transform']),
 ColumnTransformer(n_jobs=-1,
                  transformers=[('minmaxscaler', MinMaxScaler(),
                                 ['Handcap', 'previous_no_show', 'Age',
                                  'days_diff', 'neighbourhood', 'scheduled_day',
                                  'scheduled_hour', 'appointment_day',
                                  'SMS_received', 'Scholarship', 'Hipertension',
                                  'Diabetes', 'Alcoholism'])]),
 DebugTransformer(name='after ColumnTransformer', verbose=['transform']),
 RandomForestClassifier(class_weight='balanced', max_depth=13, random_state=100)]


### cross_validate

{'fit_time': 6.223,
 'score_time': 0.871,
 'test_f1': 0.451,
 'test_precision': 0.334,
 'test_roc_auc': 0.741}



### fitting

before                         fit 		X shape: (88417, 13) 	y shape: (88417,)
before                         transform 	X shape: (88417, 13) 	y shape: None
after FeaturerEngineering      fit 		X shape: (88417, 19) 	y shape: (88417,)
after FeaturerEngineering      transform 	X shape: (88417, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
99995,71599135144931.0000,5755900,0,2016-06-01T07:46:49Z,2016-06-08T00:00:00Z,28,RESISTÊNCIA,0,0,0,0,0,1,7,0.2626,1.0000,2,7,2
83756,227248344584.0000,5738454,0,2016-05-25T11:01:45Z,2016-05-25T00:00:00Z,55,TABUAZEIRO,0,0,0,0,0,0,0,0.2212,1.0000,2,11,2
29069,9594968327911.0000,5683072,0,2016-05-11T07:11:57Z,2016-05-19T00:00:00Z,4,SANTO ANDRÉ,0,0,0,0,0,0,8,0.2518,1.0000,2,7,3
79045,462782469997.0000,5750823,1,2016-05-31T09:22:31Z,2016-05-31T00:00:00Z,5,ILHA DE SANTA MARIA,0,0,0,0,0,0,0,0.2365,1.0000,1,9,1
62274,24349994988665.0000,5585699,1,2016-04-14T17:47:21Z,2016-05-17T00:00:00Z,43,JABOUR,0,0,0,0,0,0,33,0.2173,1.0000,3,17,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73603,96244428927879.0000,5719482,1,2016-05-19T11:26:28Z,2016-05-19T00:00:00Z,16,MARIA ORTIZ,0,0,0,0,0,0,0,0.2694,1.0000,3,11,3
33587,22653384379946.0000,5665262,0,2016-05-05T14:10:45Z,2016-05-09T00:00:00Z,63,CENTRO,0,0,0,0,0,0,4,0.2711,1.0000,3,14,0
19424,8484929899753.0000,5623999,0,2016-04-26T15:01:45Z,2016-05-10T00:00:00Z,63,ILHA DE SANTA MARIA,0,0,0,0,0,1,14,0.2365,2.0000,1,15,1
96202,9845148296311.0000,5677190,0,2016-05-09T18:43:20Z,2016-06-07T00:00:00Z,45,JARDIM DA PENHA,0,1,0,0,0,1,29,0.1883,1.0000,0,18,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,88417.0,147298107406615.75,256010614417034.47,39217.8444,4133985777999.0,31627943533386.0,94336536145654.0,999981631772427.0
AppointmentID,88417.0,5675544.4295,71297.6541,5030230.0,5640523.0,5680724.0,5725839.0,5790484.0
Gender,88417.0,0.3512,0.4773,0.0,0.0,0.0,1.0,1.0
Age,88417.0,37.0791,23.0944,-1.0,18.0,37.0,55.0,115.0
Scholarship,88417.0,0.0982,0.2975,0.0,0.0,0.0,0.0,1.0
Hipertension,88417.0,0.1972,0.3979,0.0,0.0,0.0,0.0,1.0
Diabetes,88417.0,0.0718,0.2581,0.0,0.0,0.0,0.0,1.0
Alcoholism,88417.0,0.0309,0.173,0.0,0.0,0.0,0.0,1.0
Handcap,88417.0,0.0221,0.1609,0.0,0.0,0.0,0.0,4.0
SMS_received,88417.0,0.3203,0.4666,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        fit 		X shape: (88417, 13) 	y shape: (88417,)
after ColumnTransformer        transform 	X shape: (88417, 13) 	y shape: None


Unnamed: 0,minmaxscaler__Handcap,minmaxscaler__previous_no_show,minmaxscaler__Age,minmaxscaler__days_diff,minmaxscaler__neighbourhood,minmaxscaler__scheduled_day,minmaxscaler__scheduled_hour,minmaxscaler__appointment_day,minmaxscaler__SMS_received,minmaxscaler__Scholarship,minmaxscaler__Hipertension,minmaxscaler__Diabetes,minmaxscaler__Alcoholism
0,0.0000,0.1097,0.2500,0.0391,0.4755,0.4000,0.0667,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
1,0.0000,0.1097,0.4828,0.0000,0.3323,0.4000,0.3333,0.4000,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.0000,0.1097,0.0431,0.0447,0.4382,0.4000,0.0667,0.6000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.0000,0.1097,0.0517,0.0000,0.3853,0.2000,0.2000,0.2000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.0000,0.1097,0.3793,0.1844,0.3189,0.6000,0.7333,0.2000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88412,0.0000,0.1097,0.1466,0.0000,0.4989,0.6000,0.3333,0.6000,0.0000,0.0000,0.0000,0.0000,0.0000
88413,0.0000,0.1097,0.5517,0.0223,0.5046,0.6000,0.5333,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
88414,0.0000,0.2210,0.5517,0.0782,0.3853,0.2000,0.6000,0.2000,1.0000,0.0000,0.0000,0.0000,0.0000
88415,0.0000,0.1097,0.3966,0.1620,0.2189,0.0000,0.8000,0.2000,1.0000,0.0000,1.0000,0.0000,0.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
minmaxscaler__Handcap,88417.0,0.0055,0.0402,0.0,0.0,0.0,0.0,1.0
minmaxscaler__previous_no_show,88417.0,0.1043,0.0416,0.0,0.1097,0.1097,0.1097,1.0
minmaxscaler__Age,88417.0,0.3283,0.1991,0.0,0.1638,0.3276,0.4828,1.0
minmaxscaler__days_diff,88417.0,0.0567,0.0852,0.0,0.0,0.0223,0.0782,1.0
minmaxscaler__neighbourhood,88417.0,0.4476,0.157,0.0,0.3371,0.4338,0.5046,1.0
minmaxscaler__scheduled_day,88417.0,0.3703,0.2754,0.0,0.2,0.4,0.6,1.0
minmaxscaler__scheduled_hour,88417.0,0.3183,0.2144,0.0,0.1333,0.2667,0.4667,1.0
minmaxscaler__appointment_day,88417.0,0.3717,0.2739,0.0,0.2,0.4,0.6,1.0
minmaxscaler__SMS_received,88417.0,0.3203,0.4666,0.0,0.0,0.0,1.0,1.0
minmaxscaler__Scholarship,88417.0,0.0982,0.2975,0.0,0.0,0.0,0.0,1.0






### predeicting

before                         transform 	X shape: (22105, 13) 	y shape: None
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
98633,67811388333888.0000,5673541,0,2016-05-09T09:31:31Z,2016-06-01T00:00:00Z,50,MARIA ORTIZ,0,0,0,0,0,1,23,0.2694,1.0000,0,9,2
56036,889232264943.0000,5638544,0,2016-04-29T08:08:41Z,2016-05-03T00:00:00Z,55,SANTO ANDRÉ,0,0,0,0,0,1,4,0.2518,1.0000,4,8,1
267,2616716224317.0000,5639738,0,2016-04-29T09:29:55Z,2016-04-29T00:00:00Z,9,MARIA ORTIZ,0,0,0,0,0,0,0,0.2694,1.0000,4,9,4
82513,292235549422.0000,5658351,0,2016-05-04T10:58:23Z,2016-05-25T00:00:00Z,33,ITARARÉ,0,0,0,0,0,1,21,0.3547,1.0000,2,10,2
20923,162265377816667.0000,5698567,0,2016-05-16T07:40:41Z,2016-05-17T00:00:00Z,15,NOVA PALESTINA,0,0,0,0,0,0,1,0.2172,1.0000,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92832,5626541457177.0000,5772419,0,2016-06-03T15:21:56Z,2016-06-08T00:00:00Z,25,JARDIM CAMBURI,0,0,0,0,0,1,5,0.2384,1.0000,4,15,2
1123,3512173414334.0000,5524870,0,2016-03-30T08:19:28Z,2016-04-29T00:00:00Z,51,FORTE SÃO JOÃO,0,0,0,0,0,0,30,0.2275,1.0000,2,8,4
27937,1298339162661.0000,5680725,0,2016-05-10T12:30:36Z,2016-05-13T00:00:00Z,71,MATA DA PRAIA,0,0,0,0,0,0,3,0.2113,1.0000,1,12,4
97570,996878977419.0000,5782796,0,2016-06-07T12:20:51Z,2016-06-07T00:00:00Z,36,SANTO ANTÔNIO,0,0,0,1,0,0,0,0.2014,1.0000,1,12,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,22105.0,148275095711983.0,256394642324419.03,43741.7565,4316314596129.0,32272726434943.0,94624885965183.02,999637954175253.0
AppointmentID,22105.0,5674346.7402,71289.0481,5134249.0,5639502.0,5679825.0,5724316.0,5790367.0
Gender,22105.0,0.3454,0.4755,0.0,0.0,0.0,1.0,1.0
Age,22105.0,37.129,23.1732,0.0,18.0,37.0,55.0,115.0
Scholarship,22105.0,0.0987,0.2983,0.0,0.0,0.0,0.0,1.0
Hipertension,22105.0,0.1975,0.3981,0.0,0.0,0.0,0.0,1.0
Diabetes,22105.0,0.0723,0.259,0.0,0.0,0.0,0.0,1.0
Alcoholism,22105.0,0.0285,0.1665,0.0,0.0,0.0,0.0,1.0
Handcap,22105.0,0.0227,0.1637,0.0,0.0,0.0,0.0,4.0
SMS_received,22105.0,0.324,0.468,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None


Unnamed: 0,minmaxscaler__Handcap,minmaxscaler__previous_no_show,minmaxscaler__Age,minmaxscaler__days_diff,minmaxscaler__neighbourhood,minmaxscaler__scheduled_day,minmaxscaler__scheduled_hour,minmaxscaler__appointment_day,minmaxscaler__SMS_received,minmaxscaler__Scholarship,minmaxscaler__Hipertension,minmaxscaler__Diabetes,minmaxscaler__Alcoholism
0,0.0000,0.1097,0.4397,0.1285,0.4989,0.0000,0.2000,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
1,0.0000,0.1097,0.4828,0.0223,0.4382,0.8000,0.1333,0.2000,1.0000,0.0000,0.0000,0.0000,0.0000
2,0.0000,0.1097,0.0862,0.0000,0.4989,0.8000,0.2000,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.0000,0.1097,0.2931,0.1173,0.7935,0.4000,0.2667,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
4,0.0000,0.1097,0.1379,0.0056,0.3184,0.0000,0.0667,0.2000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100,0.0000,0.1097,0.2241,0.0279,0.3918,0.8000,0.6000,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
22101,0.0000,0.1097,0.4483,0.1676,0.3542,0.4000,0.1333,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
22102,0.0000,0.1097,0.6207,0.0168,0.2980,0.2000,0.4000,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
22103,0.0000,0.1097,0.3190,0.0000,0.2640,0.2000,0.4000,0.2000,0.0000,0.0000,0.0000,0.0000,1.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
minmaxscaler__Handcap,22105.0,0.0057,0.0409,0.0,0.0,0.0,0.0,1.0
minmaxscaler__previous_no_show,22105.0,0.1066,0.0363,0.0,0.1097,0.1097,0.1097,1.0
minmaxscaler__Age,22105.0,0.3287,0.1998,0.0086,0.1638,0.3276,0.4828,1.0
minmaxscaler__days_diff,22105.0,0.0575,0.0853,0.0,0.0,0.0223,0.0838,1.0
minmaxscaler__neighbourhood,22105.0,0.4464,0.1562,0.0,0.3371,0.4293,0.5046,1.0
minmaxscaler__scheduled_day,22105.0,0.3708,0.2769,0.0,0.2,0.4,0.6,1.0
minmaxscaler__scheduled_hour,22105.0,0.3183,0.2144,0.0,0.1333,0.2667,0.5333,0.9333
minmaxscaler__appointment_day,22105.0,0.3716,0.276,0.0,0.2,0.4,0.6,1.0
minmaxscaler__SMS_received,22105.0,0.324,0.468,0.0,0.0,0.0,1.0,1.0
minmaxscaler__Scholarship,22105.0,0.0987,0.2983,0.0,0.0,0.0,0.0,1.0






### predicting probabilities

before                         transform 	X shape: (22105, 13) 	y shape: None
after FeaturerEngineering      transform 	X shape: (22105, 19) 	y shape: None


Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,days_diff,neighbourhood,previous_no_show,scheduled_day,scheduled_hour,appointment_day
98633,67811388333888.0000,5673541,0,2016-05-09T09:31:31Z,2016-06-01T00:00:00Z,50,MARIA ORTIZ,0,0,0,0,0,1,23,0.2694,1.0000,0,9,2
56036,889232264943.0000,5638544,0,2016-04-29T08:08:41Z,2016-05-03T00:00:00Z,55,SANTO ANDRÉ,0,0,0,0,0,1,4,0.2518,1.0000,4,8,1
267,2616716224317.0000,5639738,0,2016-04-29T09:29:55Z,2016-04-29T00:00:00Z,9,MARIA ORTIZ,0,0,0,0,0,0,0,0.2694,1.0000,4,9,4
82513,292235549422.0000,5658351,0,2016-05-04T10:58:23Z,2016-05-25T00:00:00Z,33,ITARARÉ,0,0,0,0,0,1,21,0.3547,1.0000,2,10,2
20923,162265377816667.0000,5698567,0,2016-05-16T07:40:41Z,2016-05-17T00:00:00Z,15,NOVA PALESTINA,0,0,0,0,0,0,1,0.2172,1.0000,0,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92832,5626541457177.0000,5772419,0,2016-06-03T15:21:56Z,2016-06-08T00:00:00Z,25,JARDIM CAMBURI,0,0,0,0,0,1,5,0.2384,1.0000,4,15,2
1123,3512173414334.0000,5524870,0,2016-03-30T08:19:28Z,2016-04-29T00:00:00Z,51,FORTE SÃO JOÃO,0,0,0,0,0,0,30,0.2275,1.0000,2,8,4
27937,1298339162661.0000,5680725,0,2016-05-10T12:30:36Z,2016-05-13T00:00:00Z,71,MATA DA PRAIA,0,0,0,0,0,0,3,0.2113,1.0000,1,12,4
97570,996878977419.0000,5782796,0,2016-06-07T12:20:51Z,2016-06-07T00:00:00Z,36,SANTO ANTÔNIO,0,0,0,1,0,0,0,0.2014,1.0000,1,12,1


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientId,22105.0,148275095711983.0,256394642324419.03,43741.7565,4316314596129.0,32272726434943.0,94624885965183.02,999637954175253.0
AppointmentID,22105.0,5674346.7402,71289.0481,5134249.0,5639502.0,5679825.0,5724316.0,5790367.0
Gender,22105.0,0.3454,0.4755,0.0,0.0,0.0,1.0,1.0
Age,22105.0,37.129,23.1732,0.0,18.0,37.0,55.0,115.0
Scholarship,22105.0,0.0987,0.2983,0.0,0.0,0.0,0.0,1.0
Hipertension,22105.0,0.1975,0.3981,0.0,0.0,0.0,0.0,1.0
Diabetes,22105.0,0.0723,0.259,0.0,0.0,0.0,0.0,1.0
Alcoholism,22105.0,0.0285,0.1665,0.0,0.0,0.0,0.0,1.0
Handcap,22105.0,0.0227,0.1637,0.0,0.0,0.0,0.0,4.0
SMS_received,22105.0,0.324,0.468,0.0,0.0,0.0,1.0,1.0



after ColumnTransformer        transform 	X shape: (22105, 13) 	y shape: None


Unnamed: 0,minmaxscaler__Handcap,minmaxscaler__previous_no_show,minmaxscaler__Age,minmaxscaler__days_diff,minmaxscaler__neighbourhood,minmaxscaler__scheduled_day,minmaxscaler__scheduled_hour,minmaxscaler__appointment_day,minmaxscaler__SMS_received,minmaxscaler__Scholarship,minmaxscaler__Hipertension,minmaxscaler__Diabetes,minmaxscaler__Alcoholism
0,0.0000,0.1097,0.4397,0.1285,0.4989,0.0000,0.2000,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
1,0.0000,0.1097,0.4828,0.0223,0.4382,0.8000,0.1333,0.2000,1.0000,0.0000,0.0000,0.0000,0.0000
2,0.0000,0.1097,0.0862,0.0000,0.4989,0.8000,0.2000,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.0000,0.1097,0.2931,0.1173,0.7935,0.4000,0.2667,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
4,0.0000,0.1097,0.1379,0.0056,0.3184,0.0000,0.0667,0.2000,0.0000,0.0000,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100,0.0000,0.1097,0.2241,0.0279,0.3918,0.8000,0.6000,0.4000,1.0000,0.0000,0.0000,0.0000,0.0000
22101,0.0000,0.1097,0.4483,0.1676,0.3542,0.4000,0.1333,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
22102,0.0000,0.1097,0.6207,0.0168,0.2980,0.2000,0.4000,0.8000,0.0000,0.0000,0.0000,0.0000,0.0000
22103,0.0000,0.1097,0.3190,0.0000,0.2640,0.2000,0.4000,0.2000,0.0000,0.0000,0.0000,0.0000,1.0000


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
minmaxscaler__Handcap,22105.0,0.0057,0.0409,0.0,0.0,0.0,0.0,1.0
minmaxscaler__previous_no_show,22105.0,0.1066,0.0363,0.0,0.1097,0.1097,0.1097,1.0
minmaxscaler__Age,22105.0,0.3287,0.1998,0.0086,0.1638,0.3276,0.4828,1.0
minmaxscaler__days_diff,22105.0,0.0575,0.0853,0.0,0.0,0.0223,0.0838,1.0
minmaxscaler__neighbourhood,22105.0,0.4464,0.1562,0.0,0.3371,0.4293,0.5046,1.0
minmaxscaler__scheduled_day,22105.0,0.3708,0.2769,0.0,0.2,0.4,0.6,1.0
minmaxscaler__scheduled_hour,22105.0,0.3183,0.2144,0.0,0.1333,0.2667,0.5333,0.9333
minmaxscaler__appointment_day,22105.0,0.3716,0.276,0.0,0.2,0.4,0.6,1.0
minmaxscaler__SMS_received,22105.0,0.324,0.468,0.0,0.0,0.0,1.0,1.0
minmaxscaler__Scholarship,22105.0,0.0987,0.2983,0.0,0.0,0.0,0.0,1.0






### confusion matrix

array([[11389,  6253],
       [ 1338,  3125]])



### scores

{'f1_score': 0.452,
 'precision_score': 0.333,
 'roc_auc_score': 0.74}
