In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder



# additional experiement packages
import missingno as msno

%matplotlib inline

In [151]:
# loading dataset 
training = pd.read_csv("./input/training_v2.csv")
solution_template = pd.read_csv("./input/solution_template.csv")
samplesubmission = pd.read_csv("./input/samplesubmission.csv")
unlabeled = pd.read_csv("./input/unlabeled.csv")
dictionary = pd.read_csv("./input/WiDS Datathon 2020 Dictionary.csv")

In [152]:
X, y =  training.drop(columns=['hospital_death']), training[['hospital_death']]

In [153]:
# train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [154]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(73370, 185) (18343, 185) (73370, 1) (18343, 1)


In [17]:
## create transformers for pipeline

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## data needs to be at preprocessing stage
# need to undo steps below

In [17]:
# Missing value imputer. Replace NaN with 'missing_value' for categorical fields.
categorical_transformer  = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')),
                                           ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Standardise numerical fields
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                      ('scaler', StandardScaler())])

In [18]:
# create a list of features with specific data types for processing

numeric_features     = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns


In [19]:
# Assemble categorical and numerical pipelines
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [20]:
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                             

In [21]:
# https://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn

from sklearn.externals import joblib

# now you can save it to a file
# joblib.dump(preprocessor, 'preprocessor.pkl') 

# and later you can load it
# preprocessor_load = joblib.load('preprocessor.pkl') 



In [22]:
def drop_missing_columns(df):
    list_to_drop = df.columns[(df.isnull().sum()/df.shape[0])>0.1]
    df.drop(columns=list_to_drop, inplace=True)
    return df

In [27]:
def drop_missing_rows(df):
    rows_to_drop = df[(df.isnull().sum(axis=1)/df.shape[1])>0.2].index
    df.drop(rows_to_drop, inplace=True)
    return df

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

In [197]:
# Reference
# https://github.com/jem1031/pandas-pipelines-custom-transformers/blob/master/code/custom_transformers.py
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

class drop_missing_rows_columns(BaseEstimator, TransformerMixin):
    
    
    def __init__(self, column_threshold=0.1, row_threshold=0.2):
        
        # attributes
        self.column_threshold = column_threshold
        self.row_threshold = row_threshold
        self.cols_to_drop = []
        self.rows_to_drop = []
        
    def fit(self, X):
        
        # identify missing columns and rows
        self.cols_to_drop = X.columns[(X.isnull().sum()/X.shape[0]) < self.column_threshold].to_list()
        self.rows_to_drop = X[(X.isnull().sum(axis=1)/X.shape[1]) < self.row_threshold].index.to_list()
        
        return self
    
    def transform(self, X):
        
        # missing columns threshold 0.1
        X = X.drop(columns=self.cols_to_drop)
        
        # missing rows, threshold 0.2
        X = X.drop(self.rows_to_drop)
        
        return X
        


In [198]:
test_drop = drop_missing_rows_columns(column_threshold=0.1, row_threshold=0.2)

In [199]:
test_drop.fit(X_train)

drop_missing_rows_columns(column_threshold=0.1, row_threshold=0.2)

In [200]:
test_drop.rows_to_drop?

[1;31mType:[0m        list
[1;31mString form:[0m [1518, 58744, 19909, 56735, 62066, 81024, 65013, 18147, 38266, 81506, 39359, 8334, 58094, 47054,  <...> 5, 22662, 39504, 62592, 71211, 45758, 66557, 89475, 35920, 41606, 3890, 66803, 53707, 64925, 769]
[1;31mLength:[0m      8796
[1;31mDocstring:[0m  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [201]:
test_drop.transform(X_train)

Unnamed: 0,hospital_admit_source,albumin_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,glucose_apache,hematocrit_apache,paco2_apache,paco2_for_ph_apache,...,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min
4609,Direct Admit,,,,,,70.0,,,,...,,,,,,,,,,
75674,,,,,,,,29.3,,,...,427.000000,427.0,,,,,,,,
84022,Emergency Department,,,6.0,0.77,,122.0,35.8,,,...,,,,,,,,,,
38035,Emergency Department,,,127.0,7.90,0.5,279.0,19.3,28.4,28.4,...,386.666667,245.6,18.700,18.7,7.210,7.210,81.2,81.2,386.666667,386.666667
24371,Emergency Department,2.5,0.3,10.0,0.66,,70.0,21.3,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,Operating Room,,,8.0,0.74,,78.0,33.7,,,...,,,,,,,,,,
54886,Recovery Room,,,27.0,1.82,,106.0,28.2,,,...,,,,,,,,,,
76820,Emergency Department,,,,,,201.0,,,,...,,,,,,,,,,
860,Emergency Department,3.4,0.4,18.0,1.10,1.0,151.0,49.9,42.0,42.0,...,138.333333,46.0,111.505,107.0,7.270,7.270,172.0,172.0,,


In [163]:
# Missing value imputer. Replace NaN with 'missing_value' for categorical fields.
categorical_transformer  = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant')),
                                           ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Standardise numerical fields
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                      ('scaler', StandardScaler())])

In [164]:
# create a list of features with specific data types for processing

numeric_features     = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns


In [165]:
# Assemble categorical and numerical pipelines
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [166]:
pipeline = Pipeline(steps=[('drop_missing_rows_columns', drop_missing_rows_columns()),
                          ('preprocessor', preprocessor)])

In [168]:
pipeline = Pipeline(steps=[('drop_missing_rows_columns', drop_missing_rows_columns())])

In [169]:
X_train

Unnamed: 0,encounter_id,patient_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
4609,59954,90449,118,86.0,28.168975,0,Caucasian,M,170.2,Direct Admit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic
75674,63518,19384,185,72.0,36.635088,0,Caucasian,M,170.1,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Gastrointestinal,Gastrointestinal
84022,34401,20558,188,36.0,27.459684,0,African American,M,162.6,Emergency Department,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
38035,71581,112066,62,60.0,21.977351,0,Caucasian,M,181.0,Emergency Department,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Genitourinary,Renal/Genitourinary
24371,29019,129440,161,27.0,19.960244,0,Caucasian,F,177.8,Emergency Department,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Sepsis,Cardiovascular
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,47848,567,118,16.0,22.378743,1,Hispanic,M,165.1,Operating Room,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Neurological,Neurologic
54886,120944,108645,194,70.0,29.475309,1,Caucasian,M,180.0,Recovery Room,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
76820,89789,87549,103,72.0,30.827304,1,,M,187.0,Emergency Department,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
860,7917,34465,118,46.0,61.339079,0,African American,M,167.6,Emergency Department,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory


In [171]:
pipeline.fit(X_train)

TypeError: fit() takes 2 positional arguments but 3 were given

In [138]:
pipeline.fit(X_train)

ValueError: 'albumin_apache' is not in list

In [139]:
pipeline.transform(X_train)

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [128]:
pd.DataFrame(pipeline.transform(X_train))

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [129]:
pipeline.fit_transform(X_train)

ValueError: 'albumin_apache' is not in list