In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.impute import SimpleImputer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder



# additional experiment packages
import missingno as msno

%matplotlib inline

In [151]:
# Read every competition CSV from the local ./input directory in one pass;
# the names on the left line up one-to-one with the paths listed below.
training, solution_template, samplesubmission, unlabeled, dictionary = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input/training_v2.csv",
        "./input/solution_template.csv",
        "./input/samplesubmission.csv",
        "./input/unlabeled.csv",
        "./input/WiDS Datathon 2020 Dictionary.csv",
    )
)

In [152]:
# Separate the target from the predictors.
# The target is selected with a list of columns, so y stays a one-column
# DataFrame rather than a Series.
y = training[['hospital_death']]
X = training.drop(columns=['hospital_death'])

In [153]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [154]:
# Sanity check: shapes of the four pieces produced by the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(73370, 185) (18343, 185) (73370, 1) (18343, 1)


In [17]:
## create transformers for pipeline

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## data needs to be at preprocessing stage
# need to undo steps below

In [17]:
# Numerical fields: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical fields: fill gaps with a constant placeholder, then one-hot
# encode. handle_unknown='ignore' keeps categories that were not seen during
# fit from raising at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [18]:
# Partition the training columns by dtype so that each group can be routed to
# its own transformer: numeric columns on one side, everything else on the other.
categorical_features = X_train.select_dtypes(exclude=np.number).columns
numeric_features = X_train.select_dtypes(include=np.number).columns


In [19]:
# Assemble categorical and numerical pipelines
from sklearn.compose import ColumnTransformer

# Route each column group to its transformer. Columns that appear in neither
# list are not passed to any transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [20]:
# Fit the preprocessing steps on the training split only. Because this is the
# cell's last expression, the fitted ColumnTransformer is also echoed as output.
preprocessor.fit(X_train)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('imputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0)),
                                                 ('scaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                             

In [21]:
# https://stackoverflow.com/questions/10592605/save-classifier-to-disk-in-scikit-learn

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package (a scikit-learn dependency)
# and only fall back to the vendored copy on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# now you can save it to a file
# joblib.dump(preprocessor, 'preprocessor.pkl') 

# and later you can load it
# preprocessor_load = joblib.load('preprocessor.pkl') 



In [22]:
def drop_missing_columns(df, threshold=0.1, inplace=True):
    """Drop every column whose fraction of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.1
        A column is dropped when strictly more than this fraction of its
        values is null. Must lie in ``[0, 1]``.
    inplace : bool, default True
        When True (the historical behaviour) ``df`` itself is modified and
        returned. When False ``df`` is left untouched and a new frame is
        returned, which keeps notebook cells safe to re-run.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``inplace`` is True, otherwise a new frame.

    Raises
    ------
    ValueError
        If ``threshold`` is not between 0 and 1.
    """
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must be between 0 and 1, got %r" % (threshold,))

    # Mean of the boolean null mask == share of missing values per column.
    missing_fraction = df.isnull().mean()
    list_to_drop = df.columns[missing_fraction > threshold]

    if inplace:
        df.drop(columns=list_to_drop, inplace=True)
        return df
    return df.drop(columns=list_to_drop)

In [27]:
def drop_missing_rows(df, threshold=0.2, inplace=True):
    """Drop every row whose fraction of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.2
        A row is dropped when strictly more than this fraction of its values
        is null. Must lie in ``[0, 1]``.
    inplace : bool, default True
        When True (the historical behaviour) ``df`` itself is modified and
        returned; rows are removed by index label, so with duplicated labels
        every row sharing a dropped label is removed. When False ``df`` is
        left untouched and a new frame is returned; rows are then selected
        positionally, so duplicated labels are handled row by row.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``inplace`` is True, otherwise a new frame.

    Raises
    ------
    ValueError
        If ``threshold`` is not between 0 and 1.
    """
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must be between 0 and 1, got %r" % (threshold,))

    # Mean of the boolean null mask along axis 1 == share of missing values per row.
    sparse_rows = (df.isnull().mean(axis=1) > threshold).to_numpy()

    if inplace:
        df.drop(index=df.index[sparse_rows], inplace=True)
        return df
    # Explicit copy so the result is an independent frame, not a flagged slice.
    return df.loc[~sparse_rows].copy()

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

In [226]:
# Scratch instance, used only to poke at the BaseEstimator API.
base = BaseEstimator()

In [228]:
# NOTE(review): there are no call parentheses, so this echoes the bound method
# object instead of the parameters; `base.get_params()` was probably intended.
base.get_params

<bound method BaseEstimator.get_params of BaseEstimator()>

In [227]:
# IPython help lookup for BaseEstimator (interactive exploration only).
?BaseEstimator

[1;31mInit signature:[0m [0mBaseEstimator[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Base class for all estimators in scikit-learn

Notes
-----
All estimators should specify all the parameters that can be set
at the class level in their ``__init__`` as explicit keyword
arguments (no ``*args`` or ``**kwargs``).
[1;31mFile:[0m           c:\users\anthony\anaconda3\lib\site-packages\sklearn\base.py
[1;31mType:[0m           type
[1;31mSubclasses:[0m     SimpleImputer, MissingIndicator, _BaseComposition, FunctionTransformer, LabelEncoder, LabelBinarizer, MultiLabelBinarizer, _BaseEncoder, MinMaxScaler, StandardScaler, ...


In [338]:
# Reference
# https://github.com/jem1031/pandas-pipelines-custom-transformers/blob/master/code/custom_transformers.py
# http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html

class drop_missing_rows_columns(TransformerMixin, BaseEstimator):
    """Drop columns and rows of a DataFrame that have too many missing values.

    ``fit`` records which columns exceed ``column_threshold``. ``transform``
    removes those columns and also removes every row *of the frame it is
    given* whose missing fraction exceeds ``row_threshold``. Row fractions are
    measured across all incoming columns, i.e. before any column is dropped.

    Parameters
    ----------
    column_threshold : float, default 0.1
        A column is dropped when strictly more than this fraction of its
        values is null in the frame passed to ``fit``.
    row_threshold : float, default 0.2
        A row is dropped when strictly more than this fraction of its values
        is null.

    Attributes
    ----------
    cols_to_drop : list
        Column labels selected by ``fit``; empty until ``fit`` is called.
    rows_to_drop : list
        Index labels of the rows of the frame passed to ``fit`` that exceed
        ``row_threshold``. Kept for inspection only; ``transform`` does not
        rely on it.

    Notes
    -----
    Only ``X`` is filtered. A target that is held separately has to be
    realigned by the caller, e.g. ``y = y.loc[X_out.index]``.
    """

    def __init__(self, column_threshold=0.1, row_threshold=0.2):
        # Hyper-parameters (exposed through get_params / set_params).
        self.column_threshold = column_threshold
        self.row_threshold = row_threshold
        # State filled in by fit(); initialised here so the attributes can be
        # read before fitting.
        self.cols_to_drop = []
        self.rows_to_drop = []

    def _sparse_row_mask(self, X):
        """Return a positional boolean array marking rows above ``row_threshold``."""
        return ((X.isnull().sum(axis=1) / X.shape[1]) > self.row_threshold).to_numpy()

    def fit(self, X, y=None):
        """Record the sparse columns (and, for inspection, sparse rows) of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame the thresholds are evaluated on.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        print('enter fitting drop_missing_rows_columns')
        # identify missing columns and rows
        column_missing_fraction = X.isnull().sum() / X.shape[0]
        self.cols_to_drop = X.columns[column_missing_fraction > self.column_threshold].to_list()
        self.rows_to_drop = X.index[self._sparse_row_mask(X)].to_list()
        print('finish fitting drop_missing_rows_columns')

        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the sparse columns and sparse rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to filter; it must contain every column in ``cols_to_drop``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            New frame; ``X`` itself is not modified.
        """
        print('enter transforming drop_missing_rows_columns')
        # Evaluate the row rule on the frame being transformed, not on the
        # index labels memorised in fit(): those labels do not exist in other
        # frames (KeyError) or, worse, may coincide with unrelated rows.
        # The mask is computed before columns are dropped so that
        # fit_transform() keeps giving the same result as before.
        sparse_rows = self._sparse_row_mask(X)

        # missing columns, learned in fit()
        X = X.drop(columns=self.cols_to_drop)

        # missing rows; positional selection is safe with duplicated labels,
        # and the explicit copy returns an independent frame.
        X = X.loc[~sparse_rows].copy()

        print('finish transforming drop_missing_rows_columns')

        return X
        


In [339]:
# Thresholds spelled out explicitly; these are the same values as the class defaults.
drop_missing = drop_missing_rows_columns(column_threshold=0.1, row_threshold=0.2)

In [340]:
# Learn which columns are too sparse from the training split and apply the filter.
X_train = drop_missing.fit_transform(X_train)

# The transformer removes rows from X only, so realign the labels with the
# surviving rows; otherwise X_train and y_train no longer have matching rows.
y_train = y_train.loc[X_train.index]

enter fitting drop_missing_rows_columns
finish fitting drop_missing_rows_columns
enter transforming drop_missing_rows_columns
finish transforming drop_missing_rows_columns


In [342]:
# Rebuild the dtype-based column lists from the current X_train, so the
# transformers below only reference columns that are still present.
numeric_features = X_train.select_dtypes(include=np.number).columns
categorical_features = X_train.select_dtypes(exclude=np.number).columns


In [308]:
# Categorical branch: constant-fill missing values, then one-hot encode
# (categories not seen during fit are ignored rather than raising).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: mean-impute missing values, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [333]:
# Assemble categorical and numerical pipelines

from sklearn.compose import ColumnTransformer

# Combine both branches; each one only sees the columns listed for it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
)


In [336]:
# Single-step pipeline for now: it wraps the column preprocessor so further
# steps can be appended to the list later.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
    ]
)