In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv('input/TrainingWiDS2021.csv')
test_df = pd.read_csv('input/UnlabeledWiDS2021.csv')
data_dict = pd.read_csv('input/DataDictionaryWiDS2021.csv')

In [25]:
train_df.columns, test_df.columns

(Index(['age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height',
        'hospital_admit_source', 'icu_admit_source', 'icu_stay_type',
        'icu_type',
        ...
        'h1_arterial_po2_min', 'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min',
        'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia',
        'lymphoma', 'solid_tumor_with_metastasis'],
       dtype='object', length=176),
 Index(['age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height',
        'hospital_admit_source', 'icu_admit_source', 'icu_stay_type',
        'icu_type',
        ...
        'h1_arterial_po2_min', 'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min',
        'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia',
        'lymphoma', 'solid_tumor_with_metastasis'],
       dtype='object', length=176))

In [None]:
# Target variable to be predicted
set(train_df.columns) - set(test_df.columns)

In [26]:
train_df.shape, test_df.shape

((130157, 176), (10234, 176))

In [None]:
train_df.head(10)

In [3]:
train_df = train_df.drop(columns=['Unnamed: 0'])
test_df = test_df.drop(columns=['Unnamed: 0'])

In [4]:
Y_train = train_df['diabetes_mellitus']
train_df = train_df.drop(columns=['diabetes_mellitus'])

## EDA on Training Data

In [None]:
train_df.describe().transpose()

In [None]:
# Columns containing null values
train_df.columns[train_df.isna().any()], test_df.columns[test_df.isna().any()]

In [None]:
# Columns not containing null values
train_df.columns[~train_df.isna().any()], test_df.columns[~test_df.isna().any()]

In [None]:
set(train_df.dtypes)

In [None]:
num_df = train_df.select_dtypes(include=['int64', 'float64'])
cat_df = train_df.select_dtypes(include=['object'])

In [None]:
num_min_cols = num_df.filter(regex='_min', axis=1).columns.to_list()
num_max_cols = num_df.filter(regex='_max', axis=1).columns.to_list()
num_range_cols = num_max_cols + num_min_cols

In [None]:
len(num_df.columns), len(cat_df.columns)

In [None]:
for col in cat_df.columns:
    print("Column:", str(col), cat_df[col].unique(), np.divide(cat_df[col].isna().sum(), cat_df.shape[0])*100)

In [None]:
# nominal - ethnicity, gender, 
# seems like ordinal - icu_type, icu_stay_type, hospital_admit_score, icu_admit_score

In [None]:
cat_df['gender'].mode()

In [None]:
cat_df[['ethnicity', 'gender', 'icu_admit_source', 'hospital_admit_source']][10:20]

In [None]:
num_min_columns = train_df.filter(regex = '_min', axis=1).columns
num_max_columns = train_df.filter(regex = '_max', axis=1).columns

In [None]:
num_id_df = train_df.filter(regex = '_id', axis=1)
num_id_df.head(5)

In [None]:
num_id_df['hospital_id'].value_counts()

In [None]:
num_id_df['icu_id'].value_counts()

In [None]:
num_id_df['encounter_id'].value_counts()

encounter_id is like a patient id, so it can be dropped.
hospital_id and icu_id can be frequency encoded

In [5]:
train_df = train_df.drop(columns=['encounter_id', 'hospital_id', 'icu_id'])
test_df = test_df.drop(columns=['encounter_id', 'hospital_id', 'icu_id'])

In [None]:
data_dict.head(10)

### Feature imputations

In [6]:
num_df = train_df.select_dtypes(include=['int64', 'float64'])
cat_df = train_df.select_dtypes(include=['object'])

In [7]:
# categorical features preprocessing
categorical_features = list(cat_df.columns)
categorical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')),
                                            ('ohe', OneHotEncoder())])
# categorical_features

In [8]:
# numerical features preprocessing 
numerical_features = list(num_df.columns)
numerical_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'median')),
                                          ('scaler', StandardScaler())])
# numerical_features

In [9]:
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features),
                                               ('cat', categorical_transformer, categorical_features)])

## Logistic Regression - baseline model

In [10]:
preprocessor.fit_transform(train_df)

array([[ 0.35923734, -0.77846407, -0.48407027, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.90463229, -0.20126728, -0.48407027, ...,  1.        ,
         0.        ,  0.        ],
       [-2.24653854,  0.35645624, -0.48407027, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.66223454,  0.39493812, -0.48407027, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.14703005, -0.572186  , -0.48407027, ...,  1.        ,
         0.        ,  0.        ],
       [-1.64054415,  0.13237702, -0.48407027, ...,  0.        ,
         0.        ,  0.        ]])

In [11]:
clf = Pipeline(steps = [('preprocessor', preprocessor),
                        ('classifier', LogisticRegression())])
# clf = make_pipeline(preprocessor, LogisticRegression())

In [86]:
clf

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi',
                                                   'elective_surgery', 'height',
                                                   'pre_icu_los_days',
                                                   'readmission_status',
                                                   'weight', 'albumin_apache',
                                                   'apache_2_diagnosis',
                                                   'apache_3j_diagnosis',
                                                 

In [12]:
x_train, x_val, y_train, y_val = train_test_split(train_df, Y_train, test_size=0.2, random_state=0)

In [13]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((104125, 176), (26032, 176), (104125,), (26032,))

In [14]:
clf.fit(x_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi',
                                                   'elective_surgery', 'height',
                                                   'pre_icu_los_days',
                                                   'readmission_status',
                                                   'weight', 'albumin_apache',
                                                   'apache_2_diagnosis',
                                                   'apache_3j_diagnosis',
                                                 

In [19]:
y_val_pred = clf.predict(x_val)
print("model score: %.3f" % clf.score(x_val, y_val))
y_test_pred = clf.predict(test_df)

model score: 0.813


In [18]:
y_val_pred.shape

(26032,)

## Logistic with gridsearch cv

In [48]:
param_grid = {
    'logisticregression__penalty': ['l1', 'l2'],
    'logisticregression__C': [0.1, 1.0, 1.0],
    }

In [49]:
grid_clf = GridSearchCV(clf, param_grid, cv=10, scoring='accuracy')
grid_clf.fit(x_train, y_train)

ValueError: Invalid parameter logisticregression for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi',
                                                   'elective_surgery', 'height',
                                                   'pre_icu_los_days',
                                                   'readmission_status',
                                                   'weight', 'albumin_apache',
                                                   'apache_2_diagnosis',
                                                   'apache_3j_diagnosis',
                                                   'apache_post_operative',
                                                   'arf_apache',
                                                   '...
                                                   'intubated_apache',
                                                   'map_apache', 'paco2_apache',
                                                   'paco2_for_ph_apache',
                                                   'pao2_apache', 'ph_apache',
                                                   'resprate_apache', ...]),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ohe',
                                                                   OneHotEncoder())]),
                                                  ['ethnicity', 'gender',
                                                   'hospital_admit_source',
                                                   'icu_admit_source',
                                                   'icu_stay_type',
                                                   'icu_type'])])),
                ('classifier', LogisticRegression())]). Check the list of available parameters with `estimator.get_params().keys()`.

# Generate Solution

In [33]:
sol_df = pd.read_csv('input/SolutionTemplateWiDS2021.csv', usecols=['encounter_id'])
sol_df.head()

Unnamed: 0,encounter_id
0,135000
1,135001
2,135002
3,135003
4,135004


#### 28 Jan v1 1st submission

In [37]:
sol_df['diabetes_mellitus'] = y_test_pred
sol_df.to_csv('output/v1_logistic_regression_28_jan.csv', index=False)

In [24]:
y_test_pred.shape

(10234,)

#### 28 Jan v1 2nd submission