### Packages

In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

### Load Cleaned Data Set

In [216]:
train_data = pd.read_csv("..\\Data\\train_data_EDA_processed.csv", index_col=0)

In [218]:
train_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22073,22074,22075,22076,22077,22078,22079,22080,22081,22082
Patient_Age,2.0,4.0,6.0,12.0,11.0,14.0,3.0,3.0,11.0,4.0,...,13.0,4.0,10.0,0.0,9.0,4.0,8.0,8.0,7.0,11.0
Mother_Gene,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,...,Yes,No,No,Yes,No,Yes,No,Yes,Yes,Yes
Father_Gene,No,Yes,No,No,No,No,No,No,No,Yes,...,Yes,No,No,No,Yes,Yes,Yes,No,No,No
Maternal_Gene,Yes,No,No,Yes,,Yes,Yes,Yes,Yes,Yes,...,No,,Yes,Yes,Yes,Yes,No,Yes,Yes,No
Paternal_Gene,No,No,No,No,Yes,No,Yes,Yes,No,Yes,...,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No
Blood_Cell,4.760603,4.910669,4.893297,4.70528,4.720703,5.103188,4.90108,4.964816,5.209058,4.752272,...,4.874635,4.789307,4.64386,4.931758,5.012599,5.258298,4.97422,5.18647,4.858543,4.738067
Mother_Age,,,41.0,21.0,32.0,,,40.0,45.0,44.0,...,44.0,35.0,49.0,,47.0,35.0,,35.0,19.0,32.0
Father_Age,,23.0,22.0,,,,63.0,,44.0,42.0,...,62.0,51.0,,50.0,,64.0,56.0,51.0,,62.0
Status,Alive,Deceased,Alive,Deceased,Alive,Deceased,Alive,Alive,Alive,Alive,...,Alive,Alive,Deceased,Alive,Deceased,Deceased,Alive,Deceased,Alive,Deceased
Respiratory_Rate,Normal (30-60),Tachypnea,Normal (30-60),Tachypnea,Tachypnea,,Normal (30-60),Tachypnea,Tachypnea,Tachypnea,...,Tachypnea,Tachypnea,,Normal (30-60),,Normal (30-60),Normal (30-60),Tachypnea,Tachypnea,Normal (30-60)


In [219]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19915 entries, 0 to 22082
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Patient_Age           18672 non-null  float64
 1   Mother_Gene           19915 non-null  object 
 2   Father_Gene           19652 non-null  object 
 3   Maternal_Gene         17510 non-null  object 
 4   Paternal_Gene         19915 non-null  object 
 5   Blood_Cell            19915 non-null  float64
 6   Mother_Age            14719 non-null  float64
 7   Father_Age            14796 non-null  float64
 8   Status                19915 non-null  object 
 9   Respiratory_Rate      18058 non-null  object 
 10  Heart_Rate            18127 non-null  object 
 11  Follow_Up             18070 non-null  object 
 12  Gender                12008 non-null  object 
 13  Birth_Asphyxia        9060 non-null   object 
 14  Autopsy_Birth_Defect  9013 non-null   object 
 15  Birth_Place        

### Road-Map Preprocessing 

I have decided to perform XGBoost modeling on my data sets given its ability etc.....

In [230]:
# Split the data into feature X and target y
X = train_data.drop(columns="Disorder_Subclass")
y = train_data.Disorder_Subclass

# Label encode target y
le = LabelEncoder()
encoded_y = le.fit_transform(y)

In [231]:
# Train Test Split by 30%
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y, test_size = 0.3, random_state=123)

In [240]:
# Grouped numerical columns
num_cols = train_data.select_dtypes('float64').columns

# Grouped categorical columns
cat_cols = train_data.select_dtypes('object').drop(columns="Disorder_Subclass").columns

# Preprocessing for numerical data
num_preprocessor = SimpleImputer(strategy='constant', fill_value=-99)

# Preprocessing for categorical data
cat_preprocessor = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="-99")),
                                   ('oe', OrdinalEncoder()),
                                   ('ohe', OneHotEncoder())])

# Combine num and cat preprocessor for data frame transformation
preprocessor = ColumnTransformer(
    transformers=[('num_transformer', num_preprocessor, num_cols),
                  ('cat_transformer', cat_preprocessor, cat_cols)])

# Preprocessor pipeline


In [241]:
# Testing purposes
import xgboost as xgb
model = xgb.XGBClassifier(max_depth=3)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)



MAE: 1.3035983263598325


In [250]:
print(preds)

print(y_test)

[6 7 2 ... 2 2 2]
[2 7 2 ... 3 2 2]


In [252]:
le.inverse_transform(preds)

array(['Leigh syndrome', 'Mitochondrial myopathy', 'Cystic fibrosis', ...,
       'Cystic fibrosis', 'Cystic fibrosis', 'Cystic fibrosis'],
      dtype=object)