In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
path = 'C:/Users/dubrangala/OneDrive - VMware, Inc/Case Studies/hackerarth_prediction/dataset'
os.chdir(path)

In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#### Load train and test data

In [4]:
df_train = pd.read_csv("train.csv")
df_train.shape

(22083, 45)

In [5]:
df_test = pd.read_csv("test.csv")
df_test.shape

(9465, 43)

In [6]:
## Checking what is extra column there in train and test
set(df_train.columns)-set(df_test.columns)

{'Disorder Subclass', 'Genetic Disorder'}

In [7]:
## Add two columns which are not available on test
df_test['Disorder Subclass'] = np.nan
df_test['Genetic Disorder'] = np.nan

In [8]:
def clean_col_name (cname):
    pat_value = [' ', '(', ')','/','\'','-']
    repl_value = ['_', '', '','','','']
    for j in range(len(repl_value)):
        cname = [x.replace(pat_value[j], repl_value[j]) for x in cname]
    cname = [x.lower() for x in cname]
    return(cname)

In [9]:
df_train.columns = clean_col_name(df_train.columns)
df_test.columns = clean_col_name(df_test.columns)

In [10]:
### Lets combine train and test data before data preprocessing
df_train['myflag'] = 'train'
df_test['myflag'] = 'test'
df_master = pd.concat([df_train,df_test], axis=0).reset_index(drop=True)
df_master.shape

(31548, 46)

In [11]:
df_master.columns

Index(['patient_id', 'patient_age', 'genes_in_mothers_side',
       'inherited_from_father', 'maternal_gene', 'paternal_gene',
       'blood_cell_count_mcl', 'patient_first_name', 'family_name',
       'fathers_name', 'mothers_age', 'fathers_age', 'institute_name',
       'location_of_institute', 'status', 'respiratory_rate_breathsmin',
       'heart_rate_ratesmin', 'test_1', 'test_2', 'test_3', 'test_4', 'test_5',
       'parental_consent', 'followup', 'gender', 'birth_asphyxia',
       'autopsy_shows_birth_defect_if_applicable', 'place_of_birth',
       'folic_acid_details_periconceptional', 'ho_serious_maternal_illness',
       'ho_radiation_exposure_xray', 'ho_substance_abuse',
       'assisted_conception_ivfart',
       'history_of_anomalies_in_previous_pregnancies',
       'no._of_previous_abortion', 'birth_defects',
       'white_blood_cell_count_thousand_per_microliter', 'blood_test_result',
       'symptom_1', 'symptom_2', 'symptom_3', 'symptom_4', 'symptom_5',
       'genetic

### Data Pre - processing

In [12]:
df_master.myflag.value_counts()

train    22083
test      9465
Name: myflag, dtype: int64

#### 1. Explore target variable

In [14]:
## From the problem statement it is clear that we have to predict both Genitic disorder and Disorder sublcass
## let us understand target variable fist
mytarget = df_master[df_master['myflag']=='train'][['patient_id','genetic_disorder', 'disorder_subclass']]
print(mytarget.shape)

(22083, 3)


In [15]:
## there are some missing in target 
mytarget.fillna("missing", inplace=True)

In [16]:
mytarget.groupby(['genetic_disorder']).patient_id.nunique()

genetic_disorder
Mitochondrial genetic inheritance disorders     10202
Multifactorial genetic inheritance disorders     2071
Single-gene inheritance diseases                 7664
missing                                          2146
Name: patient_id, dtype: int64

In [17]:
mytarget.groupby(['disorder_subclass']).patient_id.nunique()

disorder_subclass
Alzheimer's                             152
Cancer                                   97
Cystic fibrosis                        3448
Diabetes                               1817
Hemochromatosis                        1355
Leber's hereditary optic neuropathy     648
Leigh syndrome                         5160
Mitochondrial myopathy                 4405
Tay-Sachs                              2833
missing                                2168
Name: patient_id, dtype: int64

In [18]:
## We need to treat missing dependent variable before data prep
## Quickly check the cross tabulations of disorder_subclass and genetic_disorder
t_pivot = mytarget.pivot_table(index='disorder_subclass', 
                               columns='genetic_disorder', 
                               values='patient_id', 
                               aggfunc='nunique',
                               fill_value=0).reset_index()
t_pivot

genetic_disorder,disorder_subclass,Mitochondrial genetic inheritance disorders,Multifactorial genetic inheritance disorders,Single-gene inheritance diseases,missing
0,Alzheimer's,0,133,0,19
1,Cancer,0,91,0,6
2,Cystic fibrosis,0,0,3145,303
3,Diabetes,0,1653,0,164
4,Hemochromatosis,0,0,1228,127
5,Leber's hereditary optic neuropathy,587,0,0,61
6,Leigh syndrome,4683,0,0,477
7,Mitochondrial myopathy,3971,0,0,434
8,Tay-Sachs,0,0,2556,277
9,missing,961,194,735,278


In [19]:
## From the pivot it is clear that 278 patients both genetic disorder and subcalss are missing (we can drop this sample)
## Treat Genetic disorder column first
t_pivot_d = t_pivot[t_pivot['disorder_subclass']!='missing']
Single_gene_subclass = t_pivot_d[t_pivot_d['Single-gene inheritance diseases']>0]['disorder_subclass'].unique()
Mitochondrial_gene_subclass = t_pivot_d[t_pivot_d['Mitochondrial genetic inheritance disorders']>0]['disorder_subclass'].unique()
Multifactorial_gene_subclass = t_pivot_d[t_pivot_d['Multifactorial genetic inheritance disorders']>0]['disorder_subclass'].unique()

## Create new genetic category
mytarget['genetic_disorder_v1'] = np.where(mytarget['disorder_subclass'].isin(Single_gene_subclass), 'Single-gene inheritance diseases',
                                          np.where(mytarget['disorder_subclass'].isin(Mitochondrial_gene_subclass), 'Mitochondrial genetic inheritance disorders',
                                                  np.where(mytarget['disorder_subclass'].isin(Multifactorial_gene_subclass), 'Multifactorial genetic inheritance disorders', mytarget['genetic_disorder'])))


In [20]:
mytarget.pivot_table(index='disorder_subclass', 
                               columns='genetic_disorder_v1', 
                               values='patient_id', 
                               aggfunc='nunique',
                               fill_value=0).reset_index()

genetic_disorder_v1,disorder_subclass,Mitochondrial genetic inheritance disorders,Multifactorial genetic inheritance disorders,Single-gene inheritance diseases,missing
0,Alzheimer's,0,152,0,0
1,Cancer,0,97,0,0
2,Cystic fibrosis,0,0,3448,0
3,Diabetes,0,1817,0,0
4,Hemochromatosis,0,0,1355,0
5,Leber's hereditary optic neuropathy,648,0,0,0
6,Leigh syndrome,5160,0,0,0
7,Mitochondrial myopathy,4405,0,0,0
8,Tay-Sachs,0,0,2833,0
9,missing,961,194,735,278


In [21]:
mytarget.head()

Unnamed: 0,patient_id,genetic_disorder,disorder_subclass,genetic_disorder_v1
0,PID0x6418,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders
1,PID0x25d5,missing,Cystic fibrosis,Single-gene inheritance diseases
2,PID0x4a82,Multifactorial genetic inheritance disorders,Diabetes,Multifactorial genetic inheritance disorders
3,PID0x4ac8,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders
4,PID0x1bf7,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders


In [22]:
mytarget.columns = ['patient_id','genetic_disorder_old','disorder_subclass_v1','genetic_disorder_v1']
mytarget.to_csv("mytarget_new.csv")

#### Data Engineering

In [52]:
df_master = pd.read_csv("df_master.csv")

In [53]:
### 
df_master[df_master['myflag']=='train'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22083 entries, 0 to 22082
Data columns (total 46 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   patient_id                                      22083 non-null  object 
 1   patient_age                                     20656 non-null  float64
 2   genes_in_mothers_side                           22083 non-null  object 
 3   inherited_from_father                           21777 non-null  object 
 4   maternal_gene                                   19273 non-null  object 
 5   paternal_gene                                   22083 non-null  object 
 6   blood_cell_count_mcl                            22083 non-null  float64
 7   patient_first_name                              22083 non-null  object 
 8   family_name                                     12392 non-null  object 
 9   fathers_name                           

In [None]:
'location_of_institute'
'ho_radiation_exposure_xray'
'ho_substance_abuse'

In [54]:
 df_master[df_master['myflag']=='train'].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
patient_age,20656.0,6.974148,4.319475,0.0,3.0,7.0,11.0,14.0
blood_cell_count_mcl,22083.0,4.898871,0.199663,4.092727,4.763109,4.899399,5.03383,5.609829
mothers_age,16047.0,34.526454,9.852598,18.0,26.0,35.0,43.0,51.0
fathers_age,16097.0,41.972852,13.035501,20.0,31.0,42.0,53.0,64.0
test_1,19956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
test_2,19931.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
test_3,19936.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
test_4,19943.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
test_5,19913.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
no._of_previous_abortion,19921.0,2.003062,1.411919,0.0,1.0,2.0,3.0,4.0


In [55]:
def summary_stat (df):
    missing_cnt = df.isnull().sum()
    missing_prop = round(100*df.isnull().sum()/len(df),2)
    uniqv = df.apply(lambda x: x.nunique())
    ctype = df.apply(lambda x: x.dtype)
    mydf = pd.DataFrame({'Column Type':ctype, 'Number of Unique':uniqv, 'Total missing':missing_cnt, 'missing percentage':missing_prop})
    return(mydf)

In [56]:
#mydf = pd.DataFrame({'sdsd':df_master.apply(lambda x: x.dtype), 'dsd':df_master.apply(lambda x: x.nunique())})
#mydf.head()
data_dictinary = summary_stat(df_master)
data_dictinary = data_dictinary.reset_index()
data_dictinary.head()

Unnamed: 0,index,Column Type,Number of Unique,Total missing,missing percentage
0,patient_id,object,31548,0,0.0
1,patient_age,float64,15,1427,4.52
2,genes_in_mothers_side,object,2,0,0.0
3,inherited_from_father,object,2,857,2.72
4,maternal_gene,object,2,6533,20.71


In [57]:
all_variable = df_master.columns.to_list()
print("Total variables in the master data set: ", len(all_variable))
drop_var_from_ana = ['patient_first_name', 'family_name','fathers_name']
print("We can drop some of the variables which are not really important: ", len(drop_var_from_ana))
variable_no_missing = data_dictinary[data_dictinary['Total missing']==0]['index'].to_list()
print("Total variable has complete data: ", len(variable_no_missing))
variable_with_missing = data_dictinary[data_dictinary['Total missing']!=0]['index'].to_list()
print("Total variable has missing data: ", len(variable_with_missing))
total_cat_var = data_dictinary[data_dictinary['Column Type']=='object']['index'].to_list()
print("Total categorical variable: ", len(total_cat_var))
total_num_var = data_dictinary[data_dictinary['Column Type']!='object']['index'].to_list()
print("Total numerical variable: ", len(total_num_var))

Total variables in the master data set:  46
We can drop some of the variables which are not really important:  3
Total variable has complete data:  9
Total variable has missing data:  37
Total categorical variable:  30
Total numerical variable:  16


In [58]:
### Checking the unique values from all the missing variable
for j in all_variable:
    print(j)
    print(df_master[j].unique())
    print('==================','\n')


patient_id
['PID0x6418' 'PID0x25d5' 'PID0x4a82' ... 'PID0x5408' 'PID0x2017'
 'PID0x7f61']

patient_age
[ 2.  4.  6. 12. 11. 14.  3.  7.  1.  0. nan 10.  5. 13.  8.  9.]

genes_in_mothers_side
['Yes' 'No']

inherited_from_father
['No' 'Yes' nan]

maternal_gene
['Yes' 'No' nan]

paternal_gene
['No' 'Yes']

blood_cell_count_mcl
[4.76060309 4.91066906 4.89329743 ... 4.89835233 4.80483966 5.42123643]

patient_first_name
['Richard' 'Mike' 'Kimberly' ... 'Alejandrina' 'Karolyn' 'Jerrod']

family_name
[nan 'Hoelscher' 'Stutzman' ... 'Rodrigue' 'Lebron' 'Dilworth']

fathers_name
['Larre' 'Brycen' 'Nashon' ... 'Shamond' 'Nasir' 'Jatorian']

mothers_age
[nan 41. 21. 32. 40. 45. 44. 50. 28. 30. 24. 36. 51. 23. 49. 46. 18. 38.
 37. 42. 48. 25. 19. 47. 34. 35. 22. 33. 20. 26. 31. 29. 27. 43. 39.]

fathers_age
[nan 23. 22. 63. 44. 42. 56. 20. 24. 57. 48. 30. 55. 62. 43. 32. 41. 52.
 28. 31. 61. 35. 49. 50. 29. 64. 39. 34. 51. 25. 60. 53. 58. 26. 27. 59.
 47. 38. 54. 21. 37. 36. 33. 46. 40. 45.]

inst

In [59]:
### Value Corrections on master data
df_master['institute_name'] = np.where(df_master['institute_name'] == '-99', np.nan, df_master['institute_name'])
df_master['mcap_institute_name'] = np.where(df_master['institute_name'].isnull(), 1, 0)
df_master['institute_name'] = np.where(df_master['institute_name'].isnull(), 'Not applicable', df_master['institute_name'])

df_master['respiratory_rate_breathsmin'] = np.where(df_master['respiratory_rate_breathsmin'] == '-99',np.nan, df_master['respiratory_rate_breathsmin'])
df_master['mcap_respiratory_rate_breathsmin'] = np.where(df_master['respiratory_rate_breathsmin'].isnull(),1, 0)

df_master['heart_rate_ratesmin'] = np.where(df_master['heart_rate_ratesmin'] == '-99', np.nan, df_master['heart_rate_ratesmin'])
df_master['mcap_heart_rate_ratesmin'] = np.where(df_master['heart_rate_ratesmin'].isnull(), 1, 0)

In [60]:
## treat -99 as nan
df_master['test_1'] = np.where(df_master['test_1']==-99, np.nan, df_master['test_1'])
df_master['test_2'] = np.where(df_master['test_2']==-99, np.nan, df_master['test_2'])
df_master['test_3'] = np.where(df_master['test_3']==-99, np.nan, df_master['test_3'])
df_master['test_4'] = np.where(df_master['test_4']==-99, np.nan, df_master['test_4'])
df_master['test_5'] = np.where(df_master['test_5']==-99, np.nan, df_master['test_5'])
## combine all test
cols = ['test_1','test_2','test_3','test_4','test_5']
df_master['test_1to5'] = df_master[cols].fillna(9).apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

## Replace NaN as constant value
df_master[cols] = df_master[cols].fillna(99)

In [61]:
df_master['parental_consent'] = np.where(df_master['parental_consent'] == '-99', np.nan, df_master['parental_consent'])
df_master['parental_consent'] = np.where(df_master['parental_consent'].isnull(), 'No', df_master['parental_consent'])

df_master['followup'] = np.where(df_master['followup'] == '-99', np.nan, df_master['followup'])
df_master['mcap_followup'] = np.where(df_master['followup'].isnull(), 1, 0)
df_master['followup'] = np.where(df_master['followup'].isnull(), 'Missing', df_master['followup'])

df_master['gender'] = np.where(df_master['gender'] == '-99', np.nan, df_master['gender'])
df_master['mcap_gender'] = np.where(df_master['gender'].isnull(), 1, 0)
df_master['gender'] = np.where(df_master['gender'].isnull(), 'Missing', df_master['gender'])

In [62]:
df_master['birth_asphyxia'] = np.where(df_master['birth_asphyxia'] == '-99', np.nan, df_master['birth_asphyxia'])
df_master['mcap_birth_asphyxia'] = np.where(df_master['birth_asphyxia'].isnull(), 1, 0)
df_master['birth_asphyxia'] = np.where(df_master['birth_asphyxia'].isin(['No record','Not available']), 'Not available', df_master['birth_asphyxia'])

In [63]:
df_master['autopsy_shows_birth_defect_if_applicable'] = np.where(df_master['autopsy_shows_birth_defect_if_applicable'] == '-99', np.nan, df_master['autopsy_shows_birth_defect_if_applicable'])
df_master['mcap_autopsy_shows_birth_defect_if_applicable'] = np.where(df_master['autopsy_shows_birth_defect_if_applicable'].isnull(), 1, 0)
df_master['autopsy_shows_birth_defect_if_applicable'] = np.where(df_master['autopsy_shows_birth_defect_if_applicable'].isin(['None','No']), 'No', df_master['autopsy_shows_birth_defect_if_applicable'])

In [64]:
df_master['place_of_birth'] = np.where(df_master['place_of_birth'] == '-99', np.nan, df_master['place_of_birth'])
df_master['mcap_place_of_birth'] = np.where(df_master['place_of_birth'].isnull(), 1, 0)
df_master['place_of_birth'] = np.where(df_master['place_of_birth'].isnull(), 'Not available', df_master['place_of_birth'])

In [65]:
df_master['folic_acid_details_periconceptional'] = np.where(df_master['folic_acid_details_periconceptional'] == '-99', np.nan, df_master['folic_acid_details_periconceptional'])
df_master['mcap_folic_acid_details_periconceptional'] = np.where(df_master['folic_acid_details_periconceptional'].isnull(), 1, 0)
#df_master['folic_acid_details_periconceptional'] = np.where(df_master['folic_acid_details_periconceptional'].isnull(), 'No', df_master['folic_acid_details_periconceptional'])

df_master['ho_serious_maternal_illness'] = np.where(df_master['ho_serious_maternal_illness'] == '-99', np.nan, df_master['ho_serious_maternal_illness'])
df_master['mcap_ho_serious_maternal_illness'] = np.where(df_master['ho_serious_maternal_illness'].isnull(), 1, 0)

## New derived feature
df_master['flag_ho_serious_maternal_folic_acid'] = np.where((df_master['ho_serious_maternal_illness']=='Yes') & (df_master['folic_acid_details_periconceptional']=='Yes'), 1, 0)

In [66]:
df_master['ho_serious_maternal_illness'] = np.where(df_master['ho_serious_maternal_illness'] == '-99', np.nan, df_master['ho_serious_maternal_illness'])
df_master['mcap_ho_serious_maternal_illness'] = np.where(df_master['ho_serious_maternal_illness'].isnull(), 1, 0)

In [67]:
df_master['ho_radiation_exposure_xray'] = np.where(df_master['ho_radiation_exposure_xray'].isin(['-99', '-']), np.nan, df_master['ho_radiation_exposure_xray'])
df_master['mcap_ho_radiation_exposure_xray'] = np.where(df_master['ho_radiation_exposure_xray'].isnull(), 1, 0)

df_master['ho_substance_abuse'] = np.where(df_master['ho_substance_abuse'].isin(['-99', '-']), np.nan, df_master['ho_substance_abuse'])
df_master['mcap_ho_substance_abuse'] = np.where(df_master['ho_substance_abuse'].isnull(), 1, 0)

## New derived feature
df_master['flag_ho_substance_abuse_radiation'] = np.where((df_master['ho_substance_abuse']=='Yes') & (df_master['ho_radiation_exposure_xray']=='Yes'), 1, 0)

In [68]:
df_master['assisted_conception_ivfart'] = np.where(df_master['assisted_conception_ivfart'].isin(['-99', '-']), np.nan, df_master['assisted_conception_ivfart'])
df_master['mcap_assisted_conception_ivfart'] = np.where(df_master['assisted_conception_ivfart'].isnull(), 1, 0)

df_master['history_of_anomalies_in_previous_pregnancies'] = np.where(df_master['history_of_anomalies_in_previous_pregnancies'].isin(['-99', '-']), np.nan, df_master['history_of_anomalies_in_previous_pregnancies'])
df_master['mcap_history_of_anomalies_in_previous_pregnancies'] = np.where(df_master['history_of_anomalies_in_previous_pregnancies'].isnull(), 1, 0)

# Derived Feature
df_master['flag_history_of_anomalies_ivfart'] = np.where((df_master['assisted_conception_ivfart']=='Yes') & (df_master['history_of_anomalies_in_previous_pregnancies']=='Yes'), 1, 0)


In [69]:
df_master['no._of_previous_abortion'] = np.where(df_master['no._of_previous_abortion']==-99, np.nan, df_master['no._of_previous_abortion'])

df_master['birth_defects'] = np.where(df_master['birth_defects']=='-99', np.nan, df_master['birth_defects'])
df_master['mcap_birth_defects'] = np.where(df_master['birth_defects'].isnull(), 1, 0)

df_master['blood_test_result'] = np.where(df_master['blood_test_result']=='-99', np.nan, df_master['blood_test_result'])
df_master['mcap_blood_test_result'] = np.where(df_master['blood_test_result'].isnull(), 1,0)

In [70]:
cols = ['symptom_1','symptom_2','symptom_3','symptom_4', 'symptom_5']

df_master['mcap_symptom_1to_5'] = np.where((df_master['symptom_1'].isnull()) | (df_master['symptom_2'].isnull())
                                           | (df_master['symptom_3'].isnull())| (df_master['symptom_4'].isnull())| (df_master['symptom_5'].isnull()), 1, 0)

df_master[cols] = df_master[cols].fillna(0)
df_master['symptom_1to_5'] = df_master[cols].fillna(9).apply(lambda row: '_'.join(row.values.astype(str)), axis=1)



In [71]:
### Checking the unique values from all the missing variable
for j in all_variable:
    print(j)
    print(df_master[j].unique())
    print('==================','\n')


patient_id
['PID0x6418' 'PID0x25d5' 'PID0x4a82' ... 'PID0x5408' 'PID0x2017'
 'PID0x7f61']

patient_age
[ 2.  4.  6. 12. 11. 14.  3.  7.  1.  0. nan 10.  5. 13.  8.  9.]

genes_in_mothers_side
['Yes' 'No']

inherited_from_father
['No' 'Yes' nan]

maternal_gene
['Yes' 'No' nan]

paternal_gene
['No' 'Yes']

blood_cell_count_mcl
[4.76060309 4.91066906 4.89329743 ... 4.89835233 4.80483966 5.42123643]

patient_first_name
['Richard' 'Mike' 'Kimberly' ... 'Alejandrina' 'Karolyn' 'Jerrod']

family_name
[nan 'Hoelscher' 'Stutzman' ... 'Rodrigue' 'Lebron' 'Dilworth']

fathers_name
['Larre' 'Brycen' 'Nashon' ... 'Shamond' 'Nasir' 'Jatorian']

mothers_age
[nan 41. 21. 32. 40. 45. 44. 50. 28. 30. 24. 36. 51. 23. 49. 46. 18. 38.
 37. 42. 48. 25. 19. 47. 34. 35. 22. 33. 20. 26. 31. 29. 27. 43. 39.]

fathers_age
[nan 23. 22. 63. 44. 42. 56. 20. 24. 57. 48. 30. 55. 62. 43. 32. 41. 52.
 28. 31. 61. 35. 49. 50. 29. 64. 39. 34. 51. 25. 60. 53. 58. 26. 27. 59.
 47. 38. 54. 21. 37. 36. 33. 46. 40. 45.]

inst

In [72]:
#mydf = pd.DataFrame({'sdsd':df_master.apply(lambda x: x.dtype), 'dsd':df_master.apply(lambda x: x.nunique())})
#mydf.head()
data_dictinary = summary_stat(df_master)
data_dictinary = data_dictinary.reset_index()
all_variable = df_master.columns.to_list()
print("Total variables in the master data set: ", len(all_variable))
drop_var_from_ana = ['patient_first_name', 'family_name','fathers_name']
print("We can drop some of the variables which are not really important: ", len(drop_var_from_ana))

print("----------------------------------------------------------------")
total_cat_var = data_dictinary[data_dictinary['Column Type']=='object']['index'].to_list()
print("Total categorical variable: ", len(total_cat_var))
total_num_var = data_dictinary[data_dictinary['Column Type']!='object']['index'].to_list()
print("Total numerical variable: ", len(total_num_var))

print("----------------------------------------------------------------")
variable_no_missing = data_dictinary[data_dictinary['Total missing']==0]['index'].to_list()
print("Total variable no missing data: ", len(variable_no_missing))

variable_no_missing_cat = data_dictinary[(data_dictinary['Total missing']==0) & (data_dictinary['Column Type']=='object')]['index'].to_list()
print("Total categorical variable has complete data: ", len(variable_no_missing_cat))
variable_no_missing_num = data_dictinary[(data_dictinary['Total missing']==0) & (data_dictinary['Column Type']!='object')]['index'].to_list()
print("Total numeric variable has complete data: ", len(variable_no_missing_num))

print("----------------------------------------------------------------")

variable_with_missing = data_dictinary[data_dictinary['Total missing']!=0]['index'].to_list()
print("Total variable has missing data: ", len(variable_with_missing))

variable_missing_cat = data_dictinary[(data_dictinary['Total missing']!=0) & (data_dictinary['Column Type']=='object')]['index'].to_list()
print("Total categorical variable has missing data: ", len(variable_missing_cat))
variable_missing_num = data_dictinary[(data_dictinary['Total missing']!=0) & (data_dictinary['Column Type']!='object')]['index'].to_list()
print("Total numeric variable has missing data: ", len(variable_missing_num))


Total variables in the master data set:  68
We can drop some of the variables which are not really important:  3
----------------------------------------------------------------
Total categorical variable:  32
Total numerical variable:  36
----------------------------------------------------------------
Total variable no missing data:  46
Total categorical variable has complete data:  15
Total numeric variable has complete data:  31
----------------------------------------------------------------
Total variable has missing data:  22
Total categorical variable has missing data:  17
Total numeric variable has missing data:  5


In [73]:
###
drop_var_from_mi = ['patient_first_name', 'family_name','fathers_name', 'location_of_institute', 'institute_name','patient_id', 'genetic_disorder','disorder_subclass']
imputate_data = df_master.copy()
imputate_data.index = imputate_data.patient_id ## Add index name as patient id
#imputate_data = imputate_data[my_var_selected]

In [74]:
imputate_data.myflag.head()

patient_id
PID0x6418    train
PID0x25d5    train
PID0x4a82    train
PID0x4ac8    train
PID0x1bf7    train
Name: myflag, dtype: object

In [75]:
#mydf = pd.DataFrame({'sdsd':df_master.apply(lambda x: x.dtype), 'dsd':df_master.apply(lambda x: x.nunique())})
#mydf.head()
data_dictinary = summary_stat(imputate_data)
data_dictinary = data_dictinary.reset_index()
data_dictinary = data_dictinary[~data_dictinary['index'].isin(drop_var_from_mi)]

my_var_selected = set(data_dictinary['index'].to_list())

print("----------------------------------------------------------------")
total_cat_var = data_dictinary[data_dictinary['Column Type']=='object']['index'].to_list()
print("Total categorical variable: ", len(total_cat_var))
total_num_var = data_dictinary[data_dictinary['Column Type']!='object']['index'].to_list()
print("Total numerical variable: ", len(total_num_var))

print("----------------------------------------------------------------")
variable_no_missing = data_dictinary[data_dictinary['Total missing']==0]['index'].to_list()
print("Total variable no missing data: ", len(variable_no_missing))

variable_no_missing_cat = data_dictinary[(data_dictinary['Total missing']==0) & (data_dictinary['Column Type']=='object')]['index'].to_list()
print("Total categorical variable has complete data: ", len(variable_no_missing_cat))
variable_no_missing_num = data_dictinary[(data_dictinary['Total missing']==0) & (data_dictinary['Column Type']!='object')]['index'].to_list()
print("Total numeric variable has complete data: ", len(variable_no_missing_num))

print("----------------------------------------------------------------")

variable_with_missing = data_dictinary[data_dictinary['Total missing']!=0]['index'].to_list()
print("Total variable has missing data: ", len(variable_with_missing))

variable_missing_cat = data_dictinary[(data_dictinary['Total missing']!=0) & (data_dictinary['Column Type']=='object')]['index'].to_list()
print("Total categorical variable has missing data: ", len(variable_missing_cat))
variable_missing_num = data_dictinary[(data_dictinary['Total missing']!=0) & (data_dictinary['Column Type']!='object')]['index'].to_list()
print("Total numeric variable has missing data: ", len(variable_missing_num))


----------------------------------------------------------------
Total categorical variable:  24
Total numerical variable:  36
----------------------------------------------------------------
Total variable no missing data:  41
Total categorical variable has complete data:  10
Total numeric variable has complete data:  31
----------------------------------------------------------------
Total variable has missing data:  19
Total categorical variable has missing data:  14
Total numeric variable has missing data:  5


In [None]:
### Predict categorical data missing values

In [76]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.ensemble import RandomForestRegressor
from collections import Counter

In [77]:
def my_dummy_var (data, varlist):
    sparse_mat = pd.get_dummies(data[varlist]).rename(columns=lambda x: 'dummy_' + str(x))
    sparse_mat_names = sparse_mat.columns.to_list()
    sparse_mat_names = [w.replace('-', '_') for w in sparse_mat_names]
    sparse_mat_names = [w.lower() for w in sparse_mat_names]
    sparse_mat.columns = sparse_mat_names
    return(sparse_mat)    

In [78]:
def cat_rf_imputation (mydata, my_var_selected, dep_var, ntree=500, seed=123, method = 'class'):
    data = mydata.copy()
    imputation_variable = set(my_var_selected)-set('myflag')
    #dep_var = ['ho_radiation_exposure_xray']
    iv_var = set(imputation_variable) - set(dep_var)
    dummy_iv = set(variable_missing_cat)
    dummy_iv = dummy_iv.union(set(variable_no_missing_cat))-set(dep_var)
    numeric_iv = set(variable_no_missing_num)
    numeric_iv = numeric_iv.union(set(variable_missing_num)) - set(dep_var)
    
    if method == 'class' :
        print(data[dep_var].value_counts())
    if method == 'reg' :
        print(data[dep_var].describe())
    
    ## Create Train and Score df
    train_df = data[data[dep_var].notnull().values].copy()
    print("Train data", train_df.shape)
    score_df = data[data[dep_var].isnull().values].copy()
    print("Score data",score_df.shape)
    
    ## Create Dummy coding
    dummy_var = my_dummy_var(data, dummy_iv)
    train_dummy_df = dummy_var.loc[train_df.index]
    score_dummy_df = dummy_var.loc[score_df.index]

    X = pd.concat([train_df[numeric_iv], train_dummy_df], axis=1)
    X.fillna(0, inplace=True)
    Z = pd.concat([score_df[numeric_iv], score_dummy_df], axis=1)
    Z.fillna(0, inplace=True)
    
    # Split dataset into training set and test set
    y = train_df[dep_var]
    y.columns = ['my_target']
    X_train, X_test, y_train, y_test = train_test_split(X, y.my_target.values, test_size=0.2) 
    
    #Import Random Forest Model
    #from sklearn.ensemble import RandomForestClassifier, BalancedRandomForestClassifier
    #Create a Gaussian Classifier
    if method=='class':
        clf=BalancedRandomForestClassifier(n_estimators=ntree, random_state=seed)

        #Train the model using the training sets y_pred=clf.predict(X_test)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(X_test)
        print(dep_var, "accuaracy is ", balanced_accuracy_score(y_test, y_pred))
        y_score=clf.predict(Z)
        Z[dep_var] = y_score
        data.loc[Z.index, dep_var] = Z[dep_var]
        myimputed_data = data[dep_var]
        return(myimputed_data)
    if method=='reg':
        
        regr = RandomForestRegressor(max_depth=2, n_estimators=ntree, random_state=seed)
        regr.fit(X_train,y_train)
        y_pred=regr.predict(X_test)
        y_score=regr.predict(Z)
        Z[dep_var] = y_score
        data.loc[Z.index, dep_var] = Z[dep_var]
        myimputed_data = data[dep_var]
        return(myimputed_data)

In [79]:
 variable_missing_num

['patient_age',
 'mothers_age',
 'fathers_age',
 'no._of_previous_abortion',
 'white_blood_cell_count_thousand_per_microliter']

In [80]:
### Numerical variable imputation
for j in range(len(variable_missing_num)):
    print("imputation for ",variable_missing_num[j])
    cc = cat_rf_imputation(imputate_data, my_var_selected, dep_var= [variable_missing_num[j]], ntree=500, seed=123, method = 'reg')
    if j==0:
        mydata_imp_num = cc
    if j!=0:
        mydata_imp_num = pd.concat([mydata_imp_num, cc], axis=1)
    del(cc)

imputation for  patient_age
        patient_age
count  30121.000000
mean       6.995418
std        4.325345
min        0.000000
25%        3.000000
50%        7.000000
75%       11.000000
max       14.000000
Train data (30121, 68)
Score data (1427, 68)
imputation for  mothers_age
        mothers_age
count  25512.000000
mean      34.544646
std        9.845861
min       18.000000
25%       26.000000
50%       35.000000
75%       43.000000
max       51.000000
Train data (25512, 68)
Score data (6036, 68)
imputation for  fathers_age
        fathers_age
count  25562.000000
mean      41.920233
std       13.037442
min       20.000000
25%       30.000000
50%       42.000000
75%       53.000000
max       64.000000
Train data (25562, 68)
Score data (5986, 68)
imputation for  no._of_previous_abortion
       no._of_previous_abortion
count              27290.000000
mean                   2.006852
std                    1.411019
min                    0.000000
25%                    1.000000
50%     

In [81]:
mydata_imp_num.isnull().sum()

patient_age                                       0
mothers_age                                       0
fathers_age                                       0
no._of_previous_abortion                          0
white_blood_cell_count_thousand_per_microliter    0
dtype: int64

In [82]:
mydata_imp_num.head()

Unnamed: 0_level_0,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PID0x6418,2.0,34.764493,41.668601,2.073244,9.857562
PID0x25d5,4.0,34.328428,23.0,2.007323,5.52256
PID0x4a82,6.0,41.0,22.0,4.0,7.519419
PID0x4ac8,12.0,21.0,41.734429,1.0,7.919321
PID0x1bf7,11.0,32.0,42.086319,4.0,4.09821


In [83]:
variable_missing_cat

['inherited_from_father',
 'maternal_gene',
 'respiratory_rate_breathsmin',
 'heart_rate_ratesmin',
 'birth_asphyxia',
 'autopsy_shows_birth_defect_if_applicable',
 'folic_acid_details_periconceptional',
 'ho_serious_maternal_illness',
 'ho_radiation_exposure_xray',
 'ho_substance_abuse',
 'assisted_conception_ivfart',
 'history_of_anomalies_in_previous_pregnancies',
 'birth_defects',
 'blood_test_result']

In [84]:
### Categorical variable imputation
for j in range(len(variable_missing_cat)):
    print("imputation for ",variable_missing_cat[j])
    cc = cat_rf_imputation(imputate_data, my_var_selected, dep_var= [variable_missing_cat[j]], ntree=500, seed=123, method = 'class')
    if j==0:
        mydata_imp_cat = cc
    if j!=0:
        mydata_imp_cat = pd.concat([mydata_imp_cat, cc], axis=1)
    del(cc)

imputation for  inherited_from_father
inherited_from_father
No                       18576
Yes                      12115
dtype: int64
Train data (30691, 68)
Score data (857, 68)
['inherited_from_father'] accuaracy is  0.5329992355531223
imputation for  maternal_gene
maternal_gene
Yes              13902
No               11113
dtype: int64
Train data (25015, 68)
Score data (6533, 68)
['maternal_gene'] accuaracy is  0.523286925549305
imputation for  respiratory_rate_breathsmin
respiratory_rate_breathsmin
Normal (30-60)                 12328
Tachypnea                      12080
dtype: int64
Train data (24408, 68)
Score data (7140, 68)
['respiratory_rate_breathsmin'] accuaracy is  0.4970260757225985
imputation for  heart_rate_ratesmin
heart_rate_ratesmin
Normal                 12408
Tachycardia            12053
dtype: int64
Train data (24461, 68)
Score data (7087, 68)
['heart_rate_ratesmin'] accuaracy is  0.49596726477006514
imputation for  birth_asphyxia
birth_asphyxia
Not available     1

In [85]:
mydata_imp_cat.head()

Unnamed: 0_level_0,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,birth_asphyxia,autopsy_shows_birth_defect_if_applicable,folic_acid_details_periconceptional,ho_serious_maternal_illness,ho_radiation_exposure_xray,ho_substance_abuse,assisted_conception_ivfart,history_of_anomalies_in_previous_pregnancies,birth_defects,blood_test_result
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
PID0x6418,No,Yes,Normal (30-60),Normal,Yes,Not applicable,No,No,No,No,No,Yes,Multiple,inconclusive
PID0x25d5,Yes,No,Tachypnea,Normal,No,No,Yes,Yes,Not applicable,Not applicable,No,Yes,Multiple,normal
PID0x4a82,No,No,Normal (30-60),Tachycardia,Not available,Not applicable,Yes,No,Yes,No,Yes,Yes,Singular,normal
PID0x4ac8,No,Yes,Tachypnea,Normal,Not available,No,No,Yes,Yes,Not applicable,No,Yes,Singular,inconclusive
PID0x1bf7,No,Yes,Tachypnea,Tachycardia,Not available,Not applicable,No,Yes,Not applicable,Not applicable,Yes,No,Multiple,inconclusive


In [94]:
col = imputate_data.columns
col

Index(['patient_id', 'patient_age', 'genes_in_mothers_side',
       'inherited_from_father', 'maternal_gene', 'paternal_gene',
       'blood_cell_count_mcl', 'patient_first_name', 'family_name',
       'fathers_name', 'mothers_age', 'fathers_age', 'institute_name',
       'location_of_institute', 'status', 'respiratory_rate_breathsmin',
       'heart_rate_ratesmin', 'test_1', 'test_2', 'test_3', 'test_4', 'test_5',
       'parental_consent', 'followup', 'gender', 'birth_asphyxia',
       'autopsy_shows_birth_defect_if_applicable', 'place_of_birth',
       'folic_acid_details_periconceptional', 'ho_serious_maternal_illness',
       'ho_radiation_exposure_xray', 'ho_substance_abuse',
       'assisted_conception_ivfart',
       'history_of_anomalies_in_previous_pregnancies',
       'no._of_previous_abortion', 'birth_defects',
       'white_blood_cell_count_thousand_per_microliter', 'blood_test_result',
       'symptom_1', 'symptom_2', 'symptom_3', 'symptom_4', 'symptom_5',
       'genetic

In [95]:
missing_flag = [col for col in  imputate_data.columns if 'mcap_' in col]
other_flag = [col for col in  imputate_data.columns if 'flag_' in col]
other_flag

['flag_ho_serious_maternal_folic_acid',
 'flag_ho_substance_abuse_radiation',
 'flag_history_of_anomalies_ivfart']

In [88]:
mydata_imp_cat['maternal_gene'].value_counts()

Yes    17384
No     14164
Name: maternal_gene, dtype: int64

In [105]:
imputate_data[variable_no_missing_cat]

Unnamed: 0_level_0,genes_in_mothers_side,paternal_gene,status,parental_consent,followup,gender,place_of_birth,myflag,test_1to5,symptom_1to_5
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
PID0x6418,Yes,No,Alive,Yes,High,Missing,Institute,train,0.0_9.0_9.0_1.0_0.0,1.0_1.0_1.0_1.0_1.0
PID0x25d5,Yes,No,Deceased,Yes,High,Missing,Not available,train,9.0_0.0_0.0_1.0_0.0,1.0_0.0_1.0_1.0_0.0
PID0x4a82,Yes,No,Alive,Yes,Low,Missing,Not available,train,0.0_0.0_0.0_1.0_0.0,0.0_1.0_1.0_1.0_1.0
PID0x4ac8,Yes,No,Deceased,Yes,High,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_1.0_0.0_0.0
PID0x1bf7,Yes,Yes,Alive,No,Low,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_0.0_0.0_0.0
...,...,...,...,...,...,...,...,...,...,...
PID0x81e1,Yes,No,Alive,Yes,Low,Missing,Institute,test,9.0_0.0_9.0_1.0_9.0,0.0_1.0_1.0_1.0_0.0
PID0x3514,Yes,Yes,Deceased,Yes,Missing,Male,Institute,test,0.0_0.0_9.0_1.0_9.0,0.0_1.0_0.0_1.0_1.0
PID0x5408,No,No,Deceased,Yes,Missing,Female,Institute,test,9.0_0.0_0.0_1.0_0.0,0.0_1.0_0.0_0.0_1.0
PID0x2017,No,No,Alive,Yes,High,Missing,Home,test,0.0_0.0_0.0_1.0_0.0,1.0_1.0_1.0_1.0_1.0


In [162]:
### Lets Combine all the data 
missing_flag = [col for col in  imputate_data.columns if 'mcap_' in col]
other_flag = [col for col in  imputate_data.columns if 'flag_' in col]
keep_var_list = variable_no_missing_num+variable_no_missing_cat+['genetic_disorder','disorder_subclass']
print(imputate_data[keep_var_list].isnull().sum())
mymaster = pd.concat([mydata_imp_num, mydata_imp_cat, imputate_data[keep_var_list]], axis=1)
#mymaster.to_csv("after_imputation_master_data.csv")
data_dictinary = summary_stat(mymaster)
data_dictinary = data_dictinary.reset_index()
all_variable = df_master.columns.to_list()
data_dictinary

blood_cell_count_mcl                                     0
test_1                                                   0
test_2                                                   0
test_3                                                   0
test_4                                                   0
test_5                                                   0
symptom_1                                                0
symptom_2                                                0
symptom_3                                                0
symptom_4                                                0
symptom_5                                                0
mcap_institute_name                                      0
mcap_respiratory_rate_breathsmin                         0
mcap_heart_rate_ratesmin                                 0
mcap_followup                                            0
mcap_gender                                              0
mcap_birth_asphyxia                                     

Unnamed: 0,index,Column Type,Number of Unique,Total missing,missing percentage
0,patient_age,float64,1442,0,0.00
1,mothers_age,float64,6070,0,0.00
2,fathers_age,float64,6031,0,0.00
3,no._of_previous_abortion,float64,4263,0,0.00
4,white_blood_cell_count_thousand_per_microliter,float64,25801,0,0.00
...,...,...,...,...,...
57,myflag,object,2,0,0.00
58,test_1to5,object,32,0,0.00
59,symptom_1to_5,object,32,0,0.00
60,genetic_disorder,object,3,11611,36.80


In [163]:
mytarget.head()

Unnamed: 0,patient_id,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1
0,PID0x6418,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders
1,PID0x25d5,missing,Cystic fibrosis,Single-gene inheritance diseases
2,PID0x4a82,Multifactorial genetic inheritance disorders,Diabetes,Multifactorial genetic inheritance disorders
3,PID0x4ac8,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders
4,PID0x1bf7,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders


In [164]:
mymaster = mymaster.reset_index()
mymaster.head()

Unnamed: 0,patient_id,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,...,status,parental_consent,followup,gender,place_of_birth,myflag,test_1to5,symptom_1to_5,genetic_disorder,disorder_subclass
0,PID0x6418,2.0,34.764493,41.668601,2.073244,9.857562,No,Yes,Normal (30-60),Normal,...,Alive,Yes,High,Missing,Institute,train,0.0_9.0_9.0_1.0_0.0,1.0_1.0_1.0_1.0_1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,34.328428,23.0,2.007323,5.52256,Yes,No,Tachypnea,Normal,...,Deceased,Yes,High,Missing,Not available,train,9.0_0.0_0.0_1.0_0.0,1.0_0.0_1.0_1.0_0.0,,Cystic fibrosis
2,PID0x4a82,6.0,41.0,22.0,4.0,7.519419,No,No,Normal (30-60),Tachycardia,...,Alive,Yes,Low,Missing,Not available,train,0.0_0.0_0.0_1.0_0.0,0.0_1.0_1.0_1.0_1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,21.0,41.734429,1.0,7.919321,No,Yes,Tachypnea,Normal,...,Deceased,Yes,High,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_1.0_0.0_0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,32.0,42.086319,4.0,4.09821,No,Yes,Tachypnea,Tachycardia,...,Alive,No,Low,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_0.0_0.0_0.0,Multifactorial genetic inheritance disorders,Cancer


In [165]:
mytarget.head()

Unnamed: 0,patient_id,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1
0,PID0x6418,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders
1,PID0x25d5,missing,Cystic fibrosis,Single-gene inheritance diseases
2,PID0x4a82,Multifactorial genetic inheritance disorders,Diabetes,Multifactorial genetic inheritance disorders
3,PID0x4ac8,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders
4,PID0x1bf7,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders


In [166]:
mymaster_v1 = mymaster.merge(mytarget, how='left', on='patient_id')
mymaster_v1.head()

Unnamed: 0,patient_id,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,...,gender,place_of_birth,myflag,test_1to5,symptom_1to_5,genetic_disorder,disorder_subclass,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1
0,PID0x6418,2.0,34.764493,41.668601,2.073244,9.857562,No,Yes,Normal (30-60),Normal,...,Missing,Institute,train,0.0_9.0_9.0_1.0_0.0,1.0_1.0_1.0_1.0_1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders
1,PID0x25d5,4.0,34.328428,23.0,2.007323,5.52256,Yes,No,Tachypnea,Normal,...,Missing,Not available,train,9.0_0.0_0.0_1.0_0.0,1.0_0.0_1.0_1.0_0.0,,Cystic fibrosis,missing,Cystic fibrosis,Single-gene inheritance diseases
2,PID0x4a82,6.0,41.0,22.0,4.0,7.519419,No,No,Normal (30-60),Tachycardia,...,Missing,Not available,train,0.0_0.0_0.0_1.0_0.0,0.0_1.0_1.0_1.0_1.0,Multifactorial genetic inheritance disorders,Diabetes,Multifactorial genetic inheritance disorders,Diabetes,Multifactorial genetic inheritance disorders
3,PID0x4ac8,12.0,21.0,41.734429,1.0,7.919321,No,Yes,Tachypnea,Normal,...,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_1.0_0.0_0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders
4,PID0x1bf7,11.0,32.0,42.086319,4.0,4.09821,No,Yes,Tachypnea,Tachycardia,...,Male,Institute,train,0.0_0.0_0.0_1.0_0.0,0.0_0.0_0.0_0.0_0.0,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders,Cancer,Multifactorial genetic inheritance disorders


In [167]:
### white_blood_cell_count_thousand_per_microliter Treat -99 as missing
mymaster_v1['white_blood_cell_count_thousand_per_microliter'] = np.where(mymaster_v1['white_blood_cell_count_thousand_per_microliter']==-99, np.nan, mymaster_v1['white_blood_cell_count_thousand_per_microliter'])
mean_val = mymaster_v1[mymaster_v1.myflag=='train']['white_blood_cell_count_thousand_per_microliter'].mean()
mymaster_v1['white_blood_cell_count_thousand_per_microliter'] = mymaster_v1['white_blood_cell_count_thousand_per_microliter'].fillna(mean_val)

In [169]:
mymaster_v1.to_csv("mymaster_new_v1.csv", index=False)

In [170]:
#### Create Descrete features from numberical columns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

In [171]:
# transform the dataset with KBinsDiscretizer
X = mymaster_v1[['patient_age','mothers_age','fathers_age','no._of_previous_abortion','white_blood_cell_count_thousand_per_microliter','blood_cell_count_mcl']].copy()
enc = KBinsDiscretizer(n_bins=9, encode='onehot')
X_binned = enc.fit_transform(X)



In [172]:
X_binned = pd.DataFrame.sparse.from_spmatrix(X_binned)
X_binned.columns =  ["kbin5_c"+str(i) for i in range(49)]

In [173]:
X_binned.head()

Unnamed: 0,kbin5_c0,kbin5_c1,kbin5_c2,kbin5_c3,kbin5_c4,kbin5_c5,kbin5_c6,kbin5_c7,kbin5_c8,kbin5_c9,...,kbin5_c39,kbin5_c40,kbin5_c41,kbin5_c42,kbin5_c43,kbin5_c44,kbin5_c45,kbin5_c46,kbin5_c47,kbin5_c48
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [175]:
mymaster_v2 = pd.concat([mymaster_v1, X_binned], axis=1)
mymaster_v2.to_csv('mymaster_new_v2.csv', index=False)

In [181]:
cat_var = list(data_dictinary[data_dictinary['Column Type']=='object']['index'])
cat_var.remove('myflag')
cat_var.remove('disorder_subclass')
cat_var.remove('genetic_disorder')
mymaster_v2[cat_var].head()

Unnamed: 0,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,birth_asphyxia,autopsy_shows_birth_defect_if_applicable,folic_acid_details_periconceptional,ho_serious_maternal_illness,ho_radiation_exposure_xray,ho_substance_abuse,...,blood_test_result,genes_in_mothers_side,paternal_gene,status,parental_consent,followup,gender,place_of_birth,test_1to5,symptom_1to_5
0,No,Yes,Normal (30-60),Normal,Yes,Not applicable,No,No,No,No,...,inconclusive,Yes,No,Alive,Yes,High,Missing,Institute,0.0_9.0_9.0_1.0_0.0,1.0_1.0_1.0_1.0_1.0
1,Yes,No,Tachypnea,Normal,No,No,Yes,Yes,Not applicable,Not applicable,...,normal,Yes,No,Deceased,Yes,High,Missing,Not available,9.0_0.0_0.0_1.0_0.0,1.0_0.0_1.0_1.0_0.0
2,No,No,Normal (30-60),Tachycardia,Not available,Not applicable,Yes,No,Yes,No,...,normal,Yes,No,Alive,Yes,Low,Missing,Not available,0.0_0.0_0.0_1.0_0.0,0.0_1.0_1.0_1.0_1.0
3,No,Yes,Tachypnea,Normal,Not available,No,No,Yes,Yes,Not applicable,...,inconclusive,Yes,No,Deceased,Yes,High,Male,Institute,0.0_0.0_0.0_1.0_0.0,0.0_0.0_1.0_0.0_0.0
4,No,Yes,Tachypnea,Tachycardia,Not available,Not applicable,No,Yes,Not applicable,Not applicable,...,inconclusive,Yes,Yes,Alive,No,Low,Male,Institute,0.0_0.0_0.0_1.0_0.0,0.0_0.0_0.0_0.0_0.0


In [182]:
### Count features for Cat varible
cat_count_features = []
for c in cat_var:
    d = mymaster_v2[c].value_counts().to_dict()
    mymaster_v2['%s_count'%c] = mymaster_v2[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)    

In [183]:
mymaster_v2.to_csv("mymaster_new_v3.csv", index=False)

In [184]:
#### Create dummy variables
dummy_var = my_dummy_var(mymaster_v2, cat_var)
dummy_var.head()

Unnamed: 0,dummy_inherited_from_father_no,dummy_inherited_from_father_yes,dummy_maternal_gene_no,dummy_maternal_gene_yes,dummy_respiratory_rate_breathsmin_normal (30_60),dummy_respiratory_rate_breathsmin_tachypnea,dummy_heart_rate_ratesmin_normal,dummy_heart_rate_ratesmin_tachycardia,dummy_birth_asphyxia_no,dummy_birth_asphyxia_not available,...,dummy_symptom_1to_5_1.0_0.0_1.0_1.0_0.0,dummy_symptom_1to_5_1.0_0.0_1.0_1.0_1.0,dummy_symptom_1to_5_1.0_1.0_0.0_0.0_0.0,dummy_symptom_1to_5_1.0_1.0_0.0_0.0_1.0,dummy_symptom_1to_5_1.0_1.0_0.0_1.0_0.0,dummy_symptom_1to_5_1.0_1.0_0.0_1.0_1.0,dummy_symptom_1to_5_1.0_1.0_1.0_0.0_0.0,dummy_symptom_1to_5_1.0_1.0_1.0_0.0_1.0,dummy_symptom_1to_5_1.0_1.0_1.0_1.0_0.0,dummy_symptom_1to_5_1.0_1.0_1.0_1.0_1.0
0,1,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,1,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,1,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [185]:
dummy_var.shape

(31548, 116)

In [187]:
mymaster_v3 = pd.concat([mymaster_v2, dummy_var], axis=1)
mymaster_v3.to_csv('mymaster_new_v3.csv', index=False)

#### Missing Target Value predicitions - Subclass

In [418]:
#### data prep
train_df = mymaster_v1[mymaster_v1.myflag=='train'].copy()
print('train data', train_df.shape)
score_df = mymaster_v1[mymaster_v1.myflag!='train'].copy()
print('score data', score_df.shape)


train data (22083, 44)
score data (9465, 44)


In [420]:
train_df.genetic_disorder_v1.value_counts()

Mitochondrial genetic inheritance disorders     11174
Single-gene inheritance diseases                 8371
Multifactorial genetic inheritance disorders     2260
missing                                           278
Name: genetic_disorder_v1, dtype: int64

In [424]:
train_df.disorder_subclass_v1.value_counts()

Leigh syndrome                         5160
Mitochondrial myopathy                 4405
Cystic fibrosis                        3448
Tay-Sachs                              2833
missing                                2168
Diabetes                               1817
Hemochromatosis                        1355
Leber's hereditary optic neuropathy     648
Alzheimer's                             152
Cancer                                   97
Name: disorder_subclass_v1, dtype: int64

In [426]:
## Predict Target class which are missing
train_df_v1 = train_df[train_df.genetic_disorder_v1 != 'missing'].copy()
print(train_df_v1.shape)
temp_train = train_df_v1[train_df_v1.disorder_subclass_v1!='missing'].copy()
print(temp_train.shape)
temp_score = train_df_v1[train_df_v1.disorder_subclass_v1=='missing'].copy()
print(temp_score.shape)
#train_df_v1['traget_subclass'] = np.where(train_df_v1.disorder_subclass_v1=='missing')

(21805, 44)
(19915, 44)
(1890, 44)


In [448]:
drop_var_from = ['myflag', 'genetic_disorder', 'disorder_subclass', 'genetic_disorder_old','patient_id','disorder_subclass_v1']

In [449]:
data_dictinary = summary_stat(train_df_v1)
data_dictinary = data_dictinary.reset_index()
data_dictinary = data_dictinary[~data_dictinary['index'].isin(drop_var_from)]

my_var_selected = set(data_dictinary['index'].to_list())

print("----------------------------------------------------------------")
total_cat_var = data_dictinary[data_dictinary['Column Type']=='object']['index'].to_list()
print("Total categorical variable: ", len(total_cat_var))
total_num_var = data_dictinary[data_dictinary['Column Type']!='object']['index'].to_list()
print("Total numerical variable: ", len(total_num_var))


----------------------------------------------------------------
Total categorical variable:  22
Total numerical variable:  16


In [450]:
dummy_var = my_dummy_var(train_df_v1, total_cat_var)
dummy_var.head()

Unnamed: 0,dummy_inherited_from_father_no,dummy_inherited_from_father_yes,dummy_maternal_gene_no,dummy_maternal_gene_yes,dummy_respiratory_rate_breathsmin_normal (30_60),dummy_respiratory_rate_breathsmin_tachypnea,dummy_heart_rate_ratesmin_normal,dummy_heart_rate_ratesmin_tachycardia,dummy_gender_ambiguous,dummy_gender_female,...,dummy_status_alive,dummy_status_deceased,dummy_parental_consent_no,dummy_parental_consent_yes,dummy_followup_high,dummy_followup_low,dummy_followup_missing,dummy_genetic_disorder_v1_mitochondrial genetic inheritance disorders,dummy_genetic_disorder_v1_multifactorial genetic inheritance disorders,dummy_genetic_disorder_v1_single_gene inheritance diseases
0,1,0,0,1,1,0,1,0,0,0,...,1,0,0,1,1,0,0,1,0,0
1,0,1,1,0,0,1,1,0,1,0,...,0,1,0,1,1,0,0,0,0,1
2,1,0,1,0,1,0,0,1,0,1,...,1,0,0,1,0,1,0,0,1,0
3,1,0,0,1,0,1,1,0,0,0,...,0,1,0,1,1,0,0,1,0,0
4,1,0,0,1,0,1,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0


In [452]:
temp_train = pd.concat([train_df_v1[['patient_id','disorder_subclass_v1']],train_df_v1[total_num_var], dummy_var], axis=1)
temp_train.shape

(21805, 71)

In [454]:
temp_train.disorder_subclass_v1.value_counts()

Leigh syndrome                         5160
Mitochondrial myopathy                 4405
Cystic fibrosis                        3448
Tay-Sachs                              2833
missing                                1890
Diabetes                               1817
Hemochromatosis                        1355
Leber's hereditary optic neuropathy     648
Alzheimer's                             152
Cancer                                   97
Name: disorder_subclass_v1, dtype: int64

In [455]:
temp_train1 = temp_train[temp_train.disorder_subclass_v1!='missing'].copy()
print(temp_train1.shape)
temp_score = temp_train[temp_train.disorder_subclass_v1=='missing'].copy()
print(temp_score.shape)

(19915, 71)
(1890, 71)


In [537]:
X = temp_train1.drop(columns=['patient_id','disorder_subclass_v1'],axis=1)
Z = temp_score.drop(columns=['patient_id','disorder_subclass_v1'],axis=1)
y = temp_train1.disorder_subclass_v1.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 
Counter(y_train)

Counter({'Tay-Sachs': 2304,
         "Leber's hereditary optic neuropathy": 510,
         'Leigh syndrome': 4153,
         'Mitochondrial myopathy': 3505,
         'Hemochromatosis': 1088,
         'Cystic fibrosis': 2734,
         'Diabetes': 1441,
         'Cancer': 78,
         "Alzheimer's": 119})

In [458]:
Counter(y_test)

Counter({"Leber's hereditary optic neuropathy": 138,
         'Diabetes': 376,
         'Leigh syndrome': 1007,
         'Hemochromatosis': 267,
         'Cystic fibrosis': 714,
         'Mitochondrial myopathy': 900,
         'Tay-Sachs': 529,
         'Cancer': 19,
         "Alzheimer's": 33})

In [533]:
for j in [100, 500, 700, 900, 1000, 1200, 1500, 2000, 3000]:
    print(j, 'tree RF -------------------------\n')
    target_imp_rf = BalancedRandomForestClassifier(n_estimators = j, random_state=0,verbose=0,max_depth=12,
                                         sampling_strategy = {'Tay-Sachs': 200,
         "Leber's hereditary optic neuropathy": 100,
         'Leigh syndrome': 250,
         'Mitochondrial myopathy': 250,
         'Hemochromatosis': 150,
         'Cystic fibrosis': 150,
         'Diabetes': 150,
         'Cancer': 75,
         "Alzheimer's": 80},n_jobs = -1)
    
    target_imp_rf.fit(X_train, y_train)
    y_pred=target_imp_rf.predict(X_test)
    print("Subclass pred bal accuaracy is ", balanced_accuracy_score(y_test, y_pred))

100 tree RF -------------------------

Subclass pred bal accuaracy is  0.6569224518391271
500 tree RF -------------------------

Subclass pred bal accuaracy is  0.6697080242674782
700 tree RF -------------------------

Subclass pred bal accuaracy is  0.6758351535541555
900 tree RF -------------------------

Subclass pred bal accuaracy is  0.6747445658397822
1000 tree RF -------------------------

Subclass pred bal accuaracy is  0.6760695766891124
1200 tree RF -------------------------

Subclass pred bal accuaracy is  0.6777804292504457
1500 tree RF -------------------------

Subclass pred bal accuaracy is  0.6754235704887193
2000 tree RF -------------------------

Subclass pred bal accuaracy is  0.6732693853150638
3000 tree RF -------------------------

Subclass pred bal accuaracy is  0.6687706766545751


In [534]:
## best tree is 1200 
for j in [2, 4, 6, 8, 9, 11, 12, 14, 18,25,30]:
    print(j, 'depth of RF -------------------------\n')
    target_imp_rf = BalancedRandomForestClassifier(n_estimators = 1200, random_state=0,verbose=0,max_depth=j,
                                         sampling_strategy = {'Tay-Sachs': 200,
         "Leber's hereditary optic neuropathy": 100,
         'Leigh syndrome': 250,
         'Mitochondrial myopathy': 250,
         'Hemochromatosis': 150,
         'Cystic fibrosis': 150,
         'Diabetes': 150,
         'Cancer': 75,
         "Alzheimer's": 80},n_jobs = -1)
    
    target_imp_rf.fit(X_train, y_train)
    y_pred=target_imp_rf.predict(X_test)
    print("Subclass pred bal accuaracy is ", balanced_accuracy_score(y_test, y_pred))

2 depth of RF -------------------------

Subclass pred bal accuaracy is  0.48025100569485946
4 depth of RF -------------------------

Subclass pred bal accuaracy is  0.576427687400754
6 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6221172697786824
8 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6530990232396312
9 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6590963025382776
11 depth of RF -------------------------

Subclass pred bal accuaracy is  0.67151983738857
12 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6777804292504457
14 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6705363223310634
18 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6747509583739499
25 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6743699504312989
30 depth of RF -------------------------

Subclass pred bal accuaracy is  0.6

In [535]:
target_imp_rf = BalancedRandomForestClassifier(n_estimators = 1200, random_state=0,verbose=1,max_depth=12,
                                         sampling_strategy = {'Tay-Sachs': 200,
         "Leber's hereditary optic neuropathy": 100,
         'Leigh syndrome': 250,
         'Mitochondrial myopathy': 250,
         'Hemochromatosis': 150,
         'Cystic fibrosis': 150,
         'Diabetes': 150,
         'Cancer': 75,
         "Alzheimer's": 80},n_jobs = -1)

target_imp_rf.fit(X_train, y_train)
y_pred=target_imp_rf.predict(X_test)
print("Subclass pred bal accuaracy is ", balanced_accuracy_score(y_test, y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:    7.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.4s


Subclass pred bal accuaracy is  0.6777804292504457


[Parallel(n_jobs=12)]: Done 1200 out of 1200 | elapsed:    0.7s finished


In [538]:
### Predict Subclass
y_score=target_imp_rf.predict(Z)
Counter(y_score)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 426 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 1200 out of 1200 | elapsed:    0.5s finished


Counter({'Tay-Sachs': 393,
         'Cystic fibrosis': 279,
         'Diabetes': 177,
         'Leigh syndrome': 399,
         'Mitochondrial myopathy': 538,
         "Leber's hereditary optic neuropathy": 24,
         'Hemochromatosis': 63,
         'Cancer': 8,
         "Alzheimer's": 9})

In [541]:
temp_score['disorder_subclass_v1_p'] = y_score
temp_score1 = temp_score[['patient_id', 'disorder_subclass_v1_p']].copy()
temp_score1.index = temp_score1.patient_id
temp_score1.head()

Unnamed: 0_level_0,patient_id,disorder_subclass_v1_p
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PID0x42c5,PID0x42c5,Tay-Sachs
PID0x6c8c,PID0x6c8c,Cystic fibrosis
PID0x322f,PID0x322f,Tay-Sachs
PID0x1e91,PID0x1e91,Cystic fibrosis
PID0x4b8,PID0x4b8,Diabetes


In [546]:
temp_score1.shape

(1890, 2)

In [547]:
### Add preidcted Sub class into master data

print(Counter(train_df_v1.disorder_subclass_v1))
print(Counter(train_df_v1.genetic_disorder_v1))
train_df_v1.index = train_df_v1.patient_id
train_df_v1.drop(columns=['patient_id'], axis=1, inplace=True)


Counter({'Leigh syndrome': 5160, 'Mitochondrial myopathy': 4405, 'Cystic fibrosis': 3448, 'Tay-Sachs': 2833, 'missing': 1890, 'Diabetes': 1817, 'Hemochromatosis': 1355, "Leber's hereditary optic neuropathy": 648, "Alzheimer's": 152, 'Cancer': 97})
Counter({'Mitochondrial genetic inheritance disorders': 11174, 'Single-gene inheritance diseases': 8371, 'Multifactorial genetic inheritance disorders': 2260})


In [552]:
Counter(train_df_v1.loc[temp_score1.index, 'disorder_subclass_v1'])
train_df_v1.loc[temp_score1.index, 'disorder_subclass_v1'] = temp_score1.disorder_subclass_v1_p

In [554]:
pd.crosstab(train_df_v1.disorder_subclass_v1, train_df_v1.genetic_disorder_v1)

genetic_disorder_v1,Mitochondrial genetic inheritance disorders,Multifactorial genetic inheritance disorders,Single-gene inheritance diseases
disorder_subclass_v1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alzheimer's,0,161,0
Cancer,0,105,0
Cystic fibrosis,0,0,3727
Diabetes,0,1994,0
Hemochromatosis,0,0,1418
Leber's hereditary optic neuropathy,672,0,0
Leigh syndrome,5559,0,0
Mitochondrial myopathy,4943,0,0
Tay-Sachs,0,0,3226


In [556]:
## Master data
train_df_v1.shape

(21805, 43)

In [559]:
score_df.index = score_df.patient_id
score_df.drop(columns=['patient_id'], axis=1, inplace=True)

In [566]:
final_data_master = pd.concat([train_df_v1, score_df], axis=0)
final_data_master.shape

(31270, 43)

In [675]:
final_data_master.tail()

Unnamed: 0_level_0,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,gender,...,status,parental_consent,followup,myflag,genetic_disorder,disorder_subclass,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1,mytarget
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PID0x81e1,9.0,28.0,63.0,2.0,7.23496,Yes,No,Tachypnea,Normal,Female,...,Alive,Yes,Low,test,,,,,,
PID0x3514,1.0,37.0,62.0,2.003024,4.859536,No,No,Tachypnea,Normal,Male,...,Deceased,Yes,Missing,test,,,,,,
PID0x5408,2.0,24.0,32.0,3.0,5.696062,Yes,No,Tachypnea,Tachycardia,Female,...,Deceased,Yes,Missing,test,,,,,,
PID0x2017,13.0,36.0,56.0,1.0,3.0,Yes,No,Normal (30-60),Normal,Ambiguous,...,Alive,Yes,High,test,,,,,,
PID0x7f61,12.0,40.0,35.0,0.0,7.492765,No,Yes,Normal (30-60),Normal,Ambiguous,...,Deceased,Yes,Low,test,,,,,,


In [567]:
final_data_master.to_csv("final_data_master.csv")

In [7]:
final_data_master = pd.read_csv("final_data_master.csv")

### Model Engine - Start

In [10]:
### H2o Automl model
mydata_dictinary = summary_stat(final_data_master)
mydata_dictinary = mydata_dictinary.reset_index()
mydata_dictinary

Unnamed: 0,index,Column Type,Number of Unique,Total missing,missing percentage
0,patient_id,object,31270,0,0.0
1,patient_age,float64,1423,0,0.0
2,mothers_age,float64,5977,0,0.0
3,fathers_age,float64,5911,0,0.0
4,no._of_previous_abortion,float64,4218,0,0.0
5,white_blood_cell_count_thousand_per_microliter,float64,25557,0,0.0
6,inherited_from_father,object,2,0,0.0
7,maternal_gene,object,2,0,0.0
8,respiratory_rate_breathsmin,object,2,0,0.0
9,heart_rate_ratesmin,object,2,0,0.0


In [11]:
## Define New target variable 
final_data_master['mytarget'] = final_data_master['genetic_disorder_v1']+str('::')+final_data_master['disorder_subclass_v1']
final_data_master['mytarget'].value_counts()

Mitochondrial genetic inheritance disorders::Leigh syndrome                         5559
Mitochondrial genetic inheritance disorders::Mitochondrial myopathy                 4943
Single-gene inheritance diseases::Cystic fibrosis                                   3727
Single-gene inheritance diseases::Tay-Sachs                                         3226
Multifactorial genetic inheritance disorders::Diabetes                              1994
Single-gene inheritance diseases::Hemochromatosis                                   1418
Mitochondrial genetic inheritance disorders::Leber's hereditary optic neuropathy     672
Multifactorial genetic inheritance disorders::Alzheimer's                            161
Multifactorial genetic inheritance disorders::Cancer                                 105
Name: mytarget, dtype: int64

In [17]:
exclude_list = ['genetic_disorder','disorder_subclass','genetic_disorder_old','disorder_subclass_v1','genetic_disorder_v1','myflag','patient_id']
mydata_dictinary = mydata_dictinary[~mydata_dictinary['index'].isin(exclude_list)]
cat_var = mydata_dictinary[mydata_dictinary['Column Type']=='object']['index']
num_var = mydata_dictinary[mydata_dictinary['Column Type']!='object']['index']

In [18]:
my_train = final_data_master[final_data_master.myflag=='train']
print('final training sample size', my_train.shape)
my_score = final_data_master[final_data_master.myflag!='train']
print('final Scoring sample size', my_score.shape)

final training sample size (21805, 45)
final Scoring sample size (9465, 45)


In [14]:
my_score.tail()

Unnamed: 0,patient_id,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,inherited_from_father,maternal_gene,respiratory_rate_breathsmin,heart_rate_ratesmin,...,status,parental_consent,followup,myflag,genetic_disorder,disorder_subclass,genetic_disorder_old,disorder_subclass_v1,genetic_disorder_v1,mytarget
31265,PID0x81e1,9.0,28.0,63.0,2.0,7.23496,Yes,No,Tachypnea,Normal,...,Alive,Yes,Low,test,,,,,,
31266,PID0x3514,1.0,37.0,62.0,2.003024,4.859536,No,No,Tachypnea,Normal,...,Deceased,Yes,Missing,test,,,,,,
31267,PID0x5408,2.0,24.0,32.0,3.0,5.696062,Yes,No,Tachypnea,Tachycardia,...,Deceased,Yes,Missing,test,,,,,,
31268,PID0x2017,13.0,36.0,56.0,1.0,3.0,Yes,No,Normal (30-60),Normal,...,Alive,Yes,High,test,,,,,,
31269,PID0x7f61,12.0,40.0,35.0,0.0,7.492765,No,Yes,Normal (30-60),Normal,...,Deceased,Yes,Low,test,,,,,,


In [19]:
### Create dummy featues
df_dummy_fetures = my_dummy_var(final_data_master, cat_var)
df_dummy_fetures.head(3)
df_dummy_fetures.shape

(31270, 51)

In [20]:
cat_var

6                            inherited_from_father
7                                    maternal_gene
8                      respiratory_rate_breathsmin
9                              heart_rate_ratesmin
10                                          gender
11                                  birth_asphyxia
12        autopsy_shows_birth_defect_if_applicable
13                                  place_of_birth
14             folic_acid_details_periconceptional
15                     ho_serious_maternal_illness
16                      ho_radiation_exposure_xray
17                              ho_substance_abuse
18                      assisted_conception_ivfart
19    history_of_anomalies_in_previous_pregnancies
20                                   birth_defects
21                               blood_test_result
33                           genes_in_mothers_side
34                                   paternal_gene
35                                          status
36                             

In [22]:
master_v1 = pd.concat([final_data_master[['patient_id','mytarget','disorder_subclass_v1','genetic_disorder_v1','myflag']],final_data_master[num_var], df_dummy_fetures], axis=1)
master_v1.shape


(31270, 72)

In [23]:
master_v1.head()

Unnamed: 0,patient_id,mytarget,disorder_subclass_v1,genetic_disorder_v1,myflag,patient_age,mothers_age,fathers_age,no._of_previous_abortion,white_blood_cell_count_thousand_per_microliter,...,dummy_genes_in_mothers_side_yes,dummy_paternal_gene_no,dummy_paternal_gene_yes,dummy_status_alive,dummy_status_deceased,dummy_parental_consent_no,dummy_parental_consent_yes,dummy_followup_high,dummy_followup_low,dummy_followup_missing
0,PID0x6418,Mitochondrial genetic inheritance disorders::L...,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders,train,2.0,34.674355,41.665379,2.035378,9.857562,...,1,1,0,1,0,0,1,1,0,0
1,PID0x25d5,Single-gene inheritance diseases::Cystic fibrosis,Cystic fibrosis,Single-gene inheritance diseases,train,4.0,34.363087,23.0,2.026659,5.52256,...,1,1,0,0,1,0,1,1,0,0
2,PID0x4a82,Multifactorial genetic inheritance disorders::...,Diabetes,Multifactorial genetic inheritance disorders,train,6.0,41.0,22.0,4.0,7.476777,...,1,1,0,1,0,0,1,0,1,0
3,PID0x4ac8,Mitochondrial genetic inheritance disorders::L...,Leigh syndrome,Mitochondrial genetic inheritance disorders,train,12.0,21.0,41.684783,1.0,7.919321,...,1,1,0,0,1,0,1,1,0,0
4,PID0x1bf7,Multifactorial genetic inheritance disorders::...,Cancer,Multifactorial genetic inheritance disorders,train,11.0,32.0,41.991621,4.0,4.09821,...,1,0,1,1,0,1,0,0,1,0


In [24]:
mytrain = master_v1[master_v1.myflag=='train'].copy()
print('mytrain',mytrain.shape )
myscore = master_v1[master_v1.myflag!='train'].copy()
print('Myscore',myscore.shape )


mytrain (21805, 72)
Myscore (9465, 72)


In [698]:
mynames = master_v1.columns.to_list()
mynames = set(mynames)-set(['mytarget','genetic_disorder_v1','disorder_subclass_v1','myflag'])
#mynames.remove()
y1 = mytrain.mytarget.values
X_train, X_test, y_train, y_test = train_test_split(mytrain[mynames], y1, test_size=0.20, random_state=12345, stratify=y1)

In [683]:
Counter(y_train)

Counter({'Single-gene inheritance diseases::Cystic fibrosis': 2982,
         'Mitochondrial genetic inheritance disorders::Leigh syndrome': 4447,
         'Mitochondrial genetic inheritance disorders::Mitochondrial myopathy': 3954,
         'Multifactorial genetic inheritance disorders::Diabetes': 1595,
         'Single-gene inheritance diseases::Hemochromatosis': 1134,
         'Single-gene inheritance diseases::Tay-Sachs': 2581,
         "Mitochondrial genetic inheritance disorders::Leber's hereditary optic neuropathy": 538,
         'Multifactorial genetic inheritance disorders::Cancer': 84,
         "Multifactorial genetic inheritance disorders::Alzheimer's": 129})

In [684]:
myfirst_rf = BalancedRandomForestClassifier(n_estimators = 1500, random_state=0,
                                            verbose=0,max_depth=14,n_jobs = -1)
myfirst_rf.fit(X_train,y_train)


BalancedRandomForestClassifier(max_depth=14, n_estimators=1500, n_jobs=-1,
                               random_state=0)

In [685]:
y_pred=myfirst_rf.predict(X_test)
print("Subclass pred bal accuaracy is ", balanced_accuracy_score(y_test, y_pred))

Subclass pred bal accuaracy is  0.39152757809619404


In [686]:
mypred = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
mypred.head()

Unnamed: 0,Actual,Predicted
0,Single-gene inheritance diseases::Cystic fibrosis,Multifactorial genetic inheritance disorders::...
1,Mitochondrial genetic inheritance disorders::L...,Single-gene inheritance diseases::Cystic fibrosis
2,Single-gene inheritance diseases::Cystic fibrosis,Mitochondrial genetic inheritance disorders::L...
3,Mitochondrial genetic inheritance disorders::M...,Single-gene inheritance diseases::Tay-Sachs
4,Mitochondrial genetic inheritance disorders::L...,Mitochondrial genetic inheritance disorders::M...


In [687]:
# new data frame with split value columns
new1 = mypred["Actual"].str.split("::", n = 1, expand = True)
new2 = mypred["Predicted"].str.split("::", n = 1, expand = True)

mypred['act_genetic_disorder'] = new1[0]
mypred['act_disorder_subclass'] = new1[1]
mypred['pred_genetic_disorder'] = new2[0]
mypred['pred_disorder_subclass'] = new2[1]
mypred

Unnamed: 0,Actual,Predicted,act_genetic_disorder,act_disorder_subclass,pred_genetic_disorder,pred_disorder_subclass
0,Single-gene inheritance diseases::Cystic fibrosis,Multifactorial genetic inheritance disorders::...,Single-gene inheritance diseases,Cystic fibrosis,Multifactorial genetic inheritance disorders,Alzheimer's
1,Mitochondrial genetic inheritance disorders::L...,Single-gene inheritance diseases::Cystic fibrosis,Mitochondrial genetic inheritance disorders,Leigh syndrome,Single-gene inheritance diseases,Cystic fibrosis
2,Single-gene inheritance diseases::Cystic fibrosis,Mitochondrial genetic inheritance disorders::L...,Single-gene inheritance diseases,Cystic fibrosis,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
3,Mitochondrial genetic inheritance disorders::M...,Single-gene inheritance diseases::Tay-Sachs,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy,Single-gene inheritance diseases,Tay-Sachs
4,Mitochondrial genetic inheritance disorders::L...,Mitochondrial genetic inheritance disorders::M...,Mitochondrial genetic inheritance disorders,Leigh syndrome,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
...,...,...,...,...,...,...
4356,Single-gene inheritance diseases::Tay-Sachs,Single-gene inheritance diseases::Hemochromatosis,Single-gene inheritance diseases,Tay-Sachs,Single-gene inheritance diseases,Hemochromatosis
4357,Single-gene inheritance diseases::Cystic fibrosis,Single-gene inheritance diseases::Cystic fibrosis,Single-gene inheritance diseases,Cystic fibrosis,Single-gene inheritance diseases,Cystic fibrosis
4358,Mitochondrial genetic inheritance disorders::L...,Mitochondrial genetic inheritance disorders::L...,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
4359,Single-gene inheritance diseases::Cystic fibrosis,Multifactorial genetic inheritance disorders::...,Single-gene inheritance diseases,Cystic fibrosis,Multifactorial genetic inheritance disorders,Alzheimer's


In [688]:
from sklearn.metrics import f1_score
s1 = max(0, 100*f1_score(mypred['act_genetic_disorder'], mypred['pred_genetic_disorder'], average='macro'))
s2 = max(0, 100*f1_score(mypred['act_disorder_subclass'], mypred['pred_disorder_subclass'], average='macro'))
s1/2+s2/2

31.70937912949502

In [627]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

# Import a sample binary outcome train/test set into H2O


Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.281-b09, mixed mode)
  Starting server from C:\Users\dubrangala\Anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\DUBRAN~1\AppData\Local\Temp\tmpawi7g9lf
  JVM stdout: C:\Users\DUBRAN~1\AppData\Local\Temp\tmpawi7g9lf\h2o_dubrangala_started_from_python.out
  JVM stderr: C:\Users\DUBRAN~1\AppData\Local\Temp\tmpawi7g9lf\h2o_dubrangala_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Kolkata
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.4
H2O_cluster_version_age:,17 days
H2O_cluster_name:,H2O_from_python_dubrangala_fixlb7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.052 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [704]:
#X_train1 = h2o(X_train)
mynames = master_v1.columns.to_list()
mynames = set(mynames)-set(['mytarget','genetic_disorder_v1','disorder_subclass_v1','myflag'])
#mynames.remove()
y1 = mytrain.mytarget.values
X_train1, X_test1, y_train, y_test = train_test_split(mytrain, y1, test_size=0.20, random_state=12345, stratify=y1)

X_train1 = h2o.H2OFrame(X_train1)
X_test1 = h2o.H2OFrame(X_test1)

# Run AutoML for 20 base models (limited to 1 hour max runtime by default)


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [705]:
# Identify predictors and response
x = list(mynames)
y = "mytarget"

In [641]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml1 = H2OAutoML(max_models=10, seed=123, stopping_metric = 'misclassification')
aml1.train(x=x, y=y, training_frame=X_train1, validation_frame=X_test1)

AutoML progress: |
16:11:06.756: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
16:11:06.757: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [706]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml2 = H2OAutoML(max_models=30, seed=123, stopping_metric = 'logloss')
aml2.train(x=x, y=y, training_frame=X_train1, validation_frame=X_test1)

AutoML progress: |
17:14:13.295: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
17:14:13.296: AutoML: XGBoost is not available; skipping it.

████████████████████████████████████████████████████████| 100%


In [644]:
lb = aml1.leaderboard
lb.head(rows=lb.nrows)

model_id,mean_per_class_error,logloss,rmse,mse,auc,aucpr
StackedEnsemble_BestOfFamily_AutoML_20210725_161106,0.720857,1.32953,0.700247,0.490345,,
GLM_1_AutoML_20210725_161106,0.722198,1.34569,0.703556,0.494991,,
StackedEnsemble_AllModels_AutoML_20210725_161106,0.725007,1.33054,0.700428,0.4906,,
GBM_5_AutoML_20210725_161106,0.737898,1.43327,0.731555,0.535173,,
DeepLearning_1_AutoML_20210725_161106,0.738009,1.36727,0.704285,0.496018,,
GBM_2_AutoML_20210725_161106,0.746588,1.44942,0.732399,0.536408,,
GBM_1_AutoML_20210725_161106,0.747426,1.43228,0.731828,0.535573,,
GBM_3_AutoML_20210725_161106,0.752698,1.47991,0.73144,0.535005,,
GBM_4_AutoML_20210725_161106,0.763356,1.53025,0.728904,0.531302,,
GBM_grid__1_AutoML_20210725_161106_model_1,0.764757,1.44656,0.737835,0.544401,,




In [707]:
lb1 = aml2.leaderboard
lb1.head(rows=lb1.nrows)

model_id,mean_per_class_error,logloss,rmse,mse,auc,aucpr
StackedEnsemble_BestOfFamily_AutoML_20210725_171413,0.720273,1.33071,0.70042,0.490588,,
GLM_1_AutoML_20210725_171413,0.722198,1.34569,0.703556,0.494991,,
DeepLearning_grid__1_AutoML_20210725_171413_model_3,0.725961,1.55144,0.724444,0.524819,,
GBM_5_AutoML_20210725_171413,0.730189,1.43001,0.724105,0.524328,,
DeepLearning_grid__1_AutoML_20210725_171413_model_4,0.730996,1.50805,0.723566,0.523548,,
StackedEnsemble_AllModels_AutoML_20210725_171413,0.73167,1.33498,0.701676,0.492349,,
DeepLearning_grid__1_AutoML_20210725_171413_model_1,0.735577,1.57273,0.728083,0.530104,,
DeepLearning_1_AutoML_20210725_171413,0.736505,1.37455,0.704977,0.496993,,
GBM_grid__1_AutoML_20210725_171413_model_10,0.737144,1.39216,0.724951,0.525554,,
GBM_2_AutoML_20210725_171413,0.739646,1.44774,0.724604,0.525051,,




In [708]:
# To generate predictions on a test set, you can make predictions
# directly on the `"H2OAutoML"` object or on the leader model
# object directly
#preds = aml1.predict(X_test1)
#preds
# or:
preds = aml1.leader.predict(X_test1)
preds1 = aml2.leader.predict(X_test1)

stackedensemble prediction progress: |████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [713]:
#h2o.as_list(preds['predict'])

In [710]:
mypred['pred_h20'] = h2o.as_list(preds['predict'])
mypred['pred_h201'] = h2o.as_list(preds1['predict'])

# new data frame with split value columns
new3 = mypred["pred_h20"].str.split("::", n = 1, expand = True)
new4 = mypred["pred_h201"].str.split("::", n = 1, expand = True)

mypred['h2o_pred_genetic_disorder'] = new3[0]
mypred['h2o_pred_disorder_subclass'] = new3[1]

mypred['h2o1_pred_genetic_disorder'] = new4[0]
mypred['h2o1_pred_disorder_subclass'] = new4[1]


In [711]:
from sklearn.metrics import f1_score
s1 = max(0, 100*f1_score(mypred['act_genetic_disorder'], mypred['h2o_pred_genetic_disorder'], average='macro'))
s2 = max(0, 100*f1_score(mypred['act_disorder_subclass'], mypred['h2o_pred_disorder_subclass'], average='macro'))
s1/2+s2/2

41.473637674579294

In [712]:
from sklearn.metrics import f1_score
s1 = max(0, 100*f1_score(mypred['act_genetic_disorder'], mypred['h2o1_pred_genetic_disorder'], average='macro'))
s2 = max(0, 100*f1_score(mypred['act_disorder_subclass'], mypred['h2o1_pred_disorder_subclass'], average='macro'))
s1/2+s2/2

41.42970923835445

In [726]:
#myscore.drop(columns = ['mytarget'], axis=1, inplace=True)
X_score = h2o.H2OFrame(myscore)
preds = aml1.leader.predict(X_score)

Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [736]:
myscore['pred_h20'] = h2o.as_list(preds['predict']).predict.values
#myscore['pred_h201'] = h2o.as_list(preds1['predict'])


In [737]:
myscore['pred_h20']

patient_id
PID0x4175    Multifactorial genetic inheritance disorders::...
PID0x21f5    Mitochondrial genetic inheritance disorders::M...
PID0x49b8          Single-gene inheritance diseases::Tay-Sachs
PID0x2d97    Mitochondrial genetic inheritance disorders::L...
PID0x58da    Single-gene inheritance diseases::Cystic fibrosis
                                   ...                        
PID0x81e1    Single-gene inheritance diseases::Cystic fibrosis
PID0x3514    Single-gene inheritance diseases::Cystic fibrosis
PID0x5408    Mitochondrial genetic inheritance disorders::M...
PID0x2017    Multifactorial genetic inheritance disorders::...
PID0x7f61    Multifactorial genetic inheritance disorders::...
Name: pred_h20, Length: 9465, dtype: object

In [738]:
# new data frame with split value columns
news = myscore['pred_h20'].str.split("::", n = 1, expand = True)

myscore['genetic_disorder1'] = news[0]
myscore['disorder_subclass1'] = news[1]

In [741]:
mysubmission1 = myscore[['genetic_disorder1','disorder_subclass1']].copy()
mysubmission1 = mysubmission1.reset_index()
mysubmission1.head()

Unnamed: 0,patient_id,genetic_disorder1,disorder_subclass1
0,PID0x4175,Multifactorial genetic inheritance disorders,Diabetes
1,PID0x21f5,Mitochondrial genetic inheritance disorders,Mitochondrial myopathy
2,PID0x49b8,Single-gene inheritance diseases,Tay-Sachs
3,PID0x2d97,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x58da,Single-gene inheritance diseases,Cystic fibrosis


In [742]:
Counter(mysubmission1.genetic_disorder1)

Counter({'Multifactorial genetic inheritance disorders': 1414,
         'Mitochondrial genetic inheritance disorders': 4189,
         'Single-gene inheritance diseases': 3862})

In [744]:
mysubmission1.to_csv("mysubmission1.csv", index=False)

### New Model Development

In [None]:
mytrain = master_v1[master_v1.myflag=='train'].copy()
print('mytrain',mytrain.shape )
myscore = master_v1[master_v1.myflag!='train'].copy()
print('Myscore',myscore.shape )

In [26]:
mytrain.columns

Index(['patient_id', 'mytarget', 'disorder_subclass_v1', 'genetic_disorder_v1',
       'myflag', 'patient_age', 'mothers_age', 'fathers_age',
       'no._of_previous_abortion',
       'white_blood_cell_count_thousand_per_microliter',
       'blood_cell_count_mcl', 'test_1', 'test_2', 'test_3', 'test_4',
       'test_5', 'symptom_1', 'symptom_2', 'symptom_3', 'symptom_4',
       'symptom_5', 'dummy_inherited_from_father_no',
       'dummy_inherited_from_father_yes', 'dummy_maternal_gene_no',
       'dummy_maternal_gene_yes',
       'dummy_respiratory_rate_breathsmin_normal (30_60)',
       'dummy_respiratory_rate_breathsmin_tachypnea',
       'dummy_heart_rate_ratesmin_normal',
       'dummy_heart_rate_ratesmin_tachycardia', 'dummy_gender_ambiguous',
       'dummy_gender_female', 'dummy_gender_male', 'dummy_birth_asphyxia_no',
       'dummy_birth_asphyxia_not available', 'dummy_birth_asphyxia_yes',
       'dummy_autopsy_shows_birth_defect_if_applicable_no',
       'dummy_autopsy_shows_b

In [None]:
clf = LGBMClassifier(objective              = 'multiclass',
                            #categorical_feature      = cat_cols,
                            #n_estimators             = int(params[ 'n_estimators']),
                            num_leaves               = int(params[ 'num_leaves']),
                            #max_bin = 64,
                            colsample_bytree         = params[ 'colsample_bytree'],
                            learning_rate            = params[ 'learning_rate'],
                            #early_stopping_round    =int( params[ 'early_stopping_round']),
                            #max_depth                = 30,
                            min_child_samples        = int(params[ 'min_child_samples']) ,
                            #lambda_l1                = params['lambda_l1'],
                            #lambda_l2                = params['lambda_l2'],
                            subsample                = params['subsample'],
                            bagging_freq             = int(params['bagging_freq']),
                            verbosity                = -1,
                            device                   = params[ 'device'],
                            first_metric_only        = params[ 'first_metric_only'],
                            #eval_metric              = params[ 'eval_metric'],
                            #metric                   = params[ 'metric'],
                            n_jobs                   = int(params[ 'n_jobs']),
                            #eval_names               = params[ 'eval_names'],
                            random_state             = int(params[ 'random_state']),
                            tree_learner            = params[ 'tree_learner']) 
        
clf.fit(X_train,y_train,verbose=0) 

In [None]:
aml = H2OAutoML(max_models = 10, seed = 1)
aml.train(x = X, y = y, training_frame = df)

In [None]:
#X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(X, y)

In [None]:
#X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(X, y)
# export the best model
model.export('tpot_best_model.py')

# example of tpot for a classification dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier
# define dataset
X, y = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# define model evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search
model = TPOTClassifier(generations=5, population_size=50, cv=cv, scoring='accuracy', verbosity=2, random_state=1, n_jobs=-1)
# perform the search
model.fit(X, y)
# export the best model
model.export('tpot_best_model.py')