### ***Utilities***

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the
# preprocessed CSV files are written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gc
# Force a garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

141

## ***Import and Options:***

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import math

In [None]:
# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

## ***Reading Dataset:***

In [None]:
# Load the 2018 US births CSV from the mounted Drive in a single pass
# (low_memory=False) and preview the first rows.
US_BIRTHS_CSV = '/content/drive/MyDrive/FDM/dataset/US_births(2018).csv'
df = pd.read_csv(US_BIRTHS_CSV, low_memory=False)
df.head()

Unnamed: 0,ATTEND,BFACIL,BMI,CIG_0,DBWT,DLMP_MM,DLMP_YY,DMAR,DOB_MM,DOB_TT,DOB_WK,DOB_YY,DWgt_R,FAGECOMB,FEDUC,FHISPX,FRACE15,FRACE31,FRACE6,ILLB_R,ILOP_R,ILP_R,IMP_SEX,IP_GON,LD_INDL,MAGER,MAGE_IMPFLG,MAR_IMP,MBSTATE_REC,MEDUC,MHISPX,MM_AICU,MRACE15,MRACE31,MRACEIMP,MRAVE6,MTRAN,M_Ht_In,NO_INFEC,NO_MMORB,NO_RISKS,PAY,PAY_REC,PRECARE,PREVIS,PRIORDEAD,PRIORLIVE,PRIORTERM,PWgt_R,RDMETH_REC,RESTATUS,RF_CESAR,RF_CESARN,SEX,WTGAIN
0,1,1,30.7,0,3657,4,2017,1,1,1227,2,2018,231,31,3,1,1,1,1,16,33,16,,N,N,30,,,1,6,0,N,1,1,,1,N,66,1,1,1,2,2,3,8,0,1,2,190,1,2,N,0,M,41
1,1,1,33.3,2,3242,99,9999,2,1,1704,2,2018,185,35,4,0,3,3,3,180,888,180,,N,N,35,,,1,9,0,N,3,3,,3,N,63,1,1,0,1,1,3,9,0,2,0,188,4,2,Y,2,F,0
2,1,1,30.0,0,3470,4,2017,1,1,336,2,2018,273,31,4,0,1,1,1,999,888,999,,N,N,28,,,1,6,0,N,1,1,,1,N,71,1,1,0,5,4,5,17,0,1,0,215,1,1,N,0,M,58
3,3,1,23.7,0,3140,5,2017,2,1,938,2,2018,138,26,2,0,3,3,3,43,888,43,,N,N,23,,,1,2,0,N,3,3,,3,N,64,1,1,1,1,1,5,6,0,2,0,138,1,2,N,0,F,0
4,1,1,35.5,0,2125,99,9999,1,1,830,3,2018,219,35,3,0,2,2,2,999,999,999,,N,N,37,,,1,4,0,N,1,1,,1,N,66,1,1,1,1,1,5,15,0,1,4,220,3,1,N,0,M,0


# ***Preprocessing***

In [None]:
# Fix the misspelled column name: remove 'MRAVE6' and re-add its values as
# 'MRACE6' (the column ends up last in the frame).
df['MRACE6'] = df.pop('MRAVE6')

## ***Dropping null columns***

In [None]:
# These columns are treated as empty ("null columns") and removed.
null_columns = ['IMP_SEX', 'MAGE_IMPFLG', 'MAR_IMP', 'MRACEIMP']
df = df.drop(null_columns, axis='columns')

## ***Dropping unnecessary columns based on meaning***

### ***ATTEND: Attendant at Birth like Doctor of Medicine or Midwife***

In [None]:
# Remove the attendant-at-birth column.
df = df.drop('ATTEND', axis='columns')

### ***BFACIL: Birth Place like Hospital***

In [None]:
# Remove the birth-place column.
df = df.drop('BFACIL', axis='columns')

### ***DOB_TT: Time Of Birth***

In [None]:
# Remove the time-of-birth column.
df = df.drop('DOB_TT', axis='columns')

### ***DOB_WK: Birth Day of Week***

In [None]:
# Remove the birth day-of-week column.
df = df.drop('DOB_WK', axis='columns')

### ***DOB_YY: Birth Year, Always 2018***

In [None]:
# Remove the birth-year column (constant: every record is from 2018).
df = df.drop('DOB_YY', axis='columns')

### ***DWgt_R: Mother Delivery Weight***

In [None]:
# Remove the mother's delivery-weight column.
df = df.drop('DWgt_R', axis='columns')

### ***RDMETH_REC: Delivery Method***

In [None]:
# Remove the delivery-method column.
df = df.drop('RDMETH_REC', axis='columns')

## ***Working on null values***

### ***BMI***

In [None]:
# Rows whose BMI is the 99.9 sentinel although both pre-pregnancy weight
# (PWgt_R != 999) and height (M_Ht_In != 99) are recorded.
df[df['BMI'].eq(99.9) & df['PWgt_R'].ne(999) & df['M_Ht_In'].ne(99)]

Unnamed: 0,BMI,CIG_0,DBWT,DLMP_MM,DLMP_YY,DMAR,DOB_MM,FAGECOMB,FEDUC,FHISPX,FRACE15,FRACE31,FRACE6,ILLB_R,ILOP_R,ILP_R,IP_GON,LD_INDL,MAGER,MBSTATE_REC,MEDUC,MHISPX,MM_AICU,MRACE15,MRACE31,MTRAN,M_Ht_In,NO_INFEC,NO_MMORB,NO_RISKS,PAY,PAY_REC,PRECARE,PREVIS,PRIORDEAD,PRIORLIVE,PRIORTERM,PWgt_R,RESTATUS,RF_CESAR,RF_CESARN,SEX,WTGAIN,MRACE6
380,99.9,0,3629,99,9999,1,1,39,3,0,1,1,1,37,888,37,N,N,28,1,2,0,N,1,1,N,61,1,1,1,1,1,2,8,0,2,0,75,1,N,0,M,38,1
12582,99.9,0,3033,4,2017,2,1,28,2,0,2,2,2,84,888,84,N,N,28,1,2,0,N,2,2,N,55,1,1,0,1,1,2,15,0,3,0,340,1,Y,3,M,3,2
16287,99.9,10,2523,5,2017,2,2,24,3,0,2,2,2,189,49,49,N,N,33,1,4,0,N,2,2,N,64,0,1,0,1,1,2,11,0,1,2,75,2,Y,1,M,41,2
25599,99.9,30,2409,4,2017,2,4,18,1,0,1,1,1,888,46,46,N,N,21,1,3,0,N,1,1,N,70,0,1,1,1,1,2,6,0,0,2,90,2,N,0,F,38,1
34769,99.9,0,2636,10,2017,2,6,30,3,0,2,2,2,75,888,75,N,N,28,1,4,0,N,2,2,N,60,1,1,0,2,2,4,13,0,1,0,375,1,Y,1,F,19,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3765541,99.9,0,4167,12,2017,1,9,39,6,0,2,2,2,63,888,63,N,N,38,1,4,0,N,2,2,N,63,1,1,1,2,2,2,11,0,1,0,375,2,N,0,M,50,2
3768792,99.9,0,3121,12,2017,2,9,99,9,9,99,99,9,888,888,888,N,N,23,1,3,0,N,2,2,N,38,0,1,1,1,1,7,3,0,0,0,210,1,N,0,F,49,2
3776407,99.9,0,3420,1,2018,1,10,38,6,0,2,2,2,38,888,38,N,N,28,2,3,0,N,2,2,N,67,1,1,1,1,1,3,9,0,3,0,76,1,N,0,M,92,2
3779188,99.9,0,3202,1,2018,1,10,39,7,0,1,1,1,20,888,20,N,N,35,2,8,0,N,1,1,N,78,1,1,1,1,1,2,9,0,2,0,114,1,N,0,M,38,1


In [None]:
def bmi_func(x):
  """Return the BMI for one record, recomputing it when it is recoverable.

  x is a mapping-like row with 'BMI', 'PWgt_R' and 'M_Ht_In' entries.
  When BMI holds the 99.9 sentinel but weight (not 999) and height (not 99)
  are both present, the result is 703 * weight / height**2 rounded to one
  decimal; otherwise the stored BMI is returned unchanged.
  """
  # Nothing to recompute: BMI is already set, or an input is itself a sentinel.
  if (x['BMI'] != 99.9) or (x['PWgt_R'] == 999) or (x['M_Ht_In'] == 99):
    return x['BMI']

  height_squared = math.pow(x['M_Ht_In'], 2)
  return round((x['PWgt_R'] / height_squared) * 703, 1)

# Run bmi_func once per row and overwrite the BMI column with the results.
df['BMI'] = df.apply(bmi_func, axis='columns')

In [None]:
# Same filter as before the fix: rows with sentinel BMI but known weight and height.
df[df['BMI'].eq(99.9) & df['PWgt_R'].ne(999) & df['M_Ht_In'].ne(99)]

Unnamed: 0,BMI,CIG_0,DBWT,DLMP_MM,DLMP_YY,DMAR,DOB_MM,FAGECOMB,FEDUC,FHISPX,FRACE15,FRACE31,FRACE6,ILLB_R,ILOP_R,ILP_R,IP_GON,LD_INDL,MAGER,MBSTATE_REC,MEDUC,MHISPX,MM_AICU,MRACE15,MRACE31,MTRAN,M_Ht_In,NO_INFEC,NO_MMORB,NO_RISKS,PAY,PAY_REC,PRECARE,PREVIS,PRIORDEAD,PRIORLIVE,PRIORTERM,PWgt_R,RESTATUS,RF_CESAR,RF_CESARN,SEX,WTGAIN,MRACE6


### ***Dropping rows with null values for DLMP_MM & DLMP_YY***

In [None]:
# Discard rows whose last-menstrual-period month (99) or year (9999) is the
# sentinel value.
df.drop(index=df.index[df['DLMP_MM'] == 99], inplace=True)
df.drop(index=df.index[df['DLMP_YY'] == 9999], inplace=True)

## ***Replacing null values in categorical columns with median or a constant***

In [None]:
# Categorical columns grouped by the marker that each imputer below treats as missing.
# Missing value is 9.
categorical_columns_09 = [
    'FEDUC', 'FHISPX', 'FRACE6', 'MEDUC', 'MHISPX',
    'NO_INFEC', 'NO_MMORB', 'NO_RISKS', 'PAY', 'PAY_REC',
]
# Missing value is 99.
categorical_columns_99 = ['FRACE15', 'FRACE31']
# Missing value is a single blank character.
categorical_columns_empty = ['DMAR']
# Missing value is "U".
categorical_columns_u = ['IP_GON', 'LD_INDL', 'MM_AICU', 'MTRAN', 'RF_CESAR']
# Missing value is 3.
categorical_columns_3 = ['MBSTATE_REC']

In [None]:
# Treat 9 as missing in these columns and replace it with each column's median.
imputer = SimpleImputer(strategy='median', missing_values=9)
imputer.fit(df[categorical_columns_09])
df[categorical_columns_09] = imputer.transform(df[categorical_columns_09])

In [None]:
# Treat 99 as missing in these columns and replace it with each column's median.
imputer = SimpleImputer(strategy='median', missing_values=99)
imputer.fit(df[categorical_columns_99])
df[categorical_columns_99] = imputer.transform(df[categorical_columns_99])

In [None]:
# Treat a single blank character as missing and fill it with the constant "1".
imputer = SimpleImputer(strategy='constant', fill_value="1", missing_values=" ")
imputer.fit(df[categorical_columns_empty])
df[categorical_columns_empty] = imputer.transform(df[categorical_columns_empty])

In [None]:
# Treat "U" as missing in these columns and fill it with the constant "N".
imputer = SimpleImputer(strategy='constant', fill_value="N", missing_values="U")
imputer.fit(df[categorical_columns_u])
df[categorical_columns_u] = imputer.transform(df[categorical_columns_u])

In [None]:
# Treat 3 as missing in MBSTATE_REC and replace it with the column median.
imputer = SimpleImputer(strategy='median', missing_values=3)
imputer.fit(df[categorical_columns_3])
df[categorical_columns_3] = imputer.transform(df[categorical_columns_3])

## ***Replacing null values in numerical columns with mean***

In [None]:
# Numerical columns grouped by the sentinel named in each list's suffix.
# Sentinel 99.9.
numerical_columns_99_9 = ['BMI']
# Sentinel 99.
numerical_columns_99 = [
    'CIG_0', 'FAGECOMB', 'M_Ht_In', 'PRECARE', 'PREVIS',
    'PRIORDEAD', 'PRIORLIVE', 'PRIORTERM', 'RF_CESARN', 'WTGAIN',
]
# Sentinel 999.
numerical_columns_999 = ['ILLB_R', 'ILOP_R', 'ILP_R', 'PWgt_R']

In [None]:
# Treat the remaining 99.9 BMI values as missing and replace them with the column mean.
imputer = SimpleImputer(strategy='mean', missing_values=99.9)
imputer.fit(df[numerical_columns_99_9])
df[numerical_columns_99_9] = imputer.transform(df[numerical_columns_99_9])

In [None]:
# Treat 99 as missing in these columns and replace it with each column's mean.
imputer = SimpleImputer(strategy='mean', missing_values=99)
imputer.fit(df[numerical_columns_99])
df[numerical_columns_99] = imputer.transform(df[numerical_columns_99])

In [None]:
# The sentinel for this column group is 999 (the BMI step also treats
# PWgt_R == 999 as "weight not recorded"). The previous missing_values=99 left
# every 999 in place and overwrote genuine values of 99 with the mean.
# NOTE(review): 888 appears to be a separate code in the interval columns (it is
# tested later to derive the first-birth flags) and is still included in the
# mean computed here; confirm whether it should be excluded.
imputer = SimpleImputer(missing_values=999, strategy='mean')
df[numerical_columns_999] = imputer.fit_transform(df[numerical_columns_999])

## ***New Features***

### ***Smoked***

In [None]:
# 'N' when CIG_0 is exactly 0, 'Y' for any other value.
# NOTE(review): CIG_0 has already had its 99 sentinel replaced by the column mean
# in the numerical imputation step, so formerly-unknown rows are probably
# non-zero and end up labelled 'Y'; confirm this is intended.
df['smoked'] = np.where(df['CIG_0'] == 0, 'N', 'Y')

### ***Pregnancy Length***

In [None]:
# Pregnancy length in whole calendar months, from the last-menstrual-period
# month/year (DLMP_MM, DLMP_YY) to the birth month (DOB_MM). Every birth in this
# dataset is in 2018, so the birth year is a constant.
#
# The closed-form month difference replaces the previous np.select, whose second
# and third branches were the same expression and whose default of 12 was also
# given to rows it did not cover (same month in 2018, LMP month after the birth
# month in 2018, same month two or more years back). Those rows now get their
# real difference (0, negative, 24, ...) and are removed by the filter below
# instead of being kept as 12-month pregnancies.
BIRTH_YEAR = 2018
df['pregnancy_length'] = (BIRTH_YEAR - df['DLMP_YY']) * 12 + df['DOB_MM'] - df['DLMP_MM']

# Keep only lengths of 5 to 12 months inclusive.
out_of_range = df['pregnancy_length'].gt(12) | df['pregnancy_length'].lt(5)
df.drop(df[out_of_range].index, inplace=True)

### ***First Live Birth***

In [None]:
# "Y" when ILP_R carries the 888 code, "N" otherwise.
# NOTE(review): ILP_R looks like the interval since the last pregnancy and ILLB_R
# the interval since the last live birth (sample rows show PRIORLIVE == 0 with
# ILLB_R == 888 but ILP_R != 888). If so, this flag marks first pregnancies, and
# the source columns of first_live_birth / first_birth may be swapped; confirm
# against the data dictionary.
df['first_live_birth'] = np.where(df['ILP_R'] == 888, "Y", "N")

### ***First Birth***

In [None]:
# "Y" when ILLB_R carries the 888 code, "N" otherwise.
# NOTE(review): ILLB_R looks like the interval since the last live birth, so
# 888 here presumably means "no previous live birth"; confirm this is the
# intended meaning of first_birth (as opposed to first_live_birth, which is
# derived from ILP_R).
df['first_birth'] = np.where(df['ILLB_R'] == 888, "Y", "N")

### ***Had PriorDead***

In [None]:
# 'N' when PRIORDEAD is exactly 0, 'Y' for any other value.
# NOTE(review): PRIORDEAD has already had its 99 sentinel replaced by the column
# mean in the numerical imputation step, so formerly-unknown rows are probably
# non-zero and end up labelled 'Y'; confirm this is intended.
df['had_priordead'] = np.where(df['PRIORDEAD'] == 0, 'N', 'Y')

## ***Target variable***

### ***Option 1: Dropping rows with null value***

In [None]:
# Option 1: save a copy without the rows whose birth weight is the 9999 sentinel.
# The rows are filtered into a temporary frame instead of being dropped from df
# in place. Dropping in place removed every 9999 before the "replace with mean"
# option ran, so on a top-to-bottom run that option had nothing to impute and
# wrote a file identical to this one.
df.loc[df['DBWT'].ne(9999)].to_csv('/content/drive/MyDrive/FDM/Preprocessed/df_droped.csv', index=False)

### ***Option 2: Replacing with mean***

In [None]:
# Treat 9999 as missing in the target column DBWT and replace it with the column mean.
imputer = SimpleImputer(strategy='mean', missing_values=9999)
imputer.fit(df[['DBWT']])
df['DBWT'] = imputer.transform(df[['DBWT']])

In [None]:
# Write the preprocessed frame to Drive without the index column.
df.to_csv(path_or_buf='/content/drive/MyDrive/FDM/Preprocessed/df.csv', index=False)