<a href="https://colab.research.google.com/github/gsilver321/project_chd/blob/main/chd_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import pandas as pd

In [17]:
# Load the raw train and test splits as DataFrames directly from the
# DS3001/project_chd GitHub repository (requires network access).
# NOTE(review): "fhs" presumably stands for Framingham Heart Study -- confirm.
df_train = pd.read_csv("https://github.com/DS3001/project_chd/raw/main/fhs_train.csv")
df_test = pd.read_csv("https://github.com/DS3001/project_chd/raw/main/fhs_test.csv")

In [18]:
def winsorize(x):
    """Clip outliers in ``x`` to the box-plot (1.5 * IQR) whiskers.

    Values below ``Q1 - 1.5*IQR`` are replaced by the lower whisker and values
    above ``Q3 + 1.5*IQR`` by the upper whisker. Missing values (NaN) are
    ignored when computing the quartiles and are left as NaN in the result.

    Parameters
    ----------
    x : array-like of numbers
        A pandas Series, numpy array, or list. It is not modified.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``np.asarray(x)``.
        For integer input the whiskers are rounded inward (ceil for the lower,
        floor for the upper) so clipped values stay inside the fences instead
        of being truncated toward zero by the integer cast.
    """
    # np.asarray unwraps a Series (or list) to an ndarray; comparing type(x)
    # to a string, as the previous version did, never matched anything.
    values = np.asarray(x)
    x_winsor = np.copy(values)
    if values.size == 0:
        # nothing to clip, and percentiles of an empty array are undefined
        return x_winsor
    # compute IQR and 25, 75 quantiles; the nan-aware variant is required
    # because np.percentile returns NaN as soon as one value is missing,
    # which would make both masks below all-False (i.e. a silent no-op):
    pct25, pct75 = np.nanpercentile(values, [25, 75])
    iqr = pct75 - pct25
    # compute whiskers:
    lower_whisker = pct25 - iqr*1.5
    upper_whisker = pct75 + iqr*1.5
    if np.issubdtype(x_winsor.dtype, np.integer):
        # keep the integer dtype but never write a value outside the fences
        lower_fill, upper_fill = np.ceil(lower_whisker), np.floor(upper_whisker)
    else:
        lower_fill, upper_fill = lower_whisker, upper_whisker
    # winsorize x (comparisons with NaN are False, so NaNs are untouched):
    x_winsor[values < lower_whisker] = lower_fill
    x_winsor[values > upper_whisker] = upper_fill
    return x_winsor

# Clip (not drop) extreme values in the listed columns. Each split is
# winsorized independently, using whiskers computed from its own column.
for frame in (df_train, df_test):
    for col in ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']:
        frame[col] = winsorize(frame[col])

In [19]:
# Report how many values are missing in each column before cleaning.
print('----------- Pre-cleaning | train data null counts')
for v, n_missing in df_train.isnull().sum().items():
    print(v, n_missing)

print('\n', df_train.head(), '\n')

# Remove the leftover CSV index column. errors="ignore" keeps this cell
# re-runnable: on a second run the column is already gone and a plain drop
# would raise KeyError.
df_train = df_train.drop(columns="Unnamed: 0", errors="ignore")

# Impute missing values column by column with that column's median
# (computed ignoring NaNs). The median is used because it is less sensitive
# to extreme values than the mean.
for v in df_train.columns:
    df_train[v] = df_train[v].fillna(np.nanmedian(df_train[v]))

# Re-count missing values to confirm the imputation.
print('----------- Post-cleaning | train data null counts')
for v, n_missing in df_train.isnull().sum().items():
    print(v, n_missing)

print('\n', df_train.head())

----------- Pre-cleaning | train data null counts
Unnamed: 0 0
sex 0
age 0
education 85
currentSmoker 0
cigsPerDay 24
BPMeds 37
prevalentStroke 0
prevalentHyp 0
diabetes 0
totChol 39
sysBP 0
diaBP 0
BMI 15
heartRate 0
glucose 285
TenYearCHD 0

    Unnamed: 0  sex  age  education  currentSmoker  cigsPerDay  BPMeds  \
0        1267    1   58        1.0              0         0.0     0.0   
1        1209    0   40        1.0              1        15.0     0.0   
2        2050    0   52        1.0              0         0.0     0.0   
3        1183    1   38        2.0              1        43.0     0.0   
4        3225    0   43        1.0              0         0.0     0.0   

   prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                0             0         0    220.0  143.0  104.0  29.85   
1                0             0         0    199.0  122.0   82.0  22.16   
2                0             0         0    275.0  112.0   71.0  25.68   
3           

In [20]:
# Report how many values are missing in each column before cleaning.
print('----------- Pre-cleaning | test data null counts')
for v, n_missing in df_test.isnull().sum().items():
    print(v, n_missing)

print('\n', df_test.head(), '\n')

# Remove the leftover CSV index column. errors="ignore" keeps this cell
# re-runnable: on a second run the column is already gone and a plain drop
# would raise KeyError.
df_test = df_test.drop(columns="Unnamed: 0", errors="ignore")

# Impute missing values column by column with that column's median
# (computed ignoring NaNs). The median is used because it is less sensitive
# to extreme values than the mean.
# NOTE(review): the fill values come from the test split itself; if the
# downstream model should only see statistics learned on the training data,
# impute with the train medians instead -- confirm the intended protocol.
for v in df_test.columns:
    df_test[v] = df_test[v].fillna(np.nanmedian(df_test[v]))

# Re-count missing values to confirm the imputation.
print('----------- Post-cleaning | test data null counts')
for v, n_missing in df_test.isnull().sum().items():
    print(v, n_missing)

print('\n', df_test.head())

----------- Pre-cleaning | test data null counts
Unnamed: 0 0
sex 0
age 0
education 20
currentSmoker 0
cigsPerDay 5
BPMeds 16
prevalentStroke 0
prevalentHyp 0
diabetes 0
totChol 11
sysBP 0
diaBP 0
BMI 4
heartRate 1
glucose 103
TenYearCHD 0

    Unnamed: 0  sex  age  education  currentSmoker  cigsPerDay  BPMeds  \
0         674    0   58        1.0              1        20.0     0.0   
1        4070    0   51        3.0              0         0.0     0.0   
2        3150    0   44        2.0              1         9.0     0.0   
3        1695    0   40        2.0              1        20.0     0.0   
4        2692    1   58        2.0              1        20.0     0.0   

   prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                0             0         0      NaN  126.0   77.0  30.08   
1                0             0         0    264.0  135.0   83.0  26.68   
2                0             1         0      NaN  147.5   96.0  30.57   
3              

In [21]:
# Write each cleaned split to a CSV in the working directory, without the
# DataFrame index so no extra unnamed column appears when the file is re-read.
for frame, out_path in ((df_train, 'fhs_train_clean.csv'), (df_test, 'fhs_test_clean.csv')):
    frame.to_csv(out_path, index=False)