In [1]:
import pandas as pd
import numpy as np

# 1. Load Data

### 1.1 Medical Conditions

In [15]:
df = pd.read_sas('Resources/P_MCQ.XPT', format='xport')
df.head()

Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ080,MCQ092,...,MCQ300A,MCQ366A,MCQ366B,MCQ366C,MCQ366D,MCQ371A,MCQ371B,MCQ371C,MCQ371D,OSQ230
0,109263.0,2.0,,,,,2.0,2.0,,,...,,,,,,,,,,
1,109264.0,2.0,,,,,,2.0,,2.0,...,,,,,,,,,,
2,109265.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
3,109266.0,2.0,,,,,2.0,2.0,1.0,9.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,
4,109267.0,2.0,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,


In [16]:
df.shape

(14986, 63)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14986 entries, 0 to 14985
Data columns (total 63 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SEQN     14986 non-null  float64
 1   MCQ010   14986 non-null  float64
 2   MCQ025   2322 non-null   float64
 3   MCQ035   2322 non-null   float64
 4   MCQ040   1423 non-null   float64
 5   MCQ050   1423 non-null   float64
 6   AGQ030   6615 non-null   float64
 7   MCQ053   14986 non-null  float64
 8   MCQ080   10195 non-null  float64
 9   MCQ092   13217 non-null  float64
 10  MCD093   1118 non-null   float64
 11  MCQ149   684 non-null    float64
 12  MCQ151   94 non-null     float64
 13  RHD018   88 non-null     float64
 14  MCQ160A  9232 non-null   float64
 15  MCQ195   2812 non-null   float64
 16  MCQ160B  9232 non-null   float64
 17  MCD180B  361 non-null    float64
 18  MCQ160C  9232 non-null   float64
 19  MCD180C  423 non-null    float64
 20  MCQ160D  9232 non-null   float64
 21  MCD180D  240

In [6]:
df['SEQN'].nunique()

14986

# 2. Clean Data

### 2.1 Only Keep Relevant Columns

In [7]:
df_2 = df[['SEQN', 'MCQ025', 'MCQ080', 'MCD093', 'MCD180B', 'MCD180C', 'MCD180D', 'MCD180E', 'MCD180F',
           'MCD180M', 'MCD180L', 'MCQ510A', 'MCQ510B', 'MCQ510C', 'MCQ510D', 'MCQ510E', 'MCQ510F', 'MCQ570',
           'MCQ300B', 'MCQ300C', 'MCQ300A']]
df_2 = df_2.copy()
df_2.head()

Unnamed: 0,SEQN,MCQ025,MCQ080,MCD093,MCD180B,MCD180C,MCD180D,MCD180E,MCD180F,MCD180M,...,MCQ510A,MCQ510B,MCQ510C,MCQ510D,MCQ510E,MCQ510F,MCQ570,MCQ300B,MCQ300C,MCQ300A
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,,,,,,,,,,...,,,,,,,,1.0,,
2,109265.0,,,,,,,,,,...,,,,,,,,,,
3,109266.0,,1.0,,,,,,,,...,,,,,,,,2.0,1.0,2.0
4,109267.0,,2.0,,,,,,,,...,,,,,,,,2.0,2.0,2.0


### 2.2 Rename Columns

In [8]:
df_2 = df_2.rename(columns={'SEQN':'id', 'MCQ025':'asthma_age', 'MCQ080':'ever_overweight',
                            'MCD093':'blood_transfusion_yr', 'MCD180B':'heart_failure_age',
                            'MCD180C':'age_chronic_heart_disease', 'MCD180D':'angina_pectoris_age',
                            'MCD180E':'heart_attack_age', 'MCD180F':'stroke_age', 'MCD180M':'thyroid_age',
                            'MCD180L':'liver_age', 'MCQ510A':'fatty_liver', 'MCQ510B':'liver_fibrosis',
                            'MCQ510C':'liver_cirrhosis', 'MCQ510D':'liver_viral_hepatitis',
                            'MCQ510E':'liver_autoimmune_hepatitis', 'MCQ510F':'other_liver_disease',
                            'MCQ570':'gallbladder_pr_age', 'MCQ300B':'relative_asthma',
                            'MCQ300C':'relative_diabetes', 'MCQ300A':'relative_heart_attack'})
df_2.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,fatty_liver,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,gallbladder_pr_age,relative_asthma,relative_diabetes,relative_heart_attack
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,,,,,,,,,,...,,,,,,,,1.0,,
2,109265.0,,,,,,,,,,...,,,,,,,,,,
3,109266.0,,1.0,,,,,,,,...,,,,,,,,2.0,1.0,2.0
4,109267.0,,2.0,,,,,,,,...,,,,,,,,2.0,2.0,2.0


### 2.3 Create Binary Columns, Fill in Missing Data

#### Non-Age Comparison Variables

In [9]:
df_2['ever_overweight'] = np.where(df_2['ever_overweight']==1, 1, 0)
df_2['fatty_liver'] = np.where(df_2['fatty_liver']==1, 1, 0)
df_2['liver_fibrosis'] = np.where(df_2['liver_fibrosis']==2, 1, 0)
df_2['liver_cirrhosis'] = np.where(df_2['liver_cirrhosis']==3, 1, 0)
df_2['liver_viral_hepatitis'] = np.where(df_2['liver_viral_hepatitis']==4, 1, 0)
df_2['liver_autoimmune_hepatitis'] = np.where(df_2['liver_autoimmune_hepatitis']==5, 1, 0)
df_2['other_liver_disease'] = np.where(df_2['other_liver_disease']==6, 1, 0)
df_2['relative_asthma'] = np.where(df_2['relative_asthma']==1, 1, 0)
df_2['relative_diabetes'] = np.where(df_2['relative_diabetes']==1, 1, 0)
df_2['relative_heart_attack'] = np.where(df_2['relative_heart_attack']==1, 1, 0)
df_2.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,fatty_liver,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,gallbladder_pr_age,relative_asthma,relative_diabetes,relative_heart_attack
0,109263.0,,0,,,,,,,,...,0,0,0,0,0,0,,0,0,0
1,109264.0,,0,,,,,,,,...,0,0,0,0,0,0,,1,0,0
2,109265.0,,0,,,,,,,,...,0,0,0,0,0,0,,0,0,0
3,109266.0,,1,,,,,,,,...,0,0,0,0,0,0,,0,1,0
4,109267.0,,0,,,,,,,,...,0,0,0,0,0,0,,0,0,0


#### Age Comparison Variables

In [10]:
df_2['heart_attack_age'].describe()

count      432.000000
mean       980.837963
std       9583.555375
min         16.000000
25%         47.000000
50%         56.000000
75%         65.000000
max      99999.000000
Name: heart_attack_age, dtype: float64

In [11]:
df_3 = df_2.copy()
df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14986 entries, 0 to 14985
Data columns (total 21 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          14986 non-null  float64
 1   asthma_age                  2322 non-null   float64
 2   ever_overweight             14986 non-null  int64  
 3   blood_transfusion_yr        1118 non-null   float64
 4   heart_failure_age           361 non-null    float64
 5   age_chronic_heart_disease   423 non-null    float64
 6   angina_pectoris_age         240 non-null    float64
 7   heart_attack_age            432 non-null    float64
 8   stroke_age                  487 non-null    float64
 9   thyroid_age                 1080 non-null   float64
 10  liver_age                   462 non-null    float64
 11  fatty_liver                 14986 non-null  int64  
 12  liver_fibrosis              14986 non-null  int64  
 13  liver_cirrhosis             149

In [12]:
df_3['asthma'] = np.where(df_3['asthma_age']<=df_3['heart_attack_age'], 1, 0)
df_3['heart_failure'] = np.where(df_3['heart_failure_age']<=df_3['heart_attack_age'], 1, 0)
df_3['chronic_heart_disease'] = np.where(df_3['age_chronic_heart_disease']<=df_3['heart_attack_age'], 1, 0)
df_3['angina_pectoris'] = np.where(df_3['angina_pectoris_age']<=df_3['heart_attack_age'], 1, 0)
df_3['stroke'] = np.where(df_3['stroke_age']<=df_3['heart_attack_age'], 1, 0)
df_3['thyroid'] = np.where(df_3['thyroid_age']<=df_3['heart_attack_age'], 1, 0)
df_3['liver_disease'] = np.where(df_3['liver_age']<=df_3['heart_attack_age'], 1, 0)
df_3['gallbladder_pr'] = np.where(df_3['gallbladder_pr_age']<=df_3['heart_attack_age'], 1, 0)
df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,relative_diabetes,relative_heart_attack,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,109263.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,109264.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,109265.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,109266.0,,1,,,,,,,,...,1,0,0,0,0,0,0,0,0,0
4,109267.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


#### Remove unnecessary age columns

In [13]:
df_4 = df_3.drop(['asthma_age', 'heart_failure_age', 'age_chronic_heart_disease', 'stroke_age', 'thyroid_age',
                  'liver_age', 'gallbladder_pr_age', 'angina_pectoris_age'], axis=1)
df_4.head()

Unnamed: 0,id,ever_overweight,blood_transfusion_yr,heart_attack_age,fatty_liver,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,...,relative_diabetes,relative_heart_attack,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,109263.0,0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,109264.0,0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,109265.0,0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,109266.0,1,,,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,109267.0,0,,,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df_4.describe()

Unnamed: 0,id,ever_overweight,blood_transfusion_yr,heart_attack_age,fatty_liver,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,...,relative_diabetes,relative_heart_attack,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
count,14986.0,14986.0,1118.0,432.0,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0,...,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0,14986.0
mean,117054.27092,0.257774,2.689624,980.837963,0.013613,0.000667,0.002269,0.005405,0.001335,0.007874,...,0.291405,0.079141,0.004471,0.007073,0.013012,0.005805,0.004804,0.002936,0.001601,0.003804
std,4490.906006,0.437423,1.278323,9583.555375,0.115881,0.025824,0.047579,0.073322,0.036509,0.088389,...,0.454425,0.269967,0.066717,0.083808,0.11333,0.075974,0.06915,0.054108,0.039988,0.061558
min,109263.0,0.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,113161.25,0.0,2.0,47.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,117061.5,0.0,3.0,56.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,120943.75,1.0,3.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,124822.0,1.0,9.0,99999.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
df_4.to_csv('Resources/medical_conditions_cleaned.csv', index=False)