In [1]:
import pandas as pd
import numpy as np

# 1. Load Data

### 1.1 Medical Conditions

In [62]:
# Read the medical-conditions questionnaire file (SAS XPORT format) into the
# raw frame `df`, then preview its first five rows.
df = pd.read_sas(filepath_or_buffer='Resources/P_MCQ.XPT', format='xport')
df.head(n=5)

Unnamed: 0,SEQN,MCQ010,MCQ025,MCQ035,MCQ040,MCQ050,AGQ030,MCQ053,MCQ080,MCQ092,...,MCQ300A,MCQ366A,MCQ366B,MCQ366C,MCQ366D,MCQ371A,MCQ371B,MCQ371C,MCQ371D,OSQ230
0,109263.0,2.0,,,,,2.0,2.0,,,...,,,,,,,,,,
1,109264.0,2.0,,,,,,2.0,,2.0,...,,,,,,,,,,
2,109265.0,2.0,,,,,,2.0,,,...,,,,,,,,,,
3,109266.0,2.0,,,,,2.0,2.0,1.0,9.0,...,2.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,
4,109267.0,2.0,,,,,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,


In [63]:
# Dimensions of the raw table as (rows, columns).
df.shape

(14986, 63)

In [64]:
# Per-column non-null counts and dtypes of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14986 entries, 0 to 14985
Data columns (total 63 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SEQN     14986 non-null  float64
 1   MCQ010   14986 non-null  float64
 2   MCQ025   2322 non-null   float64
 3   MCQ035   2322 non-null   float64
 4   MCQ040   1423 non-null   float64
 5   MCQ050   1423 non-null   float64
 6   AGQ030   6615 non-null   float64
 7   MCQ053   14986 non-null  float64
 8   MCQ080   10195 non-null  float64
 9   MCQ092   13217 non-null  float64
 10  MCD093   1118 non-null   float64
 11  MCQ149   684 non-null    float64
 12  MCQ151   94 non-null     float64
 13  RHD018   88 non-null     float64
 14  MCQ160A  9232 non-null   float64
 15  MCQ195   2812 non-null   float64
 16  MCQ160B  9232 non-null   float64
 17  MCD180B  361 non-null    float64
 18  MCQ160C  9232 non-null   float64
 19  MCD180C  423 non-null    float64
 20  MCQ160D  9232 non-null   float64
 21  MCD180D  240

In [65]:
# Number of distinct SEQN values; compare with the row count from df.shape to
# confirm there is exactly one row per SEQN.
df['SEQN'].nunique()

14986

# 2. Clean Data

### 2.1 Only Keep Relevant Columns

In [83]:
# Keep only the questionnaire fields used in the rest of the notebook.
# .copy() makes df_2 independent of the raw frame, so the column assignments
# made later do not operate on a slice of df.
df_2 = df.loc[:, [
    'SEQN',
    'MCQ025', 'MCQ080', 'MCD093',
    'MCD180B', 'MCD180C', 'MCD180D', 'MCD180E', 'MCD180F', 'MCD180M', 'MCD180L',
    'MCQ510A', 'MCQ510B', 'MCQ510C', 'MCQ510D', 'MCQ510E', 'MCQ510F',
    'MCQ570',
    'MCQ300B', 'MCQ300C', 'MCQ300A',
    'MCQ160E',
]].copy()
df_2.head()

Unnamed: 0,SEQN,MCQ025,MCQ080,MCD093,MCD180B,MCD180C,MCD180D,MCD180E,MCD180F,MCD180M,...,MCQ510B,MCQ510C,MCQ510D,MCQ510E,MCQ510F,MCQ570,MCQ300B,MCQ300C,MCQ300A,MCQ160E
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,,,,,,,,,,...,,,,,,,1.0,,,
2,109265.0,,,,,,,,,,...,,,,,,,,,,
3,109266.0,,1.0,,,,,,,,...,,,,,,,2.0,1.0,2.0,2.0
4,109267.0,,2.0,,,,,,,,...,,,,,,,2.0,2.0,2.0,2.0


### 2.2 Rename Columns

In [84]:
# Swap the questionnaire codes for descriptive snake_case column names.
# Only the labels change; column order and values are untouched.
df_2 = df_2.rename(columns={
    # respondent identifier
    'SEQN': 'id',
    # fields that are later turned into 0/1 flags
    'MCQ080': 'ever_overweight',
    'MCQ160E': 'heart_attack',
    'MCQ510A': 'fatty_liver',
    'MCQ510B': 'liver_fibrosis',
    'MCQ510C': 'liver_cirrhosis',
    'MCQ510D': 'liver_viral_hepatitis',
    'MCQ510E': 'liver_autoimmune_hepatitis',
    'MCQ510F': 'other_liver_disease',
    'MCQ300A': 'relative_heart_attack',
    'MCQ300B': 'relative_asthma',
    'MCQ300C': 'relative_diabetes',
    # age / year fields
    'MCQ025': 'asthma_age',
    'MCD093': 'blood_transfusion_yr',
    'MCD180B': 'heart_failure_age',
    'MCD180C': 'age_chronic_heart_disease',
    'MCD180D': 'angina_pectoris_age',
    'MCD180E': 'heart_attack_age',
    'MCD180F': 'stroke_age',
    'MCD180M': 'thyroid_age',
    'MCD180L': 'liver_age',
    'MCQ570': 'gallbladder_pr_age',
})
df_2.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,gallbladder_pr_age,relative_asthma,relative_diabetes,relative_heart_attack,heart_attack
0,109263.0,,,,,,,,,,...,,,,,,,,,,
1,109264.0,,,,,,,,,,...,,,,,,,1.0,,,
2,109265.0,,,,,,,,,,...,,,,,,,,,,
3,109266.0,,1.0,,,,,,,,...,,,,,,,2.0,1.0,2.0,2.0
4,109267.0,,2.0,,,,,,,,...,,,,,,,2.0,2.0,2.0,2.0


### 2.3 Create Binary Columns, Fill in Missing Data

In [85]:
# Turn each coded answer into a 0/1 flag. A column becomes 1 only where it holds
# the code listed next to it; every other value, including missing, becomes 0.
df_2 = df_2.assign(**{
    flag_column: np.where(df_2[flag_column] == yes_code, 1, 0)
    for flag_column, yes_code in {
        'ever_overweight': 1,
        'fatty_liver': 1,
        'liver_fibrosis': 2,
        'liver_cirrhosis': 3,
        'liver_viral_hepatitis': 4,
        'liver_autoimmune_hepatitis': 5,
        'other_liver_disease': 6,
        'relative_asthma': 1,
        'relative_diabetes': 1,
        'relative_heart_attack': 1,
        'heart_attack': 1,
    }.items()
})
df_2.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,gallbladder_pr_age,relative_asthma,relative_diabetes,relative_heart_attack,heart_attack
0,109263.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0
1,109264.0,,0,,,,,,,,...,0,0,0,0,0,,1,0,0,0
2,109265.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0
3,109266.0,,1,,,,,,,,...,0,0,0,0,0,,0,1,0,0
4,109267.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0


### 2.4 Clean Age Variables

In [86]:
# Summary statistics for heart_attack_age. Check min/max for values that cannot
# be real ages (presumably non-response codes — confirm against the codebook)
# before the range filter below.
df_2['heart_attack_age'].describe()

count      432.000000
mean       980.837963
std       9583.555375
min         16.000000
25%         47.000000
50%         56.000000
75%         65.000000
max      99999.000000
Name: heart_attack_age, dtype: float64

In [87]:
# 122 is the highest recorded human age; source:
# https://www.cnbc.com/2023/02/21/longevity-expert-3-reasons-the-worlds-oldest-person-lived-to-122.html
#
# Keep a row only when every age column is either missing or within [0, 122].
# A row with any out-of-range age is dropped entirely, not set to missing.
# NOTE(review): this removes the whole respondent for one bad age value —
# confirm that is preferred over blanking just that value.
df_3 = df_2.loc[
    df_2[['heart_attack_age', 'asthma_age', 'heart_failure_age',
          'age_chronic_heart_disease', 'angina_pectoris_age', 'stroke_age',
          'thyroid_age', 'liver_age', 'gallbladder_pr_age']]
    .apply(lambda ages: ages.isna() | ages.between(0.0, 122.0))
    .all(axis=1)
]
df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,liver_fibrosis,liver_cirrhosis,liver_viral_hepatitis,liver_autoimmune_hepatitis,other_liver_disease,gallbladder_pr_age,relative_asthma,relative_diabetes,relative_heart_attack,heart_attack
0,109263.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0
1,109264.0,,0,,,,,,,,...,0,0,0,0,0,,1,0,0,0
2,109265.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0
3,109266.0,,1,,,,,,,,...,0,0,0,0,0,,0,1,0,0
4,109267.0,,0,,,,,,,,...,0,0,0,0,0,,0,0,0,0


In [88]:
# Take an explicit copy of the filtered frame before new columns are added to it
# (presumably to avoid pandas' SettingWithCopyWarning on the .loc result —
# confirm), then list the remaining row count and per-column non-null counts.
df_3 = df_3.copy()
df_3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14890 entries, 0 to 14985
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          14890 non-null  float64
 1   asthma_age                  2278 non-null   float64
 2   ever_overweight             14890 non-null  int64  
 3   blood_transfusion_yr        1097 non-null   float64
 4   heart_failure_age           346 non-null    float64
 5   age_chronic_heart_disease   405 non-null    float64
 6   angina_pectoris_age         227 non-null    float64
 7   heart_attack_age            416 non-null    float64
 8   stroke_age                  467 non-null    float64
 9   thyroid_age                 1050 non-null   float64
 10  liver_age                   446 non-null    float64
 11  fatty_liver                 14890 non-null  int64  
 12  liver_fibrosis              14890 non-null  int64  
 13  liver_cirrhosis             14890 no

### 2.5 Calculate Difference in Years between Heart Attack and Comorbidities

In [89]:
# For each condition, add a *_yrs column holding heart_attack_age minus the age
# recorded for that condition. The result is NaN when either age is missing and
# negative when the condition age is greater than heart_attack_age.
df_3 = df_3.assign(**{
    yrs_column: df_3['heart_attack_age'] - df_3[age_column]
    for yrs_column, age_column in {
        'asthma_yrs': 'asthma_age',
        'heart_failure_yrs': 'heart_failure_age',
        'chronic_heart_disease_yrs': 'age_chronic_heart_disease',
        'angina_pectoris_yrs': 'angina_pectoris_age',
        'stroke_yrs': 'stroke_age',
        'thyroid_yrs': 'thyroid_age',
        'liver_disease_yrs': 'liver_age',
        'gallbladder_pr_yrs': 'gallbladder_pr_age',
    }.items()
})
df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,relative_heart_attack,heart_attack,asthma_yrs,heart_failure_yrs,chronic_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,thyroid_yrs,liver_disease_yrs,gallbladder_pr_yrs
0,109263.0,,0,,,,,,,,...,0,0,,,,,,,,
1,109264.0,,0,,,,,,,,...,0,0,,,,,,,,
2,109265.0,,0,,,,,,,,...,0,0,,,,,,,,
3,109266.0,,1,,,,,,,,...,0,0,,,,,,,,
4,109267.0,,0,,,,,,,,...,0,0,,,,,,,,


In [90]:
# Summary statistics for every numeric column, including the *_yrs differences;
# a negative *_yrs value means that condition's age is greater than
# heart_attack_age.
df_3.describe()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,relative_heart_attack,heart_attack,asthma_yrs,heart_failure_yrs,chronic_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,thyroid_yrs,liver_disease_yrs,gallbladder_pr_yrs
count,14890.0,2278.0,14890.0,1097.0,346.0,405.0,227.0,416.0,467.0,1050.0,...,14890.0,14890.0,80.0,140.0,216.0,103.0,104.0,76.0,33.0,77.0
mean,117053.653459,15.599649,0.25722,2.65907,56.965318,57.501235,51.770925,55.545673,56.184154,45.574286,...,0.078509,0.027938,18.05,-1.278571,0.226852,2.029126,-1.451923,1.5,6.242424,9.571429
std,4492.588816,18.712381,0.437116,1.198574,15.67714,12.824525,15.287233,13.79044,15.963892,17.695139,...,0.26898,0.164801,23.679773,7.329911,6.549235,9.802417,13.010735,17.340127,16.894952,16.149327
min,109263.0,1.0,0.0,1.0,16.0,16.0,16.0,16.0,16.0,16.0,...,0.0,0.0,-40.0,-35.0,-32.0,-36.0,-61.0,-36.0,-24.0,-21.0
25%,113161.25,3.0,0.0,2.0,46.25,50.0,42.0,47.0,46.0,31.0,...,0.0,0.0,2.75,-1.0,0.0,0.0,-4.0,-7.0,-3.0,-1.0
50%,117058.5,7.0,0.0,3.0,59.0,59.0,53.0,56.0,56.0,45.0,...,0.0,0.0,17.5,0.0,0.0,0.0,0.0,0.0,5.0,7.0
75%,120944.75,21.0,1.0,3.0,69.0,67.0,62.0,65.0,68.0,60.0,...,0.0,0.0,37.0,0.0,0.0,0.0,1.0,8.25,14.0,21.0
max,124822.0,80.0,1.0,9.0,80.0,80.0,80.0,80.0,80.0,80.0,...,1.0,1.0,79.0,25.0,33.0,35.0,33.0,50.0,53.0,51.0


### 2.6 Clean Age and Year Difference Columns

In [91]:
# The year-difference columns that the next cell zero-fills.
yr_columns_to_check = [
    'asthma_yrs',
    'heart_failure_yrs',
    'chronic_heart_disease_yrs',
    'angina_pectoris_yrs',
    'stroke_yrs',
    'thyroid_yrs',
    'liver_disease_yrs',
    'gallbladder_pr_yrs',
]

In [92]:
# Replace missing and negative year differences with 0; non-negative values are
# kept as they are.
for column in yr_columns_to_check:
    df_3[column] = df_3[column].clip(lower=0).fillna(0)

In [93]:
# Age columns cleaned by the next cell. 'heart_attack_age' is deliberately first:
# the cleaning loop processes the columns in this order.
age_columns_to_check = [
    'heart_attack_age',
    'asthma_age',
    'heart_failure_age',
    'age_chronic_heart_disease',
    'angina_pectoris_age',
    'stroke_age',
    'thyroid_age',
    'liver_age',
    'gallbladder_pr_age',
]

In [94]:
# Keep an age only where it is present and not greater than heart_attack_age;
# everything else becomes 0.
#
# Order matters: 'heart_attack_age' is the first column processed, so its missing
# values are already 0 when the remaining columns are compared against it. As a
# result, every positive age on a row without a heart-attack age is zeroed too.
# NOTE(review): confirm that wiping the comorbidity ages of respondents with no
# heart attack is intended and not a side effect of the column order.
for column in age_columns_to_check:
    df_3[column] = df_3[column].where(df_3[column] <= df_3['heart_attack_age'], 0)

In [95]:
# Preview the frame after zero-filling the *_yrs and *_age columns.
df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,relative_heart_attack,heart_attack,asthma_yrs,heart_failure_yrs,chronic_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,thyroid_yrs,liver_disease_yrs,gallbladder_pr_yrs
0,109263.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,109265.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,0.0,1,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,109267.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.7 Add Column for Closest Comorbidity Age

In [96]:
# Closest comorbidity age: the largest of the comorbidity age columns on each row.
#
# 'heart_attack_age' is excluded from the max. The cleaning step above zeroes any
# comorbidity age greater than heart_attack_age, so including it would make
# max_age an exact copy of heart_attack_age on every row. With it excluded,
# max_age is the latest comorbidity age at or before the heart attack, and 0 when
# the row has none.
df_3['max_age'] = (
    df_3[age_columns_to_check]
    .drop(columns='heart_attack_age')
    .max(axis=1)
)
df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,heart_attack,asthma_yrs,heart_failure_yrs,chronic_heart_disease_yrs,angina_pectoris_yrs,stroke_yrs,thyroid_yrs,liver_disease_yrs,gallbladder_pr_yrs,max_age
0,109263.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,109264.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,109265.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,109266.0,0.0,1,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,109267.0,0.0,0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# Final frame: everything in df_3 except the blood-transfusion column.
df_4 = df_3.drop(columns=['blood_transfusion_yr'])

In [98]:
# Write the cleaned table to CSV; index=False leaves the DataFrame index out of
# the file.
df_4.to_csv('Resources/medical_conditions_cleaned.csv', index=False)

#### IGNORE — the cell below is commented out; its displayed output is stale and not part of the pipeline

In [35]:
# df_3['asthma'] = np.where(df_3['asthma_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['heart_failure'] = np.where(df_3['heart_failure_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['chronic_heart_disease'] = np.where(df_3['age_chronic_heart_disease']<=df_3['heart_attack_age'], 1, 0)
# df_3['angina_pectoris'] = np.where(df_3['angina_pectoris_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['stroke'] = np.where(df_3['stroke_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['thyroid'] = np.where(df_3['thyroid_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['liver_disease'] = np.where(df_3['liver_age']<=df_3['heart_attack_age'], 1, 0)
# df_3['gallbladder_pr'] = np.where(df_3['gallbladder_pr_age']<=df_3['heart_attack_age'], 1, 0)
# df_3.head()

Unnamed: 0,id,asthma_age,ever_overweight,blood_transfusion_yr,heart_failure_age,age_chronic_heart_disease,angina_pectoris_age,heart_attack_age,stroke_age,thyroid_age,...,relative_heart_attack,heart_attack,asthma,heart_failure,chronic_heart_disease,angina_pectoris,stroke,thyroid,liver_disease,gallbladder_pr
0,109263.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,109264.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,109265.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,109266.0,,1,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,109267.0,,0,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
