# Data Preparation

In [48]:
import pandas as pd
import numpy as np

In [49]:
df = pd.read_csv('adult_exam_lab.csv')
df.head()

Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,HFA8R,HFF18,HAC5A3,HAC5A4,HAD1,HAD5R,...,G1P,G1PSI,G1PCODE,G1PTIM1,G1PTIM2,G2PSI,I1P,I1PSI,HSSEX2,HSAGEIR2
0,9,1,2,48,16,2,1.0,2.0,2,,...,111.3,6.178,,116.0,122.0,7.882,10.8,64.8,2,48
1,19,2,1,44,13,1,,,2,,...,108.0,5.995,,129.0,142.0,2.792,17.55,105.3,1,44
2,34,2,2,42,12,2,,,2,,...,98.9,5.49,,117.0,129.0,4.94,9.82,58.92,2,42
3,45,2,2,67,16,1,,,2,,...,,,,,,,,,2,67
4,48,1,2,56,12,2,2.0,2.0,2,,...,84.9,4.713,99.0,888.0,888.0,888888.0,7.89,47.34,2,56


In [50]:
df.shape

(7424, 74)

## Parent Diabete History

In [51]:
df["HAC5A3"].value_counts()

HAC5A3
2.0    2072
1.0    1234
8.0      37
Name: count, dtype: int64

In [52]:
df["HAC5A4"].value_counts()

HAC5A4
2.0    2645
1.0     661
8.0      37
Name: count, dtype: int64

In [4]:
#  classified as having parental history of diabetes if either their biological father or mother had diabetes
# "HAC5A3": Did mother have diabetes
# "HAC5A4": Did father have diabetes
# "parental_diabetes": 1 if either parent had diabetes, 0 otherwise
df['parental_diabetes'] = np.where((df['HAC5A3'] == 1) | (df['HAC5A4'] == 1), 1, 0)
df['parental_diabetes'].value_counts()

parental_diabetes
0    5676
1    1748
Name: count, dtype: int64

## Education level

In [53]:
df["HFA8R"].value_counts()

HFA8R
12    2161
16     509
17     475
14     474
8      471
10     445
11     405
9      381
13     353
6      309
0      276
7      225
3      178
5      177
4      172
15     155
2      140
1       75
99      25
88      18
Name: count, dtype: int64

In [5]:
# Education level was dichotomized at 12 years
# "HFA8R": Education level
# 00: Never attended or kindergarten only
# 01- 17: Highest grade or year of regular school completed
# 88: Blank but applicable
# 99: Don't know
# "education": 1 if education level is greater than 12, 0 otherwise
df['education'] = np.where(df['HFA8R'] > 12, 1, 0)
df['education'].value_counts()

education
0    5415
1    2009
Name: count, dtype: int64

## Income

In [54]:
df['HFF18'].value_counts()

HFF18
2    4009
1    3267
8      91
9      47
0      10
Name: count, dtype: int64

In [55]:
# "HFF18": annual household income
# 0: no income; 1: Less than $20,000 ; 2: $20,000 or more ; 8: Blank but applicable ; 9: Don't know
df['income'] = np.where(df['HFF18'] == 2, 1, 0)
df['income'].value_counts()

income
1    4009
0    3415
Name: count, dtype: int64

## Smokers

In [56]:
df['HAR1'].value_counts()

HAR1
1    4352
2    3072
Name: count, dtype: int64

## Activity level
A: vigorously active; B: moderately active; C: lightly active; D: sedentary

In [6]:
# define the activity level thresholds
vigorous_threshold = 6

# Define a dictionary to pair 'S' columns with their corresponding 'MET' columns
activity_pairs = {
    'HAT1S': 'HAT1MET',
    'HAT3S': 'HAT2MET',
    'HAT5S': 'HAT4MET',
    'HAT7S': 'HAT6MET',
    'HAT9S': 'HAT8MET',
    'HAT11S': 'HAT10MET',
    'HAT13S': 'HAT12MET',
    'HAT15S': 'HAT14MET',
    'HAT17S': 'HAT16MET',
    'HAT20S': 'HAT19MET',
    'HAT22S': 'HAT21MET',
    'HAT24S': 'HAT23MET',
    'HAT26S': 'HAT25MET'
}

# Initialize the count columns
df['Vigorous_Activity_Count'] = 0
df['Moderate_Activity_Count'] = 0

# Calculate the counts for vigorous and moderate activities
for s_col, met_col in activity_pairs.items():
    df.loc[df[met_col] >= vigorous_threshold, 'Vigorous_Activity_Count'] += df[s_col]
    df.loc[df[met_col] < vigorous_threshold, 'Moderate_Activity_Count'] += df[s_col]

# Define a function to determine the activity level category
def get_activity_level(row):
    if row['HSAGEIR'] >= 60 and row['Vigorous_Activity_Count'] >= 12:
        return 'A' #'vigorously active'
    elif row['HSAGEIR'] < 60 and row['Vigorous_Activity_Count'] >= 28:
        return 'A' #'vigorously active'
    elif row['Moderate_Activity_Count'] + row['Vigorous_Activity_Count'] >= 20:
        if row['Vigorous_Activity_Count'] <= 8:
            return 'B' #'moderately active'
        else:
            return 'C' #'lightly active'
    elif row['Vigorous_Activity_Count'] > 0 or row['Moderate_Activity_Count'] > 0:
        return 'C' #'lightly active'
    else:
        return 'D' #'sedentary'

# Apply the function to each row
df['Activity_Level'] = df.apply(get_activity_level, axis=1)

# Output the dataframe with the new columns
df[['Vigorous_Activity_Count', 'Moderate_Activity_Count', 'Activity_Level']].head(10)


Unnamed: 0,Vigorous_Activity_Count,Moderate_Activity_Count,Activity_Level
0,0,2,C
1,0,0,D
2,0,0,D
3,0,43,B
4,0,18,C
5,0,61,B
6,13,17,C
7,0,0,D
8,0,30,B
9,0,0,D


## Leg length

In [7]:
# leg length
# "BMPHT"-"BMPSITHT": Standing height (cm) - Sitting height (cm)
# "leg_length": Standing height - Sitting height
df['leg_length'] = df['BMPHT'] - df['BMPSITHT']
df.head()


Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,HFA8R,HFF18,HAC5A3,HAC5A4,HAD1,HAD5R,...,I1P,I1PSI,HSSEX2,HSAGEIR2,parental_diabetes,education,Vigorous_Activity_Count,Moderate_Activity_Count,Activity_Level,leg_length
0,9,1,2,48,16,2,1.0,2.0,2,,...,10.8,64.8,2,48,1,1,0,2,C,72.0
1,19,2,1,44,13,1,,,2,,...,17.55,105.3,1,44,0,1,0,0,D,84.1
2,34,2,2,42,12,2,,,2,,...,9.82,58.92,2,42,0,0,0,0,D,77.1
3,45,2,2,67,16,1,,,2,,...,,,2,67,0,1,0,43,B,
4,48,1,2,56,12,2,2.0,2.0,2,,...,7.89,47.34,2,56,0,0,0,18,C,78.8


## Leg length-to-height ratio

In [8]:
# leg length-to-height ratio
# (BMPHT-BMPSITHT)/BMPHT
df['leg_length_ratio'] = df['leg_length'] / df['BMPHT']
df.head()

Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,HFA8R,HFF18,HAC5A3,HAC5A4,HAD1,HAD5R,...,I1PSI,HSSEX2,HSAGEIR2,parental_diabetes,education,Vigorous_Activity_Count,Moderate_Activity_Count,Activity_Level,leg_length,leg_length_ratio
0,9,1,2,48,16,2,1.0,2.0,2,,...,64.8,2,48,1,1,0,2,C,72.0,0.458891
1,19,2,1,44,13,1,,,2,,...,105.3,1,44,0,1,0,0,D,84.1,0.471677
2,34,2,2,42,12,2,,,2,,...,58.92,2,42,0,0,0,0,D,77.1,0.484906
3,45,2,2,67,16,1,,,2,,...,,2,67,0,1,0,43,B,,
4,48,1,2,56,12,2,2.0,2.0,2,,...,47.34,2,56,0,0,0,18,C,78.8,0.459207


## Body fat
Body fat only has 6778 eligible subjects on paper.  
But I count for 6673.

### Step1: Exclude pregnant, had pacemaker

In [9]:
# body fat
# count FFM first
# for male
# FFM = -10.678 + 0.262 weight + 0.652 height^2 / resistance + 0.015 resistance
# for female
# FFM = -9.529 + 0.168 weight + 0.696 height^2 / resistance + 0.016 resistance
# TBF(total body fat) = weight - FFM
# %BF(percent body fat) = TBF / weight
# "BMPWT": Weight (kg); "BMPHT": Standing height (cm); "PEP12A1": BIA resistance (ohms) (12 years and over)
# "PEPPACE": Examinee had pacemaker--cannot obtain BIA; "MAPF12R": Pregnancy status; 
# 1: Yes, 2: No, 8: Blank but applicable, 9: Don't know

# Participants who were pregnant, who had cardiac pacemakers, 
# or who had previously undergone limb amputation were excluded from the measurement of biometrical impedance
new_df = df[(df['MAPF12R'] != 1) & (df['PEPPACE'] != 1)]
new_df.shape


(7410, 81)

### Step2: Exclude whom didn't eligible to take BIA test

In [10]:
# first filt "PEP12A1" (BIA resistance) is not null and is not 8888
new_df = df[(df['PEP12A1'].notnull()) & (df['PEP12A1'] != 8888)]
new_df.shape

(6673, 81)

### Calculate Percent Body Fat

In [11]:
# calculate FFM
def calculate_FFM(row):
    # female or male
    if row["HSSEX"] == 1: # 1:male
        return -10.678 + 0.262 * row['BMPWT'] + 0.652 * row['BMPHT']**2 / row['PEP12A1'] + 0.015 * row['PEP12A1']
    else: # 2: female
        return -9.529 + 0.168 * row['BMPWT'] + 0.696 * row['BMPHT']**2 / row['PEP12A1'] + 0.016 * row['PEP12A1']
new_df['FFM'] = new_df.apply(calculate_FFM, axis=1)
new_df['TBF'] = new_df['BMPWT'] - new_df['FFM']
new_df['%BF'] = new_df['TBF'] / new_df['BMPWT']
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['FFM'] = new_df.apply(calculate_FFM, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['TBF'] = new_df['BMPWT'] - new_df['FFM']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['%BF'] = new_df['TBF'] / new_df['BMPWT']


Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,HFA8R,HFF18,HAC5A3,HAC5A4,HAD1,HAD5R,...,parental_diabetes,education,Vigorous_Activity_Count,Moderate_Activity_Count,Activity_Level,leg_length,leg_length_ratio,FFM,TBF,%BF
0,9,1,2,48,16,2,1.0,2.0,2,,...,1,1,0,2,C,72.0,0.458891,43.510731,24.539269,0.360606
1,19,2,1,44,13,1,,,2,,...,0,1,0,0,D,84.1,0.471677,63.619565,22.580435,0.261954
2,34,2,2,42,12,2,,,2,,...,0,0,0,0,D,77.1,0.484906,38.118489,19.081511,0.333593
4,48,1,2,56,12,2,2.0,2.0,2,,...,0,0,0,18,C,78.8,0.459207,60.149494,48.900506,0.448423
6,52,3,1,50,12,2,2.0,2.0,2,,...,0,0,13,17,C,82.8,0.464646,60.141719,19.458281,0.244451


In [12]:
new_df.shape

(6673, 84)

### Merge body fat dataframe with original dataframe on 'SEQN', add column, "%BF"

In [13]:
# Merge body fat dataframe with original dataframe on 'SEQN', add column, "%BF"
df = pd.merge(df, new_df[['SEQN', '%BF']], on='SEQN', how='left')
df.shape

(7424, 82)

## HOMA-IR

### Exclude diabete subjects, by "HAD1"

In [14]:
# 1=Yes  2=No  8=Blank but applicable  9=Don't know      
df['HAD1'].value_counts()

HAD1
2    6661
1     754
9       9
Name: count, dtype: int64

In [15]:
df['HAD1'].isnull().sum()

0

In [16]:
# exclude diabetes
HOMA_df = df[(df['HAD1'] == 2)]
HOMA_df.shape

(6661, 82)

### Check the range of glucose and insulin for HOMA2-IR model amd filter the data

In [17]:
# Check the range 
# plasma glucose ranged from 3.0 to 25.0 mmol/l and insulin ranged from 20 to 400 pmol/l
# "G1PSI": Plasma glucose (mmol/l); "I1PSI": Serum Insulin (pmol/l)
print(HOMA_df['G1PSI'].between(3.0, 25.0).sum())
print(HOMA_df['I1PSI'].between(20, 400).sum())
print((HOMA_df['G1PSI'].between(3.0, 25.0) & HOMA_df['I1PSI'].between(20.0, 400.0)).sum())

6340
6159
6142


In [18]:
# filter the data
HOMA_df = HOMA_df[(HOMA_df['G1PSI'].between(3.0, 25.0)) & (HOMA_df['I1PSI'].between(20.0, 400.0))]
HOMA_df.shape

(6142, 82)

### Output the subjects whom we want to calculate the HOMA-IR value

In [19]:
tmp_df = HOMA_df[['SEQN', 'G1PSI', 'I1PSI']]
tmp_df.head()
tmp_df.to_csv('HOMA.csv', index=False)

### Get the result back from HOMA2-IR model

In [20]:
# read the result
HOMAIR_df = pd.read_csv('HOMA_IR.csv')
HOMAIR_df.head()

Unnamed: 0,SEQN,G1PSI,I1PSI,HOMA2 %B,HOMA2 %S,HOMA2_IR
0,9,6.178,64.8,71.4,78.6,1.272265
1,19,5.995,105.3,105.9,49.2,2.03252
2,34,5.49,58.92,84.4,88.7,1.127396
3,48,4.713,47.34,98.7,114.5,0.873362
4,51,4.896,240.0,274.8,23.5,4.255319


In [21]:
# merge the result
# merge by 'SEQN'
HOMAIR_df = pd.merge(HOMA_df, HOMAIR_df, on='SEQN')
HOMAIR_df.head()

Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,HFA8R,HFF18,HAC5A3,HAC5A4,HAD1,HAD5R,...,Moderate_Activity_Count,Activity_Level,leg_length,leg_length_ratio,%BF,G1PSI_y,I1PSI_y,HOMA2 %B,HOMA2 %S,HOMA2_IR
0,9,1,2,48,16,2,1.0,2.0,2,,...,2,C,72.0,0.458891,0.360606,6.178,64.8,71.4,78.6,1.272265
1,19,2,1,44,13,1,,,2,,...,0,D,84.1,0.471677,0.261954,5.995,105.3,105.9,49.2,2.03252
2,34,2,2,42,12,2,,,2,,...,0,D,77.1,0.484906,0.333593,5.49,58.92,84.4,88.7,1.127396
3,48,1,2,56,12,2,2.0,2.0,2,,...,18,C,78.8,0.459207,0.448423,4.713,47.34,98.7,114.5,0.873362
4,51,3,1,44,7,1,,,2,,...,61,B,85.4,0.47313,,4.896,240.0,274.8,23.5,4.255319


In [22]:
HOMAIR_df.shape

(6142, 87)

### Merge HOMAIR dataframe with original dataframe on 'SEQN', add column, "HOMA2_IR"

In [23]:
# Merge HOMAIR dataframe with original dataframe on 'SEQN', add column, "HOMA2_IR"
df = pd.merge(df, HOMAIR_df[['SEQN', 'HOMA2_IR']], on='SEQN', how='left')
df.shape

(7424, 83)

## Glucose intolerance

### Exclude Diabetes, by "HAD1"
Subjects with diabetes are 839 in paper, but I count 763.  

In [24]:
df["HAD1"].value_counts()

HAD1
2    6661
1     754
9       9
Name: count, dtype: int64

In [25]:
# drop HAD1 != 2
OGTT_df = df[(df['HAD1'] == 2)]
OGTT_df.shape

(6661, 83)

### Exclude whom didn't receive OGTT, which "G1PCODE" is null means which didn't receive OGTT
Subjects with diabetes are 705 in paper, but I count 698.  

In [26]:
# the number of subjects receive OGTT
OGTT_df["G1PCODE"].isnull().sum()

5963

In [27]:
# keep "G1PCODE" is null
OGTT_df = OGTT_df[OGTT_df["G1PCODE"].isnull()]
OGTT_df.shape

(5963, 83)

### Exclude whose time between glucose challenge ane second venipuncture is out of range [105, 135]

In [28]:
# G1PTIM1 needs to between 105 to 135 minutes
len(OGTT_df[OGTT_df["G1PTIM1"].between(105, 135)])


5812

In [29]:
OGTT_df = OGTT_df[OGTT_df["G1PTIM1"].between(105, 135)]
OGTT_df.shape

(5812, 83)

### Classification of glucose intolerance

In [30]:
OGTT_df.columns

Index(['SEQN', 'DMARETHN', 'HSSEX', 'HSAGEIR', 'HFA8R', 'HFF18', 'HAC5A3',
       'HAC5A4', 'HAD1', 'HAD5R', 'HAD6', 'HAR1', 'HAT1S', 'HAT1MET', 'HAT2',
       'HAT2MET', 'HAT3S', 'HAT4', 'HAT4MET', 'HAT5S', 'HAT6', 'HAT6MET',
       'HAT7S', 'HAT8', 'HAT8MET', 'HAT9S', 'HAT10', 'HAT10MET', 'HAT11S',
       'HAT12', 'HAT12MET', 'HAT13S', 'HAT14', 'HAT14MET', 'HAT15S', 'HAT16',
       'HAT16MET', 'HAT17S', 'HAT18', 'HAT19CD', 'HAT19MET', 'HAT20S',
       'HAT21CD', 'HAT21MET', 'HAT22S', 'HAT23CD', 'HAT23MET', 'HAT24S',
       'HAT25CD', 'HAT25MET', 'HAT26S', 'WTPFEX6', 'BMPWT', 'BMPBMI', 'BMPHT',
       'BMPSITHT', 'PEPPREG', 'PEPPACE', 'PEP12A1', 'MAPF12', 'MAPF12R',
       'HXPH2', 'HSSEX3', 'HSAGEIR3', 'G1P', 'G1PSI', 'G1PCODE', 'G1PTIM1',
       'G1PTIM2', 'G2PSI', 'I1P', 'I1PSI', 'HSSEX2', 'HSAGEIR2',
       'parental_diabetes', 'education', 'Vigorous_Activity_Count',
       'Moderate_Activity_Count', 'Activity_Level', 'leg_length',
       'leg_length_ratio', '%BF', 'HOMA2_IR'],
  

In [31]:
# “normal” for fasting plasma glucose <7.8 mmol/l and 2-h plasma glucose <7.8 mmol/l, 
# “impaired glucose tolerance” (IGT) for fasting plasma glucose <7.8 mmol/l and 2-h plasma glucose 7.8–11.1 mmol/l, and 
# “diabetes” for fasting plasma glucose ≥7.8 mmol/l or 2-h plasma glucose ≥11.1 mmol/l.
# "G1PSI": fasting plasma glucose
# "G2PSI": 2-h plasma glucose

def classify_OGTT(row):
    if row['G1PSI'] < 7.8 and row['G2PSI'] < 7.8:
        return 'N' #'normal'
    elif row['G1PSI'] < 7.8 and row['G2PSI'] >= 7.8 and row['G2PSI'] < 11.1:
        return 'I' #'IGT'
    else:
        return 'D' #'diabetes'

OGTT_df['OGTT_Class'] = OGTT_df.apply(classify_OGTT, axis=1)
OGTT_df['OGTT_Class'].value_counts()

OGTT_Class
N    3587
I    1449
D     776
Name: count, dtype: int64

### Merge OGTT dataframe with original dataframe on 'SEQN', add column, "OGTT_Class"
N: normal; I: IGT(impaired glucose tolerance); D: diabetes

In [32]:
df = pd.merge(df, OGTT_df[['SEQN', 'OGTT_Class']], on='SEQN', how='left')
df.shape

(7424, 84)

## Only keep the columns need to use

In [33]:
df.columns

Index(['SEQN', 'DMARETHN', 'HSSEX', 'HSAGEIR', 'HFA8R', 'HFF18', 'HAC5A3',
       'HAC5A4', 'HAD1', 'HAD5R', 'HAD6', 'HAR1', 'HAT1S', 'HAT1MET', 'HAT2',
       'HAT2MET', 'HAT3S', 'HAT4', 'HAT4MET', 'HAT5S', 'HAT6', 'HAT6MET',
       'HAT7S', 'HAT8', 'HAT8MET', 'HAT9S', 'HAT10', 'HAT10MET', 'HAT11S',
       'HAT12', 'HAT12MET', 'HAT13S', 'HAT14', 'HAT14MET', 'HAT15S', 'HAT16',
       'HAT16MET', 'HAT17S', 'HAT18', 'HAT19CD', 'HAT19MET', 'HAT20S',
       'HAT21CD', 'HAT21MET', 'HAT22S', 'HAT23CD', 'HAT23MET', 'HAT24S',
       'HAT25CD', 'HAT25MET', 'HAT26S', 'WTPFEX6', 'BMPWT', 'BMPBMI', 'BMPHT',
       'BMPSITHT', 'PEPPREG', 'PEPPACE', 'PEP12A1', 'MAPF12', 'MAPF12R',
       'HXPH2', 'HSSEX3', 'HSAGEIR3', 'G1P', 'G1PSI', 'G1PCODE', 'G1PTIM1',
       'G1PTIM2', 'G2PSI', 'I1P', 'I1PSI', 'HSSEX2', 'HSAGEIR2',
       'parental_diabetes', 'education', 'Vigorous_Activity_Count',
       'Moderate_Activity_Count', 'Activity_Level', 'leg_length',
       'leg_length_ratio', '%BF', 'HOMA2_IR', 'OG

In [34]:
# keep ['SEQN', 'DMARETHN', 'HSSEX', 'HSAGEIR', 'education', 'HFF18', 'parental_diabetes', 
#        'HAD1', 'HAD5R', 'HAD6', 'HAR1', 'BMPWT', 'BMPHT', 'BMPSITHT','WTPFEX6'
#       'PEPPREG', 'PEPPACE', 'PEP12A1', 'MAPF12R', 'HXPH2', 'G1PSI', 'G1PCODE', 'G1PTIM1', 'G1PTIM2', 'G2PSI',
#        'I1PSI', 'Activity_Level', 'leg_length', 'leg_length_ratio', '%BF', 'HOMA2_IR', 'OGTT_Class']
# drop other columns
tmp_df = df[['SEQN', 'DMARETHN', 'HSSEX', 'HSAGEIR', 'education', 'HFF18', 'parental_diabetes', 
        'HAD1', 'HAD5R', 'HAD6', 'HAR1', 'BMPWT', 'BMPHT', 'BMPSITHT', 'WTPFEX6',
       'PEPPREG', 'PEPPACE', 'PEP12A1', 'MAPF12R', 'HXPH2', 'G1PSI', 'G1PCODE', 'G1PTIM1', 'G1PTIM2', 'G2PSI',
        'I1PSI', 'Activity_Level', 'leg_length', 'leg_length_ratio', '%BF', 'HOMA2_IR', 'OGTT_Class']]


## Clean the data

### Height (BMPHT)
BMPHT = 88888



In [35]:
df.shape

(7424, 84)

In [36]:
# drop the columns which "BMPHT" is 88888
df = df[df['BMPHT'] != 88888]
df.shape

(7408, 84)

### Weight (BMPWT)
BMPWT = 88888


In [37]:
# drop the columns which "BMPWT" is 888888
df = df[df['BMPWT'] != 888888]
df.shape

(7398, 84)

### Sitting Height (BMPSITHT)
BMPSITHT = 888888


In [38]:
# drop the columns which "BMPSITHT" is 888888
df = df[df['BMPSITHT'] != 88888]
df.shape

(7226, 84)

In [39]:
df['BMPWT'].isnull().sum()

0

In [40]:
df['BMPSITHT'].isnull().sum()

132

In [41]:
# drop the columns which df['BMPSITHT'].isnull()
df = df[df['BMPSITHT'].notnull()]
df.shape

(7094, 84)

leg_length, leg_length_ratio don't have null value.

In [42]:
# rename columns
tmp_df.rename(columns={'HFF18': 'income', 'HAD1' :'diabetes', 'HAD5R': 'diabetes_age', 'HAR1': 'smoke', 
                   'BMPWT': 'Weight_kg', 'BMPHT':'Height', 'WTPFEX6':'Weighted', 'MAPF12R':'pregancy',
                   'HXPH2': 'menstrual_cycles', 'PEP12A1': 'BIA_resistance'}, inplace = True)
tmp_df.columns

Index(['SEQN', 'DMARETHN', 'HSSEX', 'HSAGEIR', 'education', 'income',
       'parental_diabetes', 'diabetes', 'diabetes_age', 'HAD6', 'smoke',
       'Weight_kg', 'Height', 'BMPSITHT', 'Weighted', 'PEPPREG', 'PEPPACE',
       'BIA_resistance', 'pregancy', 'menstrual_cycles', 'G1PSI', 'G1PCODE',
       'G1PTIM1', 'G1PTIM2', 'G2PSI', 'I1PSI', 'Activity_Level', 'leg_length',
       'leg_length_ratio', '%BF', 'HOMA2_IR', 'OGTT_Class'],
      dtype='object')

## Count Weight

In [43]:
# count weighted mean of ['Height',  'leg_length', 'leg_length_ratio']
# 'WXPFEX6' is the weight of each subject
# weighted mean = sum(weight * value) / sum(weight)
# 'Height', 'leg_length', 'leg_length_ratio'
# 'Height_weighted' = 'Height' * 'Weighted'
# 'leg_length_weighted' = 'leg_length' * 'Weighted'
# 'leg_length_ratio_weighted' = 'leg_length_ratio' * 'Weighted'
# 'Height_weighted_mean' = sum('Height_weighted') / sum('Weighted')
# 'leg_length_weighted_mean' = sum('leg_length_weighted') / sum('Weighted')
# 'leg_length_ratio_weighted_mean' = sum('leg_length_ratio_weighted') / sum('Weighted')
df = tmp_df
df['Height_weighted'] = df['Height'] * df['Weighted']
df['leg_length_weighted'] = df['leg_length'] * df['Weighted']
df['leg_length_ratio_weighted'] = df['leg_length_ratio'] * df['Weighted']
Height_weighted_mean = df['Height_weighted'].sum() / df['Weighted'].sum()
leg_length_weighted_mean = df['leg_length_weighted'].sum() / df['Weighted'].sum()
leg_length_ratio_weighted_mean = df['leg_length_ratio_weighted'].sum() / df['Weighted'].sum()
# weighted standard deviation
# sqrt(sum(weight * (value - mean)^2) / (((N-1)/N) * sum(weight)))
# 'Height_weighted_std' = sqrt(sum('Weighted' * ('BMPHT' - 'Height_weighted_mean')^2) / (((N-1)/N) * sum('Weighted')))
# 'leg_length_weighted_std' = sqrt(sum('Weighted' * ('leg_length' - 'leg_length_weighted_mean')^2) / (((N-1)/N) * sum('Weighted')))
# 'leg_length_ratio_weighted_std' = sqrt(sum('Weighted' * ('leg_length_ratio' - 'leg_length_ratio_weighted_mean')^2) / (((N-1)/N) * sum('Weighted')))
Height_weighted_std = np.sqrt((df['Weighted'] * (df['Height'] - Height_weighted_mean)**2).sum() / (((df.shape[0]-1)/df.shape[0]) * df['Weighted'].sum()))
leg_length_weighted_std = np.sqrt((df['Weighted'] * (df['leg_length'] - leg_length_weighted_mean)**2).sum() / (((df.shape[0]-1)/df.shape[0]) * df['Weighted'].sum()))
leg_length_ratio_weighted_std = np.sqrt((df['Weighted'] * (df['leg_length_ratio'] - leg_length_ratio_weighted_mean)**2).sum() / (((df.shape[0]-1)/df.shape[0]) * df['Weighted'].sum()))
Height_weighted_mean, Height_weighted_std, leg_length_weighted_mean, leg_length_weighted_std, leg_length_ratio_weighted_mean, leg_length_ratio_weighted_std


(206.5527441773961,
 1836.70729473813,
 -1469.3384712745285,
 11627.326056570857,
 -8.812949899026352,
 69.87530754201686)

In [44]:
# calculate z-score of 'BMPHT', 'leg_length', 'leg_length_ratio'
# 'BMPHT_zscore' = ('BMPHT' - 'Height_weighted_mean') / 'Height_weighted_std'
# 'leg_length_zscore' = ('leg_length' - 'leg_length_weighted_mean') / 'leg_length_weighted_std'
# 'leg_length_ratio_zscore' = ('leg_length_ratio' - 'leg_length_ratio_weighted_mean') / 'leg_length_ratio_weighted_std'
df['BMPHT_zscore'] = (df['Height'] - Height_weighted_mean) / Height_weighted_std
df['leg_length_zscore'] = (df['leg_length'] - leg_length_weighted_mean) / leg_length_weighted_std
df['leg_length_ratio_zscore'] = (df['leg_length_ratio'] - leg_length_ratio_weighted_mean) / leg_length_ratio_weighted_std

In [45]:
df["BF"] = df["%BF"]*100
df.head()

Unnamed: 0,SEQN,DMARETHN,HSSEX,HSAGEIR,education,income,parental_diabetes,diabetes,diabetes_age,HAD6,...,%BF,HOMA2_IR,OGTT_Class,Height_weighted,leg_length_weighted,leg_length_ratio_weighted,BMPHT_zscore,leg_length_zscore,leg_length_ratio_zscore,BF
0,9,1,2,48,1,2,1,2,,,...,0.360606,1.272265,I,3062901.384,1405537.92,8958.176673,-0.027034,0.132562,0.132691,36.060646
1,19,2,1,44,1,1,0,2,,,...,0.261954,2.03252,N,687663.874,324355.198,1819.154223,-0.015382,0.133602,0.132874,26.195401
2,34,2,2,42,0,2,0,2,,,...,0.333593,1.127396,N,802182.03,388982.607,2446.431491,-0.02589,0.133,0.133064,33.359285
3,45,2,2,67,1,1,0,2,,,...,,,,0.0,,,-0.023495,,,
4,48,1,2,56,0,2,0,2,,,...,0.448423,0.873362,,4677452.208,2147920.944,12517.021818,-0.01903,0.133147,0.132696,44.84228


In [46]:
df["menstrual_cycles"].value_counts()

menstrual_cycles
13.0    26
12.0    12
11.0    10
14.0     8
15.0     8
10.0     4
99.0     4
18.0     3
17.0     3
16.0     2
88.0     1
9.0      1
Name: count, dtype: int64

In [47]:
df.to_csv("diabete.csv", index=False)

## End