# NHANES EDA - Data Exploration and Summary Stats

The purpose of this notebook is to visually explore the aggregated data from the CSV file created in step 1, as well as to determine summary statistics.

## Import Packages

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Load Data

In [18]:
# Read the aggregated NHANES table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/nhanes.csv')

## Table 1

In [19]:
# Mapping pregnancy and postpartum status correctly.
# Conditions are evaluated in order, so postpartum takes precedence over
# pregnancy; rows where neither flag equals 1 (including missing values) fall
# through to 'Not-Pregnant'. Vectorised with np.select instead of a row-wise
# DataFrame.apply, which is slow and misbehaves on an empty frame.
data['custom_pregnancy_group'] = np.select(
    [data['postpartum'] == 1, data['pregnancy-status'] == 1],
    ['Postpartum', 'Pregnant'],
    default='Not-Pregnant',
)

# Determine Low Income status based on income-to-poverty ratio.
# NOTE(review): 1.85 is presumably the 185%-of-poverty-level cutoff -- confirm.
LOW_INCOME_RATIO_CUTOFF = 1.85
income_ratio = data['income-to-poverty-ratio']
# A plain `<` comparison turns a missing ratio into False, which would silently
# classify participants with unknown income as "not low income". Keep them as
# <NA> (nullable boolean) so groupby drops them instead of miscounting them.
data['Low_Income'] = (
    (income_ratio < LOW_INCOME_RATIO_CUTOFF)
    .astype('boolean')
    .mask(income_ratio.isna())
)

# Function to calculate descriptive statistics
def calculate_statistics(df, group, condition, value_col='age'):
    """Summarise a numeric column for every (group, condition) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``group``, ``condition`` and ``value_col``.
    group : str
        Name of the first grouping column.
    condition : str
        Name of the second grouping column.
    value_col : str, default 'age'
        Numeric column to summarise. The output column keeps the name
        ``Mean_Age`` regardless, for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        One row per observed (group, condition) pair with columns
        ``[group, condition, 'Count', 'Mean_Age', '% Standard Error']``.
        ``Count`` is the number of non-null values, and ``% Standard Error``
        is the relative standard error ``(std / sqrt(count)) / mean * 100``
        using the sample standard deviation. It is NaN for groups with
        fewer than two non-null values.
    """
    # Single groupby pass: every statistic is indexed by the same group keys,
    # so the derived column is aligned by key rather than by row position.
    per_group = df.groupby([group, condition])[value_col].agg(
        Count='count',
        Mean_Age='mean',
        Std='std',
    )

    relative_se = (
        per_group['Std'] / np.sqrt(per_group['Count']) / per_group['Mean_Age'] * 100
    )

    stats = per_group[['Count', 'Mean_Age']].copy()
    stats['% Standard Error'] = relative_se

    return stats.reset_index()

# Table 1 breakdowns: one summary per stratifying column (race-ethnicity,
# low-income flag, education level), each split by custom_pregnancy_group.
stats_race_ethnicity, stats_low_income, stats_education_level = (
    calculate_statistics(data, 'custom_pregnancy_group', stratifier)
    for stratifier in ('race-ethnicity', 'Low_Income', 'edu-level')
)

# Displaying the results, each table preceded by its heading
for heading, table in (
    ("By Race-Ethnicity:\n", stats_race_ethnicity),
    ("\nBy Low Income:\n", stats_low_income),
    ("\nBy Education Level:\n", stats_education_level),
):
    print(heading, table)


By Race-Ethnicity:
    custom_pregnancy_group  race-ethnicity  Count   Mean_Age  % Standard Error
0            Not-Pregnant             1.0   1010  32.405941          0.713657
1            Not-Pregnant             2.0    532  32.667293          0.960058
2            Not-Pregnant             3.0   1948  32.574949          0.507926
3            Not-Pregnant             4.0   1129  32.607617          0.659764
4            Not-Pregnant             5.0    593  32.892074          0.904159
5              Postpartum             1.0     86  29.209302          2.373077
6              Postpartum             2.0     50  27.400000          2.965894
7              Postpartum             3.0    100  28.630000          1.898010
8              Postpartum             4.0     75  27.026667          2.185283
9              Postpartum             5.0     25  29.480000          2.524195
10               Pregnant             1.0    161  27.217391          1.499615
11               Pregnant             2.0   

In [None]:
## create it in R instead 
# https://cran.r-project.org/web/packages/tableone/tableone.pdf

### class imbalance - can either address it, and/or look at depression scores on a continuous scale


## Explore Data

### Demographics

In [None]:
# Age distribution across all participants.
# Reads from `data`, the frame loaded from data/nhanes.csv; the name `nhanes`
# used previously is never defined in this notebook and raised NameError on a
# fresh kernel.
fig, ax = plt.subplots(figsize=(10, 6))
# Drop missing ages so the automatic bin range is always finite.
ax.hist(data['age'].dropna(), bins=10, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Ages in NHANES Dataset')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.75)
plt.show()

In [None]:
# average age of pregnant vs not pregnant
# Uses `data` (the loaded frame; `nhanes` is undefined in this notebook) and
# actually computes the mean age per pregnancy-status code instead of dumping
# the raw column. Elsewhere in this notebook a code of 1 is treated as pregnant.
# Rows with a missing pregnancy-status are excluded by groupby's default.
data.groupby('pregnancy-status')['age'].mean()

In [None]:
# Pregnancy and age: keep only rows whose pregnancy-status equals 1.
# Filters `data`, the frame loaded from the CSV; `nhanes` is never defined in
# this notebook and raised NameError on a fresh kernel.
pregnant_participants = data[data['pregnancy-status'] == 1]

In [None]:
# Histogram of ages restricted to the pregnant subset built in the previous cell
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(pregnant_participants['age'], bins=15, color='salmon', edgecolor='black')
ax.set_title('Distribution of Ages for Pregnant Participants in NHANES Dataset')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(axis='y', alpha=0.75)
plt.show()


In [None]:
# Class balance of the outcome variable. Reads from `data`, the loaded frame;
# `nhanes` is never defined in this notebook and raised NameError.
data['depression'].value_counts()

In [None]:
# There is a class imbalance in the outcome variable 