In [69]:
import pandas as pd
import nhanes_loader
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
import importlib
import nhanes.regressions
importlib.reload(nhanes.regressions)
from nhanes.regressions import generate_demo_group, do_regression

In [2]:
# Load `adult_final`, which is later handed to generate_demo_group; the '.gz'
# suffix lets pandas infer gzip compression.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if
# it comes from a trusted source.
adult_final = pd.read_pickle('data/adult_final.pkl.gz')

In [3]:
# Load `youth_final`, which is later handed to generate_demo_group; the '.gz'
# suffix lets pandas infer gzip compression.
# NOTE(review): unpickling can execute arbitrary code -- only load this file if
# it comes from a trusted source.
youth_final = pd.read_pickle('data/youth_final.pkl.gz')

Some of the spirometry variables of interest are not ratios.

In [34]:
# Spirometry measures that are plain quantities (not ratios of two measures).
varsnonratio = [
    'EXPIR',
    'FEF75',
    'FEV05',
    'FEV1',
    'FEV3',
    'FEV6',
    'FVC',
    'MMEF',
    'PEFR',
]

Some of the variables of interest are ratios.

In [33]:
# Spirometry measures that are ratios; names appear to follow the pattern
# <numerator>O<denominator> (e.g. FEV1OFVC) -- confirm against the data dictionary.
varsratio = [
    'FEF75OFVC',
    'FEF75OPEFR',
    'FEV05OFEV3',
    'FEV05OFVC',
    'FEV1OFEV3',
    'FEV1OFEV6',
    'FEV1OFVC',
    'FEV3OFEV6',
    'FEV3OFVC',
    'FEV6OFVC',
    'MMEFOFVC',
    'MMEFOPEFR',
    'PEFROFEV1',
    'PEFROFEV6',
    'PEFROFVC',
]

We want to run linear regressions for these ratio variables against age for various demographic groups.

24 demographic combinations:
* male vs female (2)
* black vs non-black vs white-only vs mexican-only (4)
* all ages vs over age cut-off (20 for men and 18 for women) vs under age cut-off (3)

Similarly, we run linear regressions for the non-ratio variables against height, age, and age^2.

In [67]:
def get_regression_results(regression_type, variables):
    """Run one regression per variable for every demographic combination.

    Iterates over 2 sexes x 4 ethnicity groups x 3 age ranges (24 groups).
    For each group the dataframe is built with ``generate_demo_group`` from the
    notebook-level ``adult_final`` and ``youth_final`` frames, and
    ``do_regression`` is then called once per entry in ``variables``.

    Parameters
    ----------
    regression_type : str
        Forwarded unchanged to ``do_regression``. This notebook calls it with
        ``'age'`` and ``'age_height'``.
    variables : iterable of str
        Column names of the demographic-group dataframe to regress.

    Returns
    -------
    list of dict
        One record per (sex, ethnicity, age_range, var) with the keys
        ``sex``, ``ethnicity``, ``age_range``, ``var``, ``intercept``,
        ``slope``, ``r_squared``, ``std`` and ``count``. ``slope`` is the first
        element of the coefficients returned by ``do_regression``. When more
        than one coefficient is returned (multi-predictor regressions), the
        remaining ones are stored as ``coef_1``, ``coef_2``, ... in the order
        ``do_regression`` returned them, so they are no longer dropped.
        NOTE(review): which predictor each position corresponds to is decided
        inside nhanes.regressions.do_regression -- confirm there.
    """
    regression_results = []

    for sex in ('male', 'female'):
        for ethnicity in ('black', 'non-black', 'white', 'mexican'):
            for age_range in ('all', 'under', 'over'):
                # generate demographic group dataframe
                demo_group = generate_demo_group(adult_final, youth_final, ethnicity, sex, age_range)

                # do linear regression for variables
                for var in variables:
                    intercept, slope, r_squared = do_regression(regression_type, demo_group, var)
                    observed = demo_group.loc[:, var]
                    record = {
                        'sex': sex,
                        'ethnicity': ethnicity,
                        'age_range': age_range,
                        'var': var,
                        'intercept': intercept,
                        # kept exactly as before so existing consumers of 'slope' still work
                        'slope': slope[0],
                        'r_squared': r_squared,
                        'std': observed.std(),
                        'count': observed.count(),
                    }
                    # Previously only slope[0] was recorded, which discarded every
                    # coefficient after the first for multi-predictor regressions.
                    for position, coefficient in enumerate(np.ravel(slope)[1:], start=1):
                        record['coef_%d' % position] = coefficient
                    regression_results.append(record)
    return regression_results

In [71]:
# Regress every ratio variable for each demographic group, using the 'age' mode.
ratio_var_regression_results = get_regression_results(
    regression_type='age',
    variables=varsratio,
)

In [72]:
# Persist the ratio-variable regression records as CSV (the row index is written too).
# NOTE(review): the file name spells "regresssions" with three s's; it is kept
# byte-for-byte because other code may read this exact path.
(
    pd.DataFrame(ratio_var_regression_results)
    .to_csv('data/ratio_var_regresssions.csv')
)

In [73]:
# Regress every non-ratio variable for each demographic group, using the 'age_height' mode.
nonratio_var_regression_results = get_regression_results(
    regression_type='age_height',
    variables=varsnonratio,
)

In [74]:
# Persist the non-ratio regression records as CSV (the row index is written too).
# NOTE(review): the file name spells "regresssions" with three s's; it is kept
# byte-for-byte because other code may read this exact path.
(
    pd.DataFrame(nonratio_var_regression_results)
    .to_csv('data/nonratio_var_regresssions.csv')
)