In [62]:
import pandas as pd
import nhanes_loader
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from nhanes.regressions import generate_demo_group, do_age_regression

In [2]:
# Load the pickled adult data; pandas infers gzip compression from the '.gz' suffix.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
adult_final = pd.read_pickle('data/adult_final.pkl.gz')

In [3]:
# Load the pickled youth data; pandas infers gzip compression from the '.gz' suffix.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
youth_final = pd.read_pickle('data/youth_final.pkl.gz')

Some of the spirometry variables of interest are not ratios.

In [34]:
# Spirometry variables of interest that are plain measurements rather than ratios.
# NOTE(review): presumably these are column labels in the adult/youth data -- confirm.
varsnonratio = [
    'EXPIR',
    'FEF75',
    'FEV05',
    'FEV1',
    'FEV3',
    'FEV6',
    'FVC',
    'MMEF',
    'PEFR',
]

Some of the variables of interest are ratios.

In [33]:
# Ratio variables of interest; each name is used as a column label when
# the per-group regressions are run.
varsratio = [
    'FEF75OFVC',
    'FEF75OPEFR',
    'FEV05OFEV3',
    'FEV05OFVC',
    'FEV1OFEV3',
    'FEV1OFEV6',
    'FEV1OFVC',
    'FEV3OFEV6',
    'FEV3OFVC',
    'FEV6OFVC',
    'MMEFOFVC',
    'MMEFOPEFR',
    'PEFROFEV1',
    'PEFROFEV6',
    'PEFROFVC',
]

We can load demo_groups from data/demo_groups.csv if need be. 

24 combinations:
* male vs female (2)
* black vs non-black vs white-only vs mexican-only (4)
* all ages vs over age cut-off (20 for men and 18 for women) vs under age cut-off (3)

In [63]:
# One summary record is appended per (sex, ethnicity, age_range, ratio_var).
# The list is reset here so re-running the cell does not duplicate rows.
regression_results = []

for sex in ('male', 'female'):
    for ethnicity in ('black', 'non-black', 'white', 'mexican'):
        for age_range in ('all', 'under', 'over'):
            # Build the dataframe for this demographic slice once; every
            # ratio variable below is evaluated against the same slice.
            demo_group = generate_demo_group(
                adult_final, youth_final, ethnicity, sex, age_range
            )

            for ratio_var in varsratio:
                # Linear regression of the ratio against age. The slope comes
                # back as an indexable of coefficients; only the first is kept.
                intercept, coefficients, r_squared = do_age_regression(demo_group, ratio_var)
                slope = coefficients[0]

                # Spread and number of non-null observations of the ratio
                # within this demographic slice.
                ratio_values = demo_group.loc[:, ratio_var]
                std = ratio_values.std()
                count = ratio_values.count()

                regression_results.append(dict(
                    sex=sex,
                    ethnicity=ethnicity,
                    age_range=age_range,
                    ratio_var=ratio_var,
                    intercept=intercept,
                    slope=slope,
                    r_squared=r_squared,
                    std=std,
                    count=count,
                ))

In [64]:
# Gather the per-group regression records into a single table (one row per
# record) and bind it to a name, so it can be reused without relying on the
# kernel's Out[...] cache. The bare name on the last line keeps the rich display.
regression_summary = pd.DataFrame(regression_results)
regression_summary

Unnamed: 0,sex,ethnicity,age_range,ratio_var,intercept,slope,r_squared,std,count
0,male,black,all,FEF75OFVC,49.956255,-0.476841,0.159870,18.045439,1077
1,male,black,all,FEF75OPEFR,24.082129,-0.232074,0.187692,8.105487,1077
2,male,black,all,FEV05OFEV3,70.393382,-0.025488,0.002871,7.197914,1077
3,male,black,all,FEV05OFVC,70.383868,-0.122010,0.053420,7.987646,1077
4,male,black,all,FEV1OFEV3,89.495076,-0.071238,0.041230,5.308651,1077
...,...,...,...,...,...,...,...,...,...
355,female,mexican,over,MMEFOFVC,1.220272,-0.007217,0.156591,0.297469,1005
356,female,mexican,over,MMEFOPEFR,65.992047,-0.418650,0.249134,13.681124,1005
357,female,mexican,over,PEFROFEV1,1.974454,0.007986,0.108032,0.396322,1005
358,female,mexican,over,PEFROFEV6,1.813283,0.003258,0.021238,0.364672,1005
