In [1]:
import pandas as pd

In [2]:
### custom functions
def modify_race_col(row) -> int:
    '''
    logic to make a compariable filed to HISPALLP_C in the post 2019 datasets
    '''
    HISPAN_I = row['HISPAN_I']
    RACERPI2 = row['RACERPI2']
    
    if HISPAN_I != 12:
        return 1

    match RACERPI2:
        case 1:
            return 2
        case 2:
            return 3
        case 3:
            return 5
        case 5:
            return 99
        case 6:
            return 7
        case _:
            return 0

### the convert functions switch the numeric values to text
def convert_region_to_txt(row) -> str:
    '''
    Translate the numeric REGION code of a row into its text label.

    Codes 1-4 become 'Northeast', 'Midwest', 'South' and 'West'.
    Any other value (including a missing one) yields None.
    '''
    region_labels = {
        1: 'Northeast',
        2: 'Midwest',
        3: 'South',
        4: 'West',
    }
    return region_labels.get(row['REGION'])
            
def convert_sex_c_to_txt(row) -> str:
    '''
    Translate the numeric SEX_C code of a row into its text label.

    1 -> 'Male', 2 -> 'Female', 7 -> 'Refused', 8 -> 'Not Ascertained'.
    Every other value (including a missing one) yields "Don't Know".
    '''
    sex_labels = {
        1: 'Male',
        2: 'Female',
        7: 'Refused',
        8: 'Not Ascertained',
    }
    return sex_labels.get(row['SEX_C'], "Don't Know")

def convert_question_fields(row, arg1) -> str:
    '''
    Translate the numeric answer code stored in column ``arg1`` of a row
    into its text label.

    1 -> 'Yes', 2 -> 'No', 7 -> 'Refused', 8 -> 'Not Ascertained',
    9 -> "Don't Know". Every other value, including a missing one, is
    also reported as "Don't Know".
    '''
    answer_labels = {
        1: 'Yes',
        2: 'No',
        7: 'Refused',
        8: 'Not Ascertained',
        9: 'Don\'t Know',
    }
    # unknown and missing codes share the label used for code 9
    return answer_labels.get(row[arg1], 'Don\'t Know')

def convert_HISPALLP_C_to_txt(row) -> str:
    '''
    Translate the numeric HISPALLP_C code of a row into its text label.

    Codes 1-7 and 97-99 have labels; any other value (including a
    missing one) yields None.
    '''
    hispallp_labels = {
        1: 'Hispanic',
        2: 'Non-Hispanic White only',
        3: 'Non-Hispanic Black/African American only',
        4: 'Non-Hispanic Asian only',
        5: 'Non-Hispanic AIAN only',
        6: 'Non-Hispanic AIAN and any other group',
        7: 'Other single and multiple races',
        97: 'Refused',
        98: 'Not Ascertained',
        99: "Don't Know",
    }
    return hispallp_labels.get(row['HISPALLP_C'])


def create_data_file_layout_dict(layout_path: str = 'data/file_layouts.txt') -> dict:
    '''
    Read the text file that holds the field layouts for the .dat files.

    Each non-blank line must be a comma separated list of integers. The
    first value becomes the dictionary key and the remaining values are
    stored, in order, as the list under that key.

    Parameters
    ----------
    layout_path : str
        Location of the layout file. Defaults to the path the notebook
        has always used.

    Returns
    -------
    dict
        Maps int -> list of int, one entry per non-blank line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line contains a value that is not an integer.
    '''
    layout_dict = dict()

    with open(layout_path) as f:
        for line in f:
            clean_line = line.replace('\t', '').strip()
            # Blank lines (for example a trailing empty line) used to
            # crash on int(''); they carry no layout, so skip them.
            if not clean_line:
                continue
            key, *values = clean_line.split(',')
            layout_dict[int(key)] = [int(x) for x in values]

    return layout_dict


def create_dat_file_vars() -> dict:
    '''
    Build the dictionary that drives processing of the .dat files.

    Returns a dict keyed by survey year (2014 down to 2005). Each value
    is a dict with:
      'file_nm'       - path of that year's .dat file
      'file_layout'   - the layout list for that year, taken from
                        create_data_file_layout_dict()
      'select_fields' - column labels to keep after the file is read
      'rename_fiedls' - mapping from those column labels to field names
                        (the key spelling is what the loader reads)

    Raises KeyError if the layout file has no entry for one of the years.
    '''
    # field layout for each file
    layouts = create_data_file_layout_dict()

    # fields selected for every year, and their names
    shared_fields = [1, 2, 5, 6, 8, 9, 12, 13, 14, 17]
    shared_names = {
        1: 'SRVY_YR',
        2: 'HHX',
        5: 'FMX',
        6: 'FPX',
        8: 'WTFA_SC',
        9: 'REGION',
        12: 'SEX',
        13: 'HISPAN_I',
        14: 'RACERPI2',
        17: 'AGE_P',
    }

    # (survey years, extra fields for those years -> names); the years
    # are listed newest first so the result keeps that order
    eras = (
        ((2014,),
         {24: 'CWGHT_TC', 25: 'CHGHT_TC', 26: 'BMI_SC', 29: 'ADD2', 31: 'AUTISM'}),
        ((2013, 2012, 2011),
         {24: 'CWGHT_TC', 25: 'CHGHT_TC', 26: 'BMI_SC', 29: 'ADD2', 37: 'AUTISM'}),
        ((2010, 2009, 2008),
         {23: 'CWGHT_TC', 24: 'CHGHT_TC', 25: 'BMI_SC', 28: 'ADD2', 36: 'AUTISM'}),
        ((2007, 2006, 2005),
         {25: 'ADD2', 33: 'AUTISM'}),
    )

    # final dict used to process the .dat files
    dat_files = {}
    for years, era_names in eras:
        for year in years:
            dat_files[year] = {
                'file_nm': f'data/child{year % 100:02d}.dat',
                'file_layout': layouts[year],
                'select_fields': shared_fields + list(era_names),
                'rename_fiedls': shared_names | era_names,
            }

    return dat_files

In [3]:
## file lists and column renames used for processing

# 2019-2023 csv files, newest first
post_19_files = (
    'data/child23.csv',
    'data/child22.csv',
    'data/child21.csv',
    'data/child20.csv',
    'data/child19.csv',
)

# 2015-2018 csv files, newest first
pre_19_csv = (
    'data/child18.csv',
    'data/child17.csv',
    'data/child16.csv',
    'data/child15.csv',
)

# pre-2019 column name -> column name used in the post-2019 files
pre_19_renames = {
    # weight, sex, age
    'WTFA_SC': 'WTFA_C',
    'SEX': 'SEX_C',
    'AGE_P': 'AGEP_C',
    # ADD2 / ADD2N
    'ADD2': 'ADHDEV_C',
    'ADD2N': 'ADHDNW_C',
    # body measurements
    'CWGHT_TC': 'WEIGHTLBTC_C',
    'CHGHT_TC': 'HEIGHTTC_C',
    'BMI_SC': 'BMICAT_C',
    # AUTISM / AUTISMN
    'AUTISM': 'ASDEV_C',
    'AUTISMN': 'ASDNW_C',
}

# per-year settings for the 2005-2014 .dat files
dat_files_2005_2014 = create_dat_file_vars()


In [4]:
### load 2005 to 2014 data
# Read every fixed-width file first and concatenate once at the end.
# Growing a frame with pd.concat inside the loop copies all earlier rows
# on each pass, and seeding it with an empty DataFrame relies on
# deprecated empty-entry handling in recent pandas.
dat_frames = []

for spec in dat_files_2005_2014.values():
    df = pd.read_fwf(spec['file_nm'], widths=spec['file_layout'], header=None)
    # keep only the selected columns, then give them their field names
    dat_frames.append(
        df[spec['select_fields']].rename(columns=spec['rename_fiedls'])
    )

dat_df = pd.concat(dat_frames).set_index(['HHX','SRVY_YR','FMX','FPX'])
print(dat_df.head())
print(len(dat_df))

                     WTFA_SC  REGION  SEX  HISPAN_I  RACERPI2  AGE_P  \
HHX SRVY_YR FMX FPX                                                    
13  2014    1   3       6039       4    1         3         1      0   
20  2014    1   4       3145       4    2        12         1      2   
25  2014    1   4       5497       2    2        12         1     16   
29  2014    1   5        592       2    2        12         1      7   
34  2014    1   2       6553       1    2        12         2     12   

                     CWGHT_TC  CHGHT_TC  BMI_SC  ADD2  AUTISM  
HHX SRVY_YR FMX FPX                                            
13  2014    1   3         NaN       NaN     NaN   NaN     NaN  
20  2014    1   4         NaN       NaN     NaN   2.0     2.0  
25  2014    1   4        66.0     135.0  2179.0   2.0     2.0  
29  2014    1   5         NaN       NaN     NaN   2.0     2.0  
34  2014    1   2        64.0     100.0  1717.0   2.0     2.0  
115390


In [5]:
### load 2015 to 2018
# columns requested from every 2015-2018 file
cols_2015_2018 = ['SRVY_YR','HHX','FMX','FPX','WTFA_SC','REGION','SEX',
                  'AGE_P','CHGHT_TC','CWGHT_TC','BMI_SC','HISPAN_I',
                  'RACERPI2','ADD2','AUTISM'
                  ]
# requested from every file except data/child15.csv
# NOTE(review): presumably these columns are absent from the 2015 file - confirm
cols_after_2015 = ['ADD2N','AUTISMN']

# Collect the frames and concatenate once; concatenating inside the loop
# re-copies all earlier rows on every pass.
frames_2015_2018 = []

for file in pre_19_csv:
    if file == 'data/child15.csv':
        cols = cols_2015_2018
    else:
        cols = cols_2015_2018 + cols_after_2015

    frames_2015_2018.append(
        pd.read_csv(file,
                    sep=',',
                    header=0,
                    index_col=['HHX','SRVY_YR','FMX','FPX'],
                    usecols=cols
                    )
    )

df_2015_2018 = pd.concat(frames_2015_2018)

## print the length
print(len(df_2015_2018))
print(df_2015_2018.head())



40512
                     SEX  HISPAN_I  RACERPI2  AGE_P  REGION  WTFA_SC  ADD2  \
HHX SRVY_YR FMX FPX                                                          
4   2018    1   3      1        12         1     16       2     6055   2.0   
6   2018    1   3      1        12         1      9       3    11581   2.0   
8   2018    1   3      1        12         1      3       2     3558   2.0   
10  2018    1   4      1        12         1      0       1     4447   NaN   
13  2018    1   2      1        12         2     10       3     4567   2.0   

                     AUTISM  ADD2N  AUTISMN  CHGHT_TC  CWGHT_TC  BMI_SC  
HHX SRVY_YR FMX FPX                                                      
4   2018    1   3       2.0    NaN      NaN      96.0     996.0  2687.0  
6   2018    1   3       2.0    NaN      NaN       NaN       NaN     NaN  
8   2018    1   3       2.0    NaN      NaN       NaN       NaN     NaN  
10  2018    1   4       NaN    NaN      NaN       NaN       NaN     NaN  
13 

In [6]:
### merge the 2015 to 2018 and 2005 to 2014 dataframes and start the clean up
df_2005_2018 = (
    pd.concat([dat_df, df_2015_2018])
    .reset_index()
    .set_index(['HHX','SRVY_YR','FMX','FPX'])
)

# show any index keys that occur more than once, then the row count
duplicates = df_2005_2018.index.duplicated()
print(df_2005_2018.index[duplicates])
print(len(df_2005_2018))

MultiIndex([], names=['HHX', 'SRVY_YR', 'FMX', 'FPX'])
155902


In [7]:
## create HISPALLP_C from HISPAN_I / RACERPI2, one row at a time
hispallp_codes = df_2005_2018.apply(modify_race_col, axis=1)
print(hispallp_codes.isnull().sum())

## add the new column, drop 'HISPAN_I' and 'RACERPI2' as they are no
## longer needed, and rename the columns so the frames can be combined
df_2005_2018 = (
    df_2005_2018
    .assign(HISPALLP_C=hispallp_codes)
    .drop(columns=['HISPAN_I','RACERPI2'])
    .rename(columns=pre_19_renames)
)
print(df_2005_2018.head())
print(len(df_2005_2018))

0
                     WTFA_C  REGION  SEX_C  AGEP_C  WEIGHTLBTC_C  HEIGHTTC_C  \
HHX SRVY_YR FMX FPX                                                            
13  2014    1   3      6039       4      1       0           NaN         NaN   
20  2014    1   4      3145       4      2       2           NaN         NaN   
25  2014    1   4      5497       2      2      16          66.0       135.0   
29  2014    1   5       592       2      2       7           NaN         NaN   
34  2014    1   2      6553       1      2      12          64.0       100.0   

                     BMICAT_C  ADHDEV_C  ASDEV_C  ADHDNW_C  ASDNW_C  \
HHX SRVY_YR FMX FPX                                                   
13  2014    1   3         NaN       NaN      NaN       NaN      NaN   
20  2014    1   4         NaN       2.0      2.0       NaN      NaN   
25  2014    1   4      2179.0       2.0      2.0       NaN      NaN   
29  2014    1   5         NaN       2.0      2.0       NaN      NaN   
34  2014   

In [8]:
## create the final df with the post 2019 files
# columns requested from every post-2019 file
post_19_cols = ['HHX','SRVY_YR','WTFA_C','REGION',
                'SEX_C','AGEP_C','HISPALLP_C',
                'ADHDEV_C','ADHDNW_C','ASDEV_C','ASDNW_C'
                ]
# extra columns requested only from the 2022 and 2020 files
# NOTE(review): presumably the other years do not carry them - confirm
post_19_body_cols = ['HEIGHTTC_C','WEIGHTLBTC_C','BMICAT_C']
files_with_body_cols = ('data/child22.csv','data/child20.csv')

# Collect the frames and concatenate once; concatenating inside the loop
# re-copies all earlier rows on every pass.
post_19_frames = []

for file in post_19_files:
    if file in files_with_body_cols:
        cols = post_19_cols + post_19_body_cols
    else:
        cols = post_19_cols

    post_19_frames.append(
        pd.read_csv(file,
                    sep=',',
                    header=0,
                    index_col=['HHX','SRVY_YR'],
                    usecols=cols
                    )
    )

# FMX and FPX are not read from these files; both are set to the
# constant 1 so the index has the same four levels as the earlier years.
final_df = (
    pd.concat(post_19_frames)
    .assign(FMX=1, FPX=1)
    .reset_index()
    .set_index(['HHX','SRVY_YR','FMX','FPX'])
)

duplicates = final_df.index.duplicated()

print(final_df.head())
print(len(final_df))
print(final_df.index[duplicates])


                         HISPALLP_C  REGION  SEX_C  AGEP_C  ASDNW_C  ASDEV_C  \
HHX     SRVY_YR FMX FPX                                                        
H045277 2023    1   1             3       3      2      14      NaN      2.0   
H021192 2023    1   1             2       3      2      11      NaN      2.0   
H025576 2023    1   1             2       3      1      15      NaN      2.0   
H058458 2023    1   1             2       3      2       8      NaN      2.0   
H047432 2023    1   1             3       3      1      12      NaN      2.0   

                         ADHDNW_C  ADHDEV_C     WTFA_C  BMICAT_C  \
HHX     SRVY_YR FMX FPX                                            
H045277 2023    1   1         NaN       2.0  13012.875       NaN   
H021192 2023    1   1         NaN       2.0  16680.509       NaN   
H025576 2023    1   1         NaN       2.0   5404.923       NaN   
H058458 2023    1   1         NaN       2.0   9758.143       NaN   
H047432 2023    1   1         1

In [9]:
# Stack the post-2019 rows on top of the 2005-2018 rows and rebuild the
# four-level index.
# NOTE: final_df is rebuilt from itself, so re-running this cell appends
# the 2005-2018 rows a second time.
final_df = (
    pd.concat([final_df, df_2005_2018], axis=0)
    .reset_index()
    .set_index(['HHX','SRVY_YR','FMX','FPX'])
)
print(final_df.head())
print(len(final_df))

                         HISPALLP_C  REGION  SEX_C  AGEP_C  ASDNW_C  ASDEV_C  \
HHX     SRVY_YR FMX FPX                                                        
H045277 2023    1   1             3       3      2      14      NaN      2.0   
H021192 2023    1   1             2       3      2      11      NaN      2.0   
H025576 2023    1   1             2       3      1      15      NaN      2.0   
H058458 2023    1   1             2       3      2       8      NaN      2.0   
H047432 2023    1   1             3       3      1      12      NaN      2.0   

                         ADHDNW_C  ADHDEV_C     WTFA_C  BMICAT_C  \
HHX     SRVY_YR FMX FPX                                            
H045277 2023    1   1         NaN       2.0  13012.875       NaN   
H021192 2023    1   1         NaN       2.0  16680.509       NaN   
H025576 2023    1   1         NaN       2.0   5404.923       NaN   
H058458 2023    1   1         NaN       2.0   9758.143       NaN   
H047432 2023    1   1         1

In [10]:
### reformat fields
# columns whose converter reads one fixed column name
fixed_column_converters = (
    ('REGION', convert_region_to_txt),
    ('SEX_C', convert_sex_c_to_txt),
    ('HISPALLP_C', convert_HISPALLP_C_to_txt),
)
for column, converter in fixed_column_converters:
    final_df[column] = final_df.apply(converter, axis=1)

# question columns share one converter that takes the column name
for column in ('ADHDEV_C', 'ADHDNW_C', 'ASDEV_C', 'ASDNW_C'):
    final_df[column] = final_df.apply(convert_question_fields, axis=1, args=(column,))

In [12]:
# preview the first rows to check the converted text labels
final_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HISPALLP_C,REGION,SEX_C,AGEP_C,ASDNW_C,ASDEV_C,ADHDNW_C,ADHDEV_C,WTFA_C,BMICAT_C,WEIGHTLBTC_C,HEIGHTTC_C
HHX,SRVY_YR,FMX,FPX,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
H045277,2023,1,1,Non-Hispanic Black/African American only,South,Female,14,Don't Know,No,Don't Know,No,13012.875,,,
H021192,2023,1,1,Non-Hispanic White only,South,Female,11,Don't Know,No,Don't Know,No,16680.509,,,
H025576,2023,1,1,Non-Hispanic White only,South,Male,15,Don't Know,No,Don't Know,No,5404.923,,,
H058458,2023,1,1,Non-Hispanic White only,South,Female,8,Don't Know,No,Don't Know,No,9758.143,,,
H047432,2023,1,1,Non-Hispanic Black/African American only,South,Male,12,Don't Know,No,Yes,Yes,20404.132,,,


In [13]:
## export the file for later use (the four index levels are written as columns)
final_df.to_csv('data/final_data_set.csv')

In [18]:
# show any rows whose WTFA_C value is missing
final_df[final_df['WTFA_C'].isnull()]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,HISPALLP_C,REGION,SEX_C,AGEP_C,ASDNW_C,ASDEV_C,ADHDNW_C,ADHDEV_C,WTFA_C,BMICAT_C,WEIGHTLBTC_C,HEIGHTTC_C
HHX,SRVY_YR,FMX,FPX,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
