In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import missingno as mgno


In [57]:
NSCH19 = pd.read_sas("data/nsch_2019e_topical.sas7bdat")
#NSCH22 = pd.read_sas("data/nsch_2022e_topical.sas7bdat")

In [None]:
NSCH19.info()

In [64]:
geo_features = ['FIPSST']
## K7Q02R_R is number of days missed in school because of illness or injury.
## Responses are binned in intervals of 3 days, with 1 meaning no missed days, 5 meaning 11+ days, 6 meaning no enrolled
## in school
edu_features = ['K7Q02R_R']
med_features = ['K4Q27', #Was there any time when child needed health care but did not receive?
                #Note that if K4Q27 = 2 (No), then AVAILABLE to TRANSPORTCC are null

                'AVAILABLE', 'APPOINTMENT', 'ISSUECOST', 
                'NOTELIG', 'NOTOPEN', 'TRANSPORTCC',
                'CURRCOV', 
                'HOWMUCH', #how much was paid for health care
                'INSTYPE', #public/ priv/ none
                'K3Q04_R', #any type of health ins
                'K12Q12', #govt assistance plan (yes/ no)
                # Note that if CURRCOV = 2 (No), then K12Q12 is null

                'K3Q21B', #How often are cost reasonable?  If HOWMUCH=1, then K3Q21B is null
                'K3Q20', #How often does health insurance cover needs? If CURRCOV=2, then K3Q20 is null
                'K3Q22', #How often allowed to see providers?  If CURRCOV=2, then K3Q22 is null
                'K3Q25', #Problems Paying for Medical or Health Care
                'K4Q01', #is there place can take child when they are sick
                'K4Q24_R', #Did this child see a specialist?
                'K4Q26', #How difficult to get specialist care?  If K4Q24_R=3, then K4Q26 is null.
                
                'S4Q01', #Did this child visit a doctor?
                'K5Q31_R', #Provider Communication with School.  If S4Q01=2, then K5Q31_R is null.
                'K5Q32' #Satisfactory communication from doctor to school.  If S4Q01=2 or K5Q31_R=2 or 3, then K5Q32 is null.
                ]           

features = geo_features + edu_features + med_features

In [None]:
def FIPS_to_State(data, state='both'):
    '''
    This function includes a column corresponding the state name and/ or abbreviation corresponding
    to the FIPS code

    Arguments:
    data = NSCH dataframe (in general, any dataframe with column 'FIPSST')
    state = 'abbr' for abbreviation only, 'full' for full state name, defaults to 'both'

    Returns: 
    dataframe with appended column(s) and FIPPST column changed to int type
    '''
    FIPS_state = pd.read_csv('data/FIPS_State.csv')
    FIPS_state['FIPSST'] = FIPS_state.FIPS

    data.FIPSST = data.FIPSST.apply(int)

    if state == 'abbr':
        data = data.merge(FIPS_state[['FIPSST', 'ABBR']], on='FIPSST')

    if state == 'full':
        data = data.merge(FIPS_state[['FIPSST', 'STATE']], on='FIPSST') 

    if state == 'both':
        data = data.merge(FIPS_state[['FIPSST', 'STATE', 'ABBR']], on='FIPSST')                

    return data


In [None]:
## This calculates the null correlation between features
mgno.heatmap(NSCH19[features])

In [None]:
NSCH19 = FIPS_to_State(NSCH19)

In [None]:
## This will list the percent of data which is missing by state
## I borrowed the code from https://stackoverflow.com/questions/46106954/using-isnull-and-groupby-on-a-pandas-dataframe

NSCH19_nullByState = NSCH19[features].isnull().groupby(NSCH19['ABBR']).mean()
NSCH19_nullByState.sort_values(by="K7Q02R_R", ascending=False)


In [67]:
def clean_NSCH(df, features, response = 'K7Q02R_R', 
               dropna_response = True, 
               cond_nans = True,
               rep_cond_nans = 'NA'):
    '''
    This function cleans up a dataframe in a similar style/ format of the NSCH data.  The reason for
    making a function is so that we can apply this generally to different years and to account for
    the train/ test split (e.g. imputation should be done on the training set to prevent data leakage).

    Arguments:
    df = NSCH dataframe
    features = list, a list of features we want to keep (including response variable)
    response = str, the response variable
    dropna_response = bool, drop nan from the response variable
    cond_nans = replaces conditional nans (i.e. those that depend on other feature values) 
                with rep_cond_nans value

                
    '''
    df = df[features]
    
    # Note: response nans should be dropped before data imputation
    if dropna_response == True: df = df[df[response].notna()]

    if cond_nans == True:
        for feat in ['AVAILABLE', 'APPOINTMENT', 'ISSUECOST', 
                'NOTELIG', 'NOTOPEN', 'TRANSPORTCC']:
            df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans

        if 'K12Q12' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K12Q12'] = rep_cond_nans
        if 'K3Q21B' in features: df.loc[NSCH19['HOWMUCH'] == 1, 'K3Q21B'] = rep_cond_nans        
        if 'K3Q20' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K3Q20'] = rep_cond_nans        
        if 'K3Q22' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K3Q22'] = rep_cond_nans  

        if 'K4Q26'and 'K4Q24_R' in features: df.loc[NSCH19['K4Q24_R'] == 3, 'K4Q26'] = rep_cond_nans
        if 'K5Q31_R'and 'S4Q01' in features: df.loc[NSCH19['S4Q01'] == 2, 'K5Q31_R'] = rep_cond_nans

        if 'K5Q32'and 'S4Q01' in features: df.loc[NSCH19['S4Q01'] == 2, 'K5Q32'] = rep_cond_nans
        if 'K5Q32'and 'K5Q31_R' in features: df.loc[NSCH19['K5Q31_R'] == 2, 'K5Q32'] = rep_cond_nans
        if 'K5Q32'and 'K5Q31_R' in features: df.loc[NSCH19['K5Q31_R'] == 3, 'K5Q32'] = rep_cond_nans 


    return df

In [71]:
clean_NSCH19 = clean_NSCH(NSCH19, features)

  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  df.loc[NSCH19['K4Q27'] == 2, feat] = rep_cond_nans
  if 'K12Q12' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K12Q12'] = rep_cond_nans
  if 'K3Q21B' in features: df.loc[NSCH19['HOWMUCH'] == 1, 'K3Q21B'] = rep_cond_nans
  if 'K3Q20' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K3Q20'] = rep_cond_nans
  if 'K3Q22' in features: df.loc[NSCH19['CURRCOV'] == 2, 'K3Q22'] = rep_cond_nans
  if 'K4Q26'and 'K4Q24_R' in features: df.loc[NSCH19['K4Q24_R'] == 3, 'K4Q26'] = rep_cond_nans
  if 'K5Q31_R'and 'S4Q01' in features: df.loc[NSCH19['S4Q01'] == 2, 'K5Q31_R'] = rep_cond_nans
  if 'K5Q32'and 'S4Q01' in features: df.loc[NSCH19['S4Q01'] == 2, 'K5Q32'] = rep_cond_nans


In [77]:
clean_NSCH19.K3Q25

1        2.0
3        2.0
5        2.0
6        2.0
8        2.0
        ... 
29425    NaN
29427    2.0
29430    2.0
29431    2.0
29432    2.0
Name: K3Q25, Length: 21021, dtype: float64