<a href="https://colab.research.google.com/github/elizabethyoo/team_9/blob/data%2Fhousehold_pulse/Household_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import library

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import glob
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import time
import re

In [3]:
files = glob.glob(f"/content/drive/MyDrive/group_9/Datasets/household_pulse/pulse*.csv") 
excel_files = glob.glob(f"/content/drive/MyDrive/group_9/Datasets/household_pulse/*.xlsx") 

In [6]:
state_number_ref= {
    '01':'Alabama',
    '02':'Alaska',
    '04':'Arizona',
    '05':'Arkansas',
    '06':'California',
    '08':'Colorado',
    '09':'Connecticut',
    '10':'Delaware',
    '11':'District of Columbia',
    '12':'Florida',
    '13':'Georgia',
    '15':'Hawaii',
    '16':'Idaho',
    '17':'Illinois',
    '18':'Indiana',
    '19':'Iowa',
    '20':'Kansas',
    '21':'Kentucky',
    '22':'Louisiana',
    '23':'Maine',
    '24':'Maryland',
    '25':'Massachusetts',
    '26':'Michigan',
    '27':'Minnesota',
    '28':'Mississippi',
    '29':'Missouri',
    '30':'Montana',
    '31':'Nebraska',
    '32':'Nevada',
    '33':'New Hampshire',
    '34':'New Jersey',
    '35':'New Mexico',
    '36':'New York',
    '37':'North Carolina',
    '38':'North Dakota',
    '39':'Ohio',
    '40':'Oklahoma',
    '41':'Oregon',
    '42':'Pennsylvania',
    '44':'Rhode Island',
    '45':'South Carolina',
    '46':'South Dakota',
    '47':'Tennessee',
    '48':'Texas',
    '49':'Utah',
    '50':'Vermont',
    '51':'Virginia',
    '53':'Washington',
    '54':'West Virginia',
    '55':'Wisconsin',
    '56':'Wyoming'
}


# Getting dates of when the data is collected for each week of the survey
URL = 'https://www.census.gov/programs-surveys/household-pulse-survey/datasets.html'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
txt = soup.get_text()
wk_str = re.findall('(?<=Household Pulse Survey PUF: )[^\s]+\s\d+\s\S\s[^\s]+\s\d+', txt)
wk_number = list(reversed(np.arange(1,len(wk_str)+1)))
wk_date_dict = dict(zip(wk_number, wk_str))

## Create a function to clean the data

In [44]:
def data_cleaning(df):
  '''
  A function to create features based on different questionaires/ answers. 
    - FOOD_SCARCITY
    - HOUSING SECURITY 
    - SURVEY YEAR
    - AGE
    - GENDER
    - RACE
    - HISPANIC ORIGIN
    - EDUCATION
    - INCOME
    - RECEIVED_COVID_VACCINE
    - COVID_VACC_DOSE
  '''
 
  ### FOOD_SCARCITY ###
  df.loc[((df['CURFOODSUF']==3)|(df['CURFOODSUF']==4)), 'FOOD_SCARCITY'] = True
  df.loc[((df['CURFOODSUF']==1)|(df['CURFOODSUF']==2)), 'FOOD_SCARCITY'] = False
 

  ### HOUSING SECURITY ###
  if df.WEEK.unique() >= 13:
    df.loc[((df['RENTCUR']==2)|(df['MORTCONF']==1)), 'HOUSING_INSECURITY'] = True
    df.loc[((df['MORTCUR']==2)|(df['MORTCONF']==1)), 'HOUSING_INSECURITY'] = True

    df.loc[((df['RENTCUR']==1)|(df['MORTCONF']!=1)), 'HOUSING_INSECURITY'] = False
    df.loc[((df['MORTCUR']==1)|(df['MORTCONF']!=1)), 'HOUSING_INSECURITY'] = False
  
  else:
    df.loc[((df['MORTLMTH']==2)|(df['MORTCONF']==1)), 'HOUSING_INSECURITY'] = True
    df.loc[((df['MORTLMTH']==1)|(df['MORTCONF']!=1)), 'HOUSING_INSECURITY'] = False


  ### YEAR ###
  if df.WEEK.unique() >= 22:
    df['YEAR'] = 2021
  else:
    df['YEAR'] = 2020
  

  ### AGE ###
  df['AGE'] = df['YEAR'] - df['TBIRTH_YEAR']
  bins = [0, 18, 25, 40, 50, 60, 70, 120]
  labels = ['<18 ','18-24', '25-39', '40-49', '50-59', '60-69', '70+']
  df['AGE_GRP'] = pd.cut(df.AGE, bins, labels = labels,include_lowest = True)


  ### GENDER ###
  if 'EGENDER' in df.columns:
    df['GENDER'] = df['EGENDER']
  if 'EGENID_BIRTH' in df.columns:
     df['GENDER'] = df['EGENID_BIRTH']

  df['MALE'] = np.where(df['GENDER']==1, True, False)
  df['FEMALE'] = np.where(df['GENDER']==2, True, False)


  ### RACE ###
  df.loc[df['RRACE']==1, 'RACE'] = 'White_alone'
  df.loc[df['RRACE']==2, 'RACE'] = 'Black_alone'
  df.loc[df['RRACE']==3, 'RACE'] = 'Asian,_alone'
  df.loc[df['RRACE']==4, 'RACE'] = 'Any_other_race_alone_or_race_in_combination'


  ### HISPANIC ORIGIN ###
  df.loc[df['RHISPANIC']==1, 'HISPANIC_ORIGIN'] = True
  df.loc[df['RHISPANIC']==2, 'HISPANIC_ORIGIN'] = False


  ### EDUCATION ###
  df.loc[df['EEDUC']==1, 'EDUCATION'] = 'Less than high school'
  df.loc[((df['EEDUC']==2)|(df['EEDUC']==3)), 'EDUCATION'] = 'High school diploma or GED'
  df.loc[((df['EEDUC']==4)|(df['EEDUC']==5)), 'EDUCATION'] = 'Some college/associate degree'
  df.loc[df['EEDUC']>5, 'EDUCATION'] = "Bachelor's degree or higher"
  

  ### INCOME ###
  df.loc[df['INCOME']==1, 'INCOME_BEFORE_TAX'] = 'Less than $25,000'
  df.loc[df['INCOME']==2, 'INCOME_BEFORE_TAX'] = '$25,000 - $34,999'
  df.loc[df['INCOME']==3, 'INCOME_BEFORE_TAX'] = '$35,000 - $49,999'
  df.loc[df['INCOME']==4, 'INCOME_BEFORE_TAX'] = '$50,000 - $74,999'
  df.loc[df['INCOME']==5, 'INCOME_BEFORE_TAX'] = '$75,000 - $99,999'
  df.loc[df['INCOME']==6, 'INCOME_BEFORE_TAX'] = '$100,000 - $149,999'
  df.loc[df['INCOME']==7, 'INCOME_BEFORE_TAX'] = '$150,000 - $199,999'
  df.loc[df['INCOME']==8, 'INCOME_BEFORE_TAX'] = '$200,000 and above'


  ### COVID-19 VACCINATION RATE ###
  if 'RECVDVACC' in df.columns:
    df.loc[df['RECVDVACC']==1, 'RECEIVED_COVID_VACCINE'] = True
    df.loc[df['RECVDVACC']==2, 'RECEIVED_COVID_VACCINE'] = False
  else:
    df.loc[:, 'RECEIVED_COVID_VACCINE'] = np.nan
  

  ### COVID_19_VACC_DOSAGE ###
  if 'DOSESRV' in df.columns:
    df.loc[df['DOSESRV']==1, 'COVID_VACC_DOSE'] = 'received all required doses'
    df.loc[df['DOSESRV']==2, 'COVID_VACC_DOSE'] = 'plan to receive all required doses'
    df.loc[df['DOSESRV']==3, 'COVID_VACC_DOSE'] = "don't plan to receive all required doses"
  else:
    df.loc[:, 'COVID_VACC_DOSE'] = np.nan
 
  
  ### CHLDCARE ###
  if 'CHLDCARE' in df.columns:
    df.loc[df['CHLDCARE']==1, 'CHILDCARE'] = True
    df.loc[df['CHLDCARE']==2, 'CHILDCARE'] = False
    df.loc[df['CHLDCARE']==3, 'CHILDCARE'] = np.nan # not applicable
  else:
    df.loc[:, 'CHILDCARE'] = np.nan

  ### Convert wk column to date period ###
  if df.WEEK.unique() >= 22:
      year = '2021'
  else:
      year = '2020'
  date_period = df.WEEK.map(wk_date_dict).unique()

  mnth_start, day_start, dash, mnth_end,day_end = date_period[0].split()
  start_date = mnth_start + day_start + year
  end_date = mnth_end + day_end + year
  df['START_DATE'] = pd.to_datetime(start_date, format= '%B%d%Y')
  df['END_DATE'] = pd.to_datetime(end_date, format= '%B%d%Y')

  return df

## Read and clean the data by week then append them all into one big dataframe

In [47]:
col_to_keep = [
               'SCRAM', 'WEEK', 'EST_ST', 'EST_MSA',  
               'PWEIGHT', 'FOOD_SCARCITY','HOUSING_INSECURITY', 'YEAR', 'AGE',
               'AGE_GRP', 'GENDER', 'MALE','FEMALE', 'RACE', 'HISPANIC_ORIGIN',
               'EDUCATION', 'INCOME_BEFORE_TAX', 'RECEIVED_COVID_VACCINE', 
               'COVID_VACC_DOSE', 'CHILDCARE', 'START_DATE', 'END_DATE'
               ]
               
combined_cleaned_df = pd.DataFrame()

for file_name in files:
  df = pd.read_csv(file_name, na_values=[-88, -99])
  df_2 = data_cleaning(df) 
  df_3 = df_2.loc[:,col_to_keep]
  combined_cleaned_df = pd.concat([combined_cleaned_df, df_3])

In [49]:
combined_cleaned_df.shape

(3014808, 22)

In [50]:
combined_cleaned_df.head()

Unnamed: 0,SCRAM,WEEK,EST_ST,EST_MSA,PWEIGHT,FOOD_SCARCITY,HOUSING_INSECURITY,YEAR,AGE,AGE_GRP,GENDER,MALE,FEMALE,RACE,HISPANIC_ORIGIN,EDUCATION,INCOME_BEFORE_TAX,RECEIVED_COVID_VACCINE,COVID_VACC_DOSE,CHILDCARE,START_DATE,END_DATE
0,V370000001S10010150000113,37,1,,3311.293102,False,False,2021,65,60-69,1,True,False,White_alone,True,Some college/associate degree,,True,received all required doses,,2021-09-01,2021-09-13
1,V370000001S15010392400113,37,1,,1163.798474,False,False,2021,48,40-49,2,False,True,White_alone,True,Bachelor's degree or higher,"$50,000 - $74,999",True,received all required doses,,2021-09-01,2021-09-13
2,V370000001S18010218900123,37,6,31080.0,2752.586374,False,False,2021,49,40-49,2,False,True,White_alone,True,Some college/associate degree,"Less than $25,000",True,received all required doses,,2021-09-01,2021-09-13
3,V370000001S33010449200123,37,1,,2818.850471,False,False,2021,39,25-39,2,False,True,White_alone,True,Bachelor's degree or higher,"$50,000 - $74,999",True,received all required doses,False,2021-09-01,2021-09-13
4,V370000001S37010131800113,37,1,,3311.293102,False,False,2021,75,70+,1,True,False,White_alone,True,Some college/associate degree,"$50,000 - $74,999",True,received all required doses,,2021-09-01,2021-09-13


In [51]:
combined_cleaned_df.tail()

Unnamed: 0,SCRAM,WEEK,EST_ST,EST_MSA,PWEIGHT,FOOD_SCARCITY,HOUSING_INSECURITY,YEAR,AGE,AGE_GRP,GENDER,MALE,FEMALE,RACE,HISPANIC_ORIGIN,EDUCATION,INCOME_BEFORE_TAX,RECEIVED_COVID_VACCINE,COVID_VACC_DOSE,CHILDCARE,START_DATE,END_DATE
74408,V019950006S50560167700013,1,56,,632.943065,False,False,2020,47,40-49,1,True,False,White_alone,True,Some college/associate degree,"$75,000 - $99,999",,,,2020-04-23,2020-05-05
74409,V019950006S54560981370011,1,56,,503.836089,False,False,2020,64,60-69,2,False,True,White_alone,True,High school diploma or GED,,,,,2020-04-23,2020-05-05
74410,V019950006S70560411410012,1,19,,214.985437,False,False,2020,65,60-69,2,False,True,White_alone,True,Bachelor's degree or higher,"$75,000 - $99,999",,,,2020-04-23,2020-05-05
74411,V019950006S73560729560012,1,56,,158.205899,False,False,2020,38,25-39,2,False,True,White_alone,True,Bachelor's degree or higher,"$200,000 and above",,,,2020-04-23,2020-05-05
74412,V019950006S93560466860013,1,56,,263.343871,False,False,2020,74,70+,1,True,False,White_alone,True,Bachelor's degree or higher,"$200,000 and above",,,,2020-04-23,2020-05-05


## Upload the dataset onto Gdrive

In [52]:
combined_cleaned_df.to_csv('/content/drive/MyDrive/group_9/Datasets/household_pulse/cleaned_cmbn_household.csv', index= False)