In [36]:
import pandas as pd

In [37]:
#Reading the training features CSV file (path is relative to the
#current working directory) as a Pandas dataframe.
training_set_features = pd.read_csv('training_set_features.csv')

In [38]:
def label_encoding(df):
    """Replace the ordinal text columns of ``df`` with integer codes.

    The ``age_group``, ``education`` and ``income_poverty`` columns are
    mapped to ordered integers and stored as ``age_group_numerical``,
    ``education_numerical`` and ``income_poverty_numerical``; the three
    original text columns are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``age_group``, ``education`` and
        ``income_poverty`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three numeric columns added and the three
        source columns removed. Values that are not keys of the matching
        map (including missing values) become NaN.
    """
    age_map = {
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5,
    }

    education_map = {
        '< 12 Years': 1,
        '12 Years': 2,
        'Some College': 3,
        'College Graduate': 4,
    }

    income_map = {
        'Below Poverty': 1,
        '<= $75,000, Above Poverty': 2,
        '> $75,000': 3,
    }

    # assign() returns a new frame, so the caller's dataframe is left
    # untouched and re-running a cell that calls this function on the same
    # input always starts from the same state.
    encoded = df.assign(
        age_group_numerical=df['age_group'].map(age_map),
        education_numerical=df['education'].map(education_map),
        income_poverty_numerical=df['income_poverty'].map(income_map),
    )
    return encoded.drop(columns=['age_group', 'education', 'income_poverty'])

In [39]:
def data_preprocessing(df):
    """Ordinal-encode, then dummy-encode, a features dataframe.

    The dataframe ``df`` is first passed through ``label_encoding``; the
    result is handed to ``pd.get_dummies`` with ``prefix_sep='_'`` and
    ``drop_first=True``, and the dataframe that call produces is returned.
    """
    ordinal_encoded = label_encoding(df)
    return pd.get_dummies(ordinal_encoded, prefix_sep='_', drop_first=True)

In [47]:
#Calling the data_preprocessing function on the
#dataframe we imported (training_set_features),
#and then exporting the resulting dataframe as a CSV file.
#NOTE(review): to_csv is called without index=False, so the row index is
#also written as an extra unnamed first column -- confirm that whatever
#reads encoded_training_set_features.csv expects it.
data_preprocessing(training_set_features).to_csv('encoded_training_set_features.csv')

In [48]:
#Running the preprocessing again and keeping the result in df so that
#the following cells can inspect it; the bare `df` displays the dataframe.
df = data_preprocessing(training_set_features)
df

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,0,0,0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
7,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0
8,8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
9,9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#Printing every column name of the encoded dataframe, one per line.
for col in df.columns:
    print(col)

respondent_id
h1n1_concern
h1n1_knowledge
behavioral_antiviral_meds
behavioral_avoidance
behavioral_face_mask
behavioral_wash_hands
behavioral_large_gatherings
behavioral_outside_home
behavioral_touch_face
doctor_recc_h1n1
doctor_recc_seasonal
chronic_med_condition
child_under_6_months
health_worker
health_insurance
opinion_h1n1_vacc_effective
opinion_h1n1_risk
opinion_h1n1_sick_from_vacc
opinion_seas_vacc_effective
opinion_seas_risk
opinion_seas_sick_from_vacc
household_adults
household_children
age_group_numerical
education_numerical
income_poverty_numerical
race_Hispanic
race_Other or Multiple
race_White
sex_Male
marital_status_Not Married
rent_or_own_Rent
employment_status_Not in Labor Force
employment_status_Unemployed
hhs_geo_region_bhuqouqj
hhs_geo_region_dqpwygqj
hhs_geo_region_fpwskwrf
hhs_geo_region_kbazzjca
hhs_geo_region_lrircsnp
hhs_geo_region_lzgpxyit
hhs_geo_region_mlyzmhmf
hhs_geo_region_oxchjgsf
hhs_geo_region_qufhixun
census_msa_MSA, Principle City
census_msa_Non-MSA


In [50]:
#Displaying the first 10 rows of the encoded dataframe.
df.head(10)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,employment_occupation_qxajmpny,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0,0,0,0,0,0,1,0,0,0
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,0,0,0
6,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
7,7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,1,0,0
8,8,0.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
9,9,2.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
