In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the cleaned data set from the working directory; header=0 takes the
# column names from the first row of the CSV.
data_set = pd.read_csv('clean_data.csv', header=0)

# #import data schema
# data_schema = pd.read_csv('./cleaned_HCMST_2017_schema.csv', header=0, index_col=0)
# data_set

In [35]:
#define functions to convert the data set into the format used for EDA
def create_met_places(df):
    """Append a 'places_met' column naming the first place flagged 'yes'.

    Every column from 'school' through the last column of ``df`` is treated
    as a 'yes'/'no' indicator of a place where the couple met.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'school' column. That column and all columns to its
        right are expected to hold the strings 'yes' / 'no'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; 'places_met' is inserted in place
        as the last column.

    Raises
    ------
    ValueError
        If there is no 'school' column, or 'places_met' already exists.
    """
    first_place = list(df.columns).index('school')
    place_cols = df.columns[first_place:]
    # Compare against 'yes' directly to get a boolean mask. The previous
    # replace({'yes': 1, 'no': 0}) relied on pandas silently downcasting the
    # object columns to integers, which is deprecated; idxmax does not accept
    # object-dtype data.
    is_yes = df[place_cols] == 'yes'
    # idxmax(axis=1) yields the label of the first column holding the row
    # maximum, i.e. the first 'yes'.
    # NOTE(review): a row with no 'yes' at all is labelled with the first
    # place column ('school'), because every entry ties at False. Confirm
    # such rows cannot occur, or decide how they should be labelled.
    df.insert(loc=df.shape[1], column='places_met', value=is_yes.idxmax(axis=1))
    return df

def fix_household_inc(df):
    """Recode 'Household_Income' brackets as ordinal ranks and add their square.

    Brackets of the form "$X to $Y" are ranked 1, 2, ... by their lower bound
    X. 'Less than $5,000' is rank 0 and '$250,000 or more' is ranked one above
    the highest "$X to $Y" bracket present in the data. The squared rank is
    inserted as 'Household_Income2' at column position 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Household_Income' and have at least six columns (the
        new column is inserted at position 6).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    hh_labels = df['Household_Income'].unique()
    # Lower bound, in dollars, of every "$X to $Y" bracket present in the data.
    # Non-string labels (e.g. NaN) are skipped instead of raising TypeError.
    lower_bound = {
        label: int(label.split(' to ', 1)[0].replace('$', '').replace(',', ''))
        for label in hh_labels
        if isinstance(label, str) and ' to ' in label
    }
    ordered_bounds = sorted(lower_bound.values())
    inc_dict = {label: ordered_bounds.index(bound) + 1
                for label, bound in lower_bound.items()}
    inc_dict['Less than $5,000'] = 0
    # Rank the open-ended top bracket from the number of range brackets. The
    # previous len(hh_labels) - 1 collided with the highest range whenever
    # the bottom bracket (or any other label) was missing from the data.
    inc_dict['$250,000 or more'] = len(lower_bound) + 1
    df = df.replace({'Household_Income': inc_dict})
    #create feature 'Household_Income2'
    df.insert(loc=6, column='Household_Income2', value=df['Household_Income'] ** 2)
    return df

def fix_rel_attendance(df):
    """Recode 'Religious_Attendance' as an ordinal scale.

    The scale runs from 0 ('More than once a week') to 5 ('Never').
    'Refused' is recategorized as 'A few times a year' and therefore receives
    that category's code (3).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'Religious_Attendance' column holds the text labels.

    Returns
    -------
    pandas.DataFrame
        A new frame with the column recoded; the input is not modified.
    """
    rel_dict = {'More than once a week': 0, 'Once a week': 1, 'Once or twice a month': 2,
                'A few times a year': 3, 'Once a year or less': 4, 'Never': 5}
    # Map 'Refused' straight to the numeric code. Replacing it with the label
    # text after the ordinal mapping had already run left the string
    # 'A few times a year' in an otherwise numeric column.
    rel_dict['Refused'] = rel_dict['A few times a year']
    return df.replace({'Religious_Attendance': rel_dict})


def fix_politics(df):
    """Recode party identification as 0-6 and add the absolute difference.

    'Politics' and 'P_Politics' are mapped from 'Strong Republican' (0) to
    'Strong Democrat' (6). A 'Refused' answer in 'P_Politics' is recategorized
    as the middle value 3 ('Undecided/Independent/Other'). The absolute
    difference between the two columns is inserted as 'Pol_Diff' at column
    position 8.

    Returns a new frame; the input frame is not modified.
    """
    party_codes = {
        'Strong Republican': 0,
        'Leans Republican': 1,
        'Not Strong Republican': 2,
        'Undecided/Independent/Other': 3,
        'Not Strong Democrat': 4,
        'Leans Democrat': 5,
        'Strong Democrat': 6,
    }
    # Only the P_Politics column gets the extra 'Refused' rule.
    p_codes = dict(party_codes)
    p_codes['Refused'] = 3
    recoded = df.replace({'Politics': party_codes, 'P_Politics': p_codes})
    gap = (recoded['Politics'] - recoded['P_Politics']).abs()
    recoded.insert(loc=8, column='Pol_Diff', value=gap)
    return recoded
    
def fix_education(df):
    """Collapse 'P_Education' to four levels, encode both columns, add 'Edu_Diff'.

    The detailed 'P_Education' labels are first collapsed into the four
    categories used by 'Education'. Both columns are then mapped to ordinals
    (0 = "Bachelor's degree or higher" ... 3 = 'Less than high school'), and
    the absolute difference is inserted as 'Edu_Diff' at column position 8.

    Returns a new frame; the input frame is not modified.
    """
    labels = list(df['P_Education'].unique())
    # NOTE(review): the collapsing below is positional. It depends on the
    # order in which labels first appear in the column, so a differently
    # ordered data set is recoded differently. Verify against the actual
    # data, or replace with an explicit label -> category mapping.
    first_low = labels.index('12th grade no diploma')
    collapse = dict.fromkeys(labels[first_low:], 'Less than high school')
    collapse[labels[0]] = 'High school'
    collapse.update({labels[pos]: 'Some college' for pos in (2, 4)})
    collapse.update({labels[pos]: 'Bachelor\'s degree or higher' for pos in (1, 3, 5)})
    collapsed = df.replace({'P_Education': collapse})
    #convert educational categories to ordinal form, create a new feature called 'Edu_Diff'
    level_code = {
        'Bachelor\'s degree or higher': 0,
        'Some college': 1,
        'High school': 2,
        'Less than high school': 3,
    }
    encoded = collapsed.replace({'Education': level_code, 'P_Education': level_code})
    gap = (encoded['Education'] - encoded['P_Education']).abs()
    encoded.insert(loc=8, column='Edu_Diff', value=gap)
    return encoded

def fix_age(df):
    """Return only the rows of ``df`` whose 'Age_Diff' value is not missing.

    'Age_Diff' is presumably the absolute difference between 'Age' and
    'P_Age' computed upstream (verify against the cleaning step); rows where
    it is NaN are dropped. The original index labels are kept.
    """
    has_age_diff = df['Age_Diff'].notna()
    return df.loc[has_age_diff]

In [36]:
#master function to transform data set with auxiliary functions
def data_preprocess(df):
    """Run every preprocessing step on a copy of ``df`` and return the result.

    The steps are applied in this fixed order: create_met_places,
    fix_household_inc, fix_rel_attendance, fix_politics, fix_education,
    fix_age. Each step receives the frame returned by the previous one. The
    caller's frame is left untouched because the pipeline starts from a copy.
    """
    steps = (
        create_met_places,
        fix_household_inc,
        fix_rel_attendance,
        fix_politics,
        fix_education,
        fix_age,
    )
    processed = df.copy()
    for step in steps:
        processed = step(processed)
    return processed

In [37]:
# Build the preprocessed frame. data_preprocess works on a copy, so data_set
# itself is left unmodified.
data_set2 = data_preprocess(data_set)

In [None]:
# Hold out the last 10% of rows as out-of-sample (OOS) data. The leading rows
# form the in-sample set on which a machine learning model can be built, and
# the model is then used to classify the OOS rows.
# NOTE(review): this slices the raw `data_set`, not the preprocessed
# `data_set2`, and takes rows in file order without shuffling. Confirm both
# are intended.
oos_size = int(data_set.shape[0] * 0.10)
ins_size = data_set.shape[0] - oos_size
data, oos_data = data_set.iloc[:ins_size], data_set.iloc[ins_size:]