In [760]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns
from sklearn import preprocessing as pp
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [761]:
# Features are mainly demographic data from onset of survey,
# excluding 1 relationship centric feature - 'relationship quality'

# Goal is to only use data gathered at the start of the survey, Wave 1,
# to try to predict the outcome of partnered respondents, based on respondent and partner demographic data
# This allows us to investigate and discover what type of individuals 
# have better chance of surviving a 6 year relationship

# EDA & Cleaning

In [762]:
df_w5 = pd.read_stata('./datasets/HCMST_wave_5_supplement_ver_1.dta')
df_w4 = pd.read_stata('./datasets/wave_4_supplement_v1_2.dta')
df_w123 = pd.read_stata('./datasets/HCMST_ver_3.04.dta')

In [763]:
w5_cols = df_w5.columns.tolist()
w4_cols = df_w4.columns.tolist()
w123_cols = df_w123.columns.tolist()

In [764]:
# Checking number of features for each wave
print(len(w5_cols), len(w4_cols), len(w123_cols))

78 62 387


In [765]:
# checking rows and columns for each wave
print(df_w123.shape, df_w4.shape, df_w5.shape)

(4002, 387) (4002, 62) (4002, 78)


In [766]:
# 
wave_1_couples = df_w123.loc[(df_w123['qflag'] == 'partnered') & (df_w123['partner_deceased'] == 'not deceased'), 'caseid_new':]

In [767]:
wave_1_couples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2981 entries, 0 to 4001
Columns: 387 entries, caseid_new to w3_nonmbtiming_month
dtypes: category(325), float32(16), float64(39), int32(4), int8(2), object(1)
memory usage: 2.1+ MB


In [768]:
# Get relationship status of couples that have deceased partners
w2_dead = wave_1_couples[wave_1_couples.w2_broke_up == 'partner passed away'].index
w3_dead = wave_1_couples[wave_1_couples.w3_broke_up == 'partner deceased'].index
w4_dead = df_w4[df_w4.w4_broke_up == 'partner passed away'].index
w5_dead = df_w5[df_w5.w5_broke_up == 'partner deceased'].index

print(len(wave_1_couples.loc[wave_1_couples.loc[w2_dead,['qflag']].index][wave_1_couples.qflag == 'partnered'].index))
print('')
print(len(wave_1_couples.loc[wave_1_couples.loc[w3_dead,['w2_broke_up']].index][wave_1_couples.w2_broke_up == 'still together'].index))
print('') 
print(len(wave_1_couples.loc[wave_1_couples.loc[w4_dead,['w3_broke_up']].index][wave_1_couples.w3_broke_up == 'still together'].index))
print('') 
print(len(df_w4.loc[df_w4.loc[w5_dead,['w4_broke_up']].index][df_w4.w4_broke_up == 'still together'].index))

40

  import sys
  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()




18

12

8


  del sys.path[0]


In [769]:
w2_dead_st = wave_1_couples.loc[wave_1_couples.loc[w2_dead,['qflag','caseid_new']].index][wave_1_couples.qflag == 'partnered'].caseid_new.values
w3_dead_st = wave_1_couples.loc[wave_1_couples.loc[w3_dead,['w2_broke_up','caseid_new']].index][wave_1_couples.w2_broke_up == 'still together'].caseid_new.values
w4_dead_st = wave_1_couples.loc[wave_1_couples.loc[w4_dead,['w3_broke_up','caseid_new']].index][wave_1_couples.w3_broke_up == 'still together'].caseid_new.values
w5_dead_st = df_w4.loc[df_w4.loc[w5_dead,['w4_broke_up','caseid_new']].index][df_w4.w4_broke_up == 'still together'].caseid_new.values

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [770]:
# prior relationship status of all couples that have deceased partners is still together
tot_dead_list = np.concatenate((w2_dead_st,w3_dead_st,w4_dead_st,w5_dead_st))

In [771]:
wave_2_couples_broke_up = wave_1_couples.loc[wave_1_couples['w2_broke_up']=='broke up']
wave_3_couples_broke_up = wave_1_couples.loc[wave_1_couples['w3_broke_up']=='broke up']
w4_breakup = df_w4.loc[df_w4['w4_broke_up']=='broke up']
w5_breakup = df_w5.loc[df_w5['w5_broke_up']=='broke up']
survived_couples=df_w5.loc[df_w5['w5_broke_up']=='still together']

In [772]:
w2_breakup_list=wave_2_couples_broke_up['caseid_new'].tolist()
w3_breakup_list=wave_3_couples_broke_up['caseid_new'].tolist()
w4_breakup_list=w4_breakup['caseid_new'].tolist()
w5_breakup_list=w5_breakup['caseid_new'].tolist()
survived_couples_list=survived_couples['caseid_new'].tolist()

In [773]:
# Must make sure all couples in dataset are partnered right from the start
cp_index = np.concatenate((w4_breakup.index,w5_breakup.index,survived_couples.index))
wave_1_couples.loc[cp_index].head()

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
22,44486,3581,3581.0,48,45-54,45-59,"some college, no degree",some college,"black, non-hispanic",female,...,,,,,,,,,,
73,97195,43431,43431.0,19,18-24,18-29,"some college, no degree",some college,"white, non-hispanic",male,...,,,yes,no,"no, did not marry [xNameP]",,,,,
97,121515,39909,39909.0,51,45-54,45-59,bachelors degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,yes,no,"no, did not marry [xNameP]",,,,,
119,144545,109536,109536.0,54,45-54,45-59,professional or doctorate degree,bachelor's degree or higher,"other, non-hispanic",male,...,,,yes,no,"no, did not marry [xNameP]",,,,,
122,147245,69278,69278.0,24,18-24,18-29,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,,,yes,yes,"no, did not marry [xNameP]",,,,,


In [774]:
# for x in tot_dead_st:
#     if x in w2_breakup_list or x in w3_breakup_list or x in w4_breakup_list or x in w5_breakup_list:
#         print(x)

In [775]:
# check the imbalance of the data
print(len(w5_breakup_list)+len(w4_breakup_list)+len(w3_breakup_list)+len(w2_breakup_list))
print(len(tot_dead_list)+len(survived_couples_list))

503
1144


In [776]:
breakup_list = np.concatenate((w5_breakup_list,w4_breakup_list,w3_breakup_list,w2_breakup_list), axis=0)
together_list = np.concatenate((tot_dead_list,survived_couples_list), axis=0)

In [777]:
# only want wave 1 features
features = wave_1_couples.loc[:,:'coresident'].columns
couple_data = wave_1_couples[features]

In [778]:
couple_data.set_index(keys=['caseid_new'], inplace=True)

In [779]:
# dropping obviously irrelevant columns
cols_to_drop = couple_data.loc[w2_breakup_list,'pphhcomp11_member2_age':'weight_couples_coresident'].columns

In [780]:
couple_data.drop(labels=cols_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [781]:
couple_breakup = couple_data.loc[breakup_list,:]
couple_together = couple_data.loc[together_list,:]

In [782]:
print(couple_breakup.shape)
print(couple_together.shape)

(503, 231)
(1144, 231)


In [783]:
couple_breakup['relationship_outcome_6yrs'] = [1 for x in range(couple_breakup.shape[0])]
couple_together['relationship_outcome_6yrs'] = [0 for x in range(couple_together.shape[0])]

In [784]:
print(couple_breakup.relationship_outcome_6yrs.unique())
print(couple_together.relationship_outcome_6yrs.unique())

[1]
[0]


In [785]:
couple_data = pd.concat([couple_breakup, couple_together], axis=0)

In [786]:
# drop more unnecessary columns that will not be useful features
couple_data.drop(labels=['weight1','weight2'], axis=1, inplace=True)
# drop age category columns, already have age continuous column
couple_data.drop(labels=['ppagecat','ppagect4'], axis=1, inplace=True)
# use education category instead of education
couple_data.drop(labels=['ppeduc'], axis=1, inplace=True)
# use continuous household income instead of categorical household income
couple_data.drop(labels=['ppincimp'], axis=1, inplace=True)
# Drop MSA metropolitan statistical area, irrelevancy
couple_data.drop(labels=['ppmsacat'], axis=1, inplace=True)
# drop ppt01, ppt1317, ppt25, ppt612, features aggregated in children_in_hh
couple_data.drop(labels=['ppt01','ppt1317','ppt25','ppt612'], axis=1, inplace=True)
# drop ppq14arace, redundant column, there are individual race columns
couple_data.drop(labels=['ppq14arace'], axis=1, inplace=True)
# drop ppppcmdate_yrmo, pppadate_yrmo, date of survey is irrelevant
couple_data.drop(labels=['ppppcmdate_yrmo','pppadate_yrmo'], axis=1, inplace=True)
# drop HCMST_main_interview_yrmo, date of interview is irrelevant
# drop interview duration, qflag - all partnered
couple_data.drop(labels=['HCMST_main_interview_yrmo','duration','qflag'], axis=1, inplace=True)
# drop papglb_status, same as glbstatus
# drop recsource, source of recruitment is irrelevant
# drop s1a,s2,q3_codes,q5,q15a1_compressed,q17c,q17d redundant or too many NAs, 
couple_data.drop(labels=['glbstatus','papglb_status','recsource','s1','s1a','s2','q3_codes','q5','q15a1_compressed','q17c','q17d'], axis=1, inplace=True)
# drop q18a_1, q18a_2, q18a_3, q18b_codes, q18c_codes, low variance and too many NAs
couple_data.drop(labels=['q18a_1','q18a_2','q18a_3','q18b_codes','q18c_codes','q18a_refused'], axis=1, inplace=True)
# drop q20, q21a_refusal, q21b_refusal,q21c_refusal,q21d_refusal,q21e,q21e_refusal,q24_codes
# 'q31_9','q31_other_text_entered','q33_7','q33_other_text_entered',not usable column
# q35_codes, q35_text_entered, summary_q24_total
couple_data.drop(labels=['q20','q21a_refusal','q21b_refusal','q21c_refusal','q21d_refusal',
                        'q21e','q21e_refusal','q24_codes','q31_9','q31_other_text_entered',
                        'q33_7','q33_other_text_entered','q35_codes','q35_text_entered',
                        'summary_q24_total'], axis=1, inplace=True)
# drop marrynotreally, marrycountry, civilnotreally, partner_deceased
# partner_religion_reclassified, partner_religion_child_reclass, own_religion_child_reclass
# q32_internet, how_met_online, potential_partner_gender_recodes, how_long_ago_first_met_cat
# duplicated representation of previous columns, too many NA
couple_data.drop(labels=['marrynotreally','marrycountry','civilnotreally','partner_deceased',
                        'partner_religion_reclassified','partner_religion_child_reclass',
                        'own_religion_child_reclass','q32_internet','how_met_online',
                        'either_internet','either_internet_adjusted','potential_partner_gender_recodes',
                        'how_long_ago_first_met_cat','pphouseholdsize'], axis=1, inplace=True)
# drop q24_R_friend, q24_P_friend, q24_R_family, q24_P_family, q24_R_neighbor, q24_P_neighbor
# q24_R_cowork, q24_P_cowork,papreligion,q13b,respondent_religion_at_16,partner_religion_at_16
# q7b, q8b,q30
# columns are aggregated into other columns
couple_data.drop(labels=['q24_R_friend','q24_P_friend','q24_R_family','q24_P_family','q24_R_neighbor','q24_P_neighbor',
                        'q24_R_cowork','q24_P_cowork','papreligion','q13b','respondent_religion_at_16',
                        'partner_religion_at_16','q7b','q8b','q30'], axis=1, inplace=True)
# drop home_country_recode, too many NAs
couple_data.drop(labels=['home_country_recode'], axis=1, inplace=True)

In [787]:
# Iterate through each series in the dataframe, impute nulls with highest mode, binary values
import random
sr = random.SystemRandom()

def impute_bin_cols(cols):
    
    for col in cols:
        # get number of keys
        key_arr = [k for k in couple_data[col].value_counts().sort_values(ascending=False).to_dict().keys()]
        greater_key = key_arr[0]
        greater_counts = couple_data[col].value_counts().to_dict()[greater_key]

        if len(couple_data[col].value_counts().sort_values(ascending=False).to_dict().keys()) > 1:
            lesser_key = key_arr[1]
            lesser_counts = couple_data[col].value_counts().to_dict()[lesser_key]

            if greater_counts > lesser_counts:
                couple_data[col].fillna(greater_key, inplace=True)
            else:
                couple_data[col].fillna(sr.choice([greater_key,lesser_key]), inplace=True)
        else:
            couple_data[col].fillna(greater_key, inplace=True)
    
    return couple_data[cols].isnull().sum()

In [788]:
couple_data.isnull().sum().sort_values(ascending=False)

q22                                 1266
q17b                                 955
q26                                  730
q21d                                 698
q17a                                 692
parental_approval                    451
q21c                                 344
how_long_ago_first_cohab             344
q24_internet_game                     33
q24_internet_chat                     33
q24_vol_org                           33
q24_customer                          33
q24_bar_restaurant                    33
q24_internet_dating                   33
q24_internet_social_networking        33
q24_private_party                     33
q24_internet_community                33
q24_internet_other                    33
q24_public                            33
q24_military                          33
q24_blind_date                        33
q24_vacation                          33
q24_singles_service_non_internet      33
q24_church                            33
q24_btwn_I_sig_o

In [789]:
# investigate null features
# drop features with more than 70% null
couple_data.drop(labels=['q22'], axis=1, inplace=True)
# drop q17a, use q17b instead, current marriage already taken into account in 'married' column
couple_data.drop(labels=['q17a'], axis=1, inplace=True)

In [790]:
# fill null and 'refused' values
couple_data.q17b.fillna('never married', inplace=True)
couple_data.q17b = couple_data.q17b.map(lambda x: 'never married' if x == 'refused' else x)

couple_data.q26 = couple_data.q26.map(lambda x: 'did not attend same college or university' if x == 'refused' else x)

couple_data.parental_approval.fillna("don't approve or don't know", inplace=True)
# fill null values for continuous variable
couple_data.q21d.fillna(couple_data.q21d.median(), inplace=True)
couple_data.q21c.fillna(couple_data.q21c.median(), inplace=True)
couple_data.how_long_ago_first_cohab.fillna(couple_data.how_long_ago_first_cohab.median(), inplace=True)
# partner mum years of education
couple_data.partner_mom_yrsed.fillna(couple_data.partner_mom_yrsed.median(), inplace=True)
# distancemoved_10mi
couple_data.distancemoved_10mi.fillna(couple_data.distancemoved_10mi.median(), inplace=True)
# how_long_ago_first_romantic
couple_data.how_long_ago_first_romantic.fillna(couple_data.how_long_ago_first_romantic.median(), inplace=True)
# how_long_relationship
couple_data.how_long_relationship.fillna(couple_data.how_long_relationship.median(), inplace=True)
# q21b how old were you when your romantic relationship began with your partner
couple_data.q21b.fillna(couple_data.q21b.median(), inplace=True)
# respondent_mom_yrsed
couple_data.respondent_mom_yrsed.fillna(couple_data.respondent_mom_yrsed.median(), inplace=True)
# fill null and convert relationship quality to integers
couple_data.relationship_quality.fillna('fair', inplace=True)
couple_data.relationship_quality = couple_data.relationship_quality.map(lambda x: 5 if x == 'excellent' else 4
                                if x == 'good' else 3 if x == 'fair' else 2 if x == 'poor' else 1)
# Filling null values for gender attraction
couple_data[['gender_attraction','same_sex_couple']].loc[couple_data.gender_attraction.isnull()]
couple_data.gender_attraction.fillna('opposite gender only', inplace=True)
# q21a how old were you when you first met
couple_data.q21a.fillna(couple_data.q21a.median(), inplace=True)
# how_long_ago_first_met
couple_data.how_long_ago_first_met.fillna(couple_data.how_long_ago_first_met.median(), inplace=True)
# q16 how many of your relatives do you see in person at least once a month
couple_data.q16 = couple_data.q16.astype('float64')
couple_data.q16.fillna(np.median([int(i) for i in couple_data.q16.unique() if np.isnan(i) == False]), inplace=True)
# age difference
couple_data.age_difference.fillna(couple_data.age_difference.median(), inplace=True)# q9 how old is your partner
couple_data.q9.fillna(couple_data.q9.median(), inplace=True)
# respondent race 
couple_data.respondent_race.fillna('NH white', inplace=True)
# partner religion at 16 years old
couple_data.partner_relig_16_cat.fillna('No religion', inplace=True)
# respondent religion at 16 years old
couple_data.respondent_relig_16_cat.fillna('No religion', inplace=True)
# papglb_friend
# do you have any friends or relatives who you know to be gay, lesbian, or bis 
couple_data.papglb_friend.fillna('i would prefer to not answer this question', inplace=True)
# partner years of education
couple_data.partner_yrsed.fillna(couple_data.partner_yrsed.median(), inplace=True)
# partner race
couple_data.partner_race.fillna('NH white', inplace=True)

In [791]:
# q24 columns 
cols = [x for x in couple_data.columns if 'q24_' in x]
impute_bin_cols(cols)

q24_met_online                      0
q24_R_sig_other                     0
q24_P_sig_other                     0
q24_btwn_I_cowork                   0
q24_btwn_I_friend                   0
q24_btwn_I_family                   0
q24_btwn_I_sig_other                0
q24_btwn_I_neighbor                 0
q24_school                          0
q24_college                         0
q24_military                        0
q24_church                          0
q24_vol_org                         0
q24_customer                        0
q24_bar_restaurant                  0
q24_internet_dating                 0
q24_internet_social_networking      0
q24_internet_game                   0
q24_internet_chat                   0
q24_internet_community              0
q24_internet_other                  0
q24_public                          0
q24_private_party                   0
q24_blind_date                      0
q24_vacation                        0
q24_singles_service_non_internet    0
q24_business

In [792]:
# more binary columns
cols = ['met_through_as_coworkers','met_through_friends','met_through_family',
        'papevangelical','met_through_as_neighbors','US_raised','coresident']
impute_bin_cols(cols)

met_through_as_coworkers    0
met_through_friends         0
met_through_family          0
papevangelical              0
met_through_as_neighbors    0
US_raised                   0
coresident                  0
dtype: int64

In [793]:
# age column should be continuous
couple_data.ppage = couple_data.ppage.astype('float64')
# number of adults over 18+ in HH should be continuous
couple_data.ppt18ov = couple_data.ppt18ov.astype('float64')
# relationship_quality also continuous
couple_data.relationship_quality = couple_data.relationship_quality.astype('float64')

In [794]:
# loop through every feature and check values again
for col in couple_data.columns:
    if couple_data[col].dtype.name == 'category' or couple_data[col].dtype.name == 'object':
        print(couple_data[col].value_counts())
        print('------------------------')

bachelor's degree or higher    669
some college                   463
high school                    371
less than high school          144
Name: ppeducat, dtype: int64
------------------------
white, non-hispanic       1238
hispanic                   175
black, non-hispanic        118
2+ races, non-hispanic      65
other, non-hispanic         51
Name: ppethm, dtype: int64
------------------------
male      838
female    809
Name: ppgender, dtype: int64
------------------------
yes    1368
no      279
Name: pphhhead, dtype: int64
------------------------
a one-family house detached from any other house     1222
a building with 2 or more apartments                  240
a one-family house attached to one or more houses     127
a mobile home                                          57
boat, rv, van, etc.                                     1
Name: pphouse, dtype: int64
------------------------
married                860
never married          312
living with partner    284
divorced       

No     1592
Yes      55
Name: q24_fam_brother_active, dtype: int64
------------------------
No     1599
Yes      48
Name: q24_fam_mother_active, dtype: int64
------------------------
No     1623
Yes      24
Name: q24_fam_father_active, dtype: int64
------------------------
No     1616
Yes      31
Name: q24_fam_other_active, dtype: int64
------------------------
No     1615
Yes      32
Name: q24_fam_cousins_active, dtype: int64
------------------------
No     1639
Yes       8
Name: q24_fam_aunt_niece_active, dtype: int64
------------------------
No     1640
Yes       7
Name: q24_fam_uncle_nephew_active, dtype: int64
------------------------
No     1646
Yes       1
Name: q24_fam_grandmother_active, dtype: int64
------------------------
No     1647
Yes       0
Name: q24_fam_grandfather_active, dtype: int64
------------------------
No     1644
Yes       3
Name: q24_fam_sister_passive, dtype: int64
------------------------
No     1644
Yes       3
Name: q24_fam_brother_passive, dtype: int64


In [795]:
cols = [col for col in couple_data.columns if 'pprace' in col]
# Convert all refused value into Nan
for col in cols:
    # get hi
    couple_data[col] = couple_data[col].map(lambda x: np.nan if x == 'refused' else x)
# Impute them with larger count
impute_bin_cols(cols)

pprace_white                   0
pprace_black                   0
pprace_nativeamerican          0
pprace_asianindian             0
pprace_chinese                 0
pprace_filipino                0
pprace_japanese                0
pprace_korean                  0
pprace_vietnamese              0
pprace_otherasian              0
pprace_hawaiian                0
pprace_guamanian               0
pprace_samoan                  0
pprace_otherpacificislander    0
pprace_someotherrace           0
dtype: int64

In [796]:
# yes, friends                                  590
# no                                            454
# yes, both                                     452
# yes, relatives                                115
# i would prefer to not answer this question     36
# Name: papglb_friend, dtype: int64
# we can induce from the fact that they did not choose to answer the question that
# they have reservations about telling the truth. Most likely, answer is yes
couple_data.papglb_friend = couple_data.papglb_friend.map(lambda x: 'yes, both' if x == 'i would prefer to not answer this question' else x)

In [797]:
couple_data[couple_data.q4 == 'other, please specify'].loc[:,['alt_partner_gender','q4','same_sex_couple']]

Unnamed: 0_level_0,alt_partner_gender,q4,same_sex_couple
caseid_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
106296,male,"other, please specify",different sex couple
630534,female,"other, please specify",different sex couple


In [798]:
# remove ambiguous values from q4
# female                   845
# male                     800
# other, please specify      2
couple_data.q4.loc[106296] = 'male'
couple_data.q4.loc[630534] = 'female'

In [799]:
# remove ambiguous from q6a, q6b, 
# no (not latino or hispanic), white
couple_data.q6a = couple_data.q6a.map(lambda x: 'no (not latino or hispanic)' if x == 'refused' else x)
couple_data.q6b = couple_data.q6b.map(lambda x: 'white' if x == 'refused' else x)

In [800]:
# remove ambiguous from q7a, q8a, q13a,q19,q25,q27,q28,q31_1,q31_2,q31_3,q31_4,q31_5,q31_6,q31_7,q31_8
# q33_1,q33_2,q33_3
cols = ['q7a','q8a','q13a','q19','q25','q27','q28','q31_1','q31_2','q31_3','q31_4','q31_5','q31_6','q31_7',
        'q31_8','q33_1','q33_2','q33_3','q33_4','q33_5','q33_6']
for col in cols:
    couple_data[col] = couple_data[col].map(lambda x: np.nan if x == 'refused' else x)
impute_bin_cols(cols)

q7a      0
q8a      0
q13a     0
q19      0
q25      0
q27      0
q28      0
q31_1    0
q31_2    0
q31_3    0
q31_4    0
q31_5    0
q31_6    0
q31_7    0
q31_8    0
q33_1    0
q33_2    0
q33_3    0
q33_4    0
q33_5    0
q33_6    0
dtype: int64

In [801]:
# remove ambiguous from q10,q11, q14
# no formal education
couple_data.q10 = couple_data.q10.map(lambda x: 'no formal education' if x == 'refused' else x)
couple_data.q11 = couple_data.q11.map(lambda x: 'no formal education' if x == 'refused' else x)
couple_data.q14 = couple_data.q14.map(lambda x: 'no formal education' if x == 'refused' else x)

In [802]:
# democrat                         716
# republican                       404
# independent                      262
# no preference                    238
# another party, please specify     22
# refused                            5
# Name: q12, dtype: int64
couple_data.q12 = couple_data.q12.map(lambda x: 'no preference' if x == 'refused' else x)

In [803]:
# i earned more                      757
# partner earned more                669
# we earned about the same amount    211
# refused                             10
# Name: q23, dtype: int64
couple_data.q23 = couple_data.q23.map(lambda x: 'we earned about the same amount' if x == 'refused' else x)

In [804]:
# father and mother                      787
# neither father nor mother are alive    441
# mother only                            334
# father only                             78
# refused                                  7
# Name: q29, dtype: int64
couple_data.q29 = couple_data.q29.map(lambda x: 'father and mother' if x == 'refused' else x)

In [805]:
# no, we did not meet through the internet                                    1453
# yes, an internet dating or matchmaking site (like eharmony or match.com)      72
# yes, a social networking site (like facebook or myspace)                      38
# yes, a different kind of internet service                                     37
# yes, an internet chat room                                                    28
# yes, an internet classified advertising site (like craigslist)                13
# refused                                                                        6
# Name: q32, dtype: int64
couple_data.q32 = couple_data.q32.map(lambda x: 'no, we did not meet through the internet' if x == 'refused' else x)

In [806]:
# check features with 1 distinct value more than or equals to 95% of the sample
# if feature important, seek workaround, otherwise remove
def get_low_var(df):
    low_var_cols = []
    cols = df.columns
    
    for col in cols:
        arr = np.array(df[col].value_counts() / df.shape[0])
        for prop in arr:
            if prop >= 0.95:
                low_var_cols.append(col)
                
    return low_var_cols

In [807]:
# remove features with 1 distinct value more than or equals to 95% of sample
low_var_features = get_low_var(couple_data)
couple_data.drop(labels=low_var_features, axis=1, inplace=True)

In [808]:
# remove identical columns based on data dictionary
couple_data.drop(labels=['q4','q6a','q6b','q8a','q9','q10','q11','q13a','q14','q19','q21a','q21b','q21c'], axis=1, inplace=True)

In [809]:
# rename question columns
couple_data.rename(index=str, columns={'q4':'partner_gender','q7a':'partner_christ_type',
                                      'q12':'partner_politic_view','q16':'relatives_seen_per_month',
                                      'q17b':'marriage_count','q21d':'age_when_married',
                                       'q23':'higher_income_earner','q25':'same_high_school',
                                      'q26':'same_college_uni','q27':'grow_up_same_city_town',
                                      'q28':'both_parents_knew_before_met','q29':'parent_alive',
                                      'q31_1':'met_partner_work','q31_2':'met_partner_school',
                                      'q31_3':'met_partner_church','q31_4':'met_partner_online_dating',
                                      'q31_6':'met_partner_nightclub','q31_8':'met_partner_private_party',
                                      'q32':'met_partner_internet','q33_1':'fam_intro_partner',
                                      'q33_2':'friend_intro_partner','q33_3':'colleague_intro_partner',
                                      'q33_6':'self_intro_partner'}, inplace=True)

In [810]:
# Remove more duplicate columns
couple_data.drop(labels=['met_partner_online_dating','met_partner_internet',
                        'met_partner_school','met_partner_church',
                        'met_partner_private_party','met_partner_nightclub'], axis=1, inplace=True)

In [811]:
# Group values in ppwork into employee, self-employed, not-working 
couple_data.ppwork = couple_data.ppwork.map(lambda x: 'employee' if x.find('employee') >= 0 else \
                                           'self-employed' if x.find('self-employed') >= 0 else \
                                           'not-working')

In [812]:
# Drop relationship_quality as feature will carry most weight in the prediction
# We also will want more objective than subjective features
couple_data.drop(labels=['relationship_quality'], axis=1, inplace=True)
# remove q34, same as relationship quality
# remove ppmarit, already have married or not column
# remove ppeducat, already have years of education
# remove ppage, already have age difference
# remove children in household, not very informative as data is concentrated on 0 children
# remove ppethm, already have respondent / partner race
# remove number of adults in household, data too concentrated on smaller numbers
# remove ppgender, redundant as we only need to know whether couple is same sex or not
# remove pphouse, EDA shows type of house dont affect relationship outcome
# remove ppreg4 and ppreg9, columns only pertains to USA
# remove pprent, EDA shows type of rental dont affect relationship outcome
# remove ppwork, EDA shows status of employment dont affect relationship outcome
# remove papevangelical, EDA shows whether born again or evangelical dont affect relationship outcome
# remove ppnet, EDA shows whether respondent have his own internet access dont affect relationship outcome
# remove marriage count cause too low variance
# remove gender attraction, redundant column as already have same sex couple columns
# remove alt partner gender, already have same sex couple column
# Will be more useful if most of the pp columns came with partner counterparts
# Features will not be very informative if it only concerns
couple_data.drop(labels=['q34','ppmarit','ppeducat','ppage','children_in_hh','ppethm','pphispan', \
                        'pprace_white','pprace_black','pprace_someotherrace','ppt18ov', \
                        'pphouse','ppreg4','ppreg9','pprent','ppwork','ppnet','relatives_seen_per_month', \
                        'papglb_friend','pphhhead','hhinc','marriage_count','gender_attraction', \
                        'alt_partner_gender'], axis=1, inplace=True)

### Create new Features that compares Respondent to Partner Characteristics

In [813]:
# Include gender in higher income earner column to make it more informative
couple_income = couple_data.higher_income_earner.astype('object') + '_' + couple_data.ppgender.astype('object')
couple_data.higher_income_earner = couple_income.map(lambda x: 'male_earn_more' if x.find('earned more_male') >= 0 else \
                 'female_earn_more' if x.find('earned more_female') >= 0 else 'both_earn_same')

In [814]:
# Couple political views combination
# Make sure unique values for both are the same
couple_data.partner_politic_view = couple_data.partner_politic_view.map(lambda x: 'other' if x != 'democrat' and x != 'republican' else x)
couple_political_view_comb = couple_data.partner_politic_view.astype('object') + '_' + couple_data.pppartyid3.astype('object') 

In [815]:
couple_political_view_comb.value_counts()

democrat_democrat        621
republican_republican    324
other_democrat           267
other_republican         228
democrat_republican       92
republican_democrat       77
other_other               32
republican_other           3
democrat_other             3
dtype: int64

In [816]:
# Remove Duplicates
couple_data['couple_politic_view_comb'] = couple_political_view_comb.map(lambda x: 'democrat_other' if x.find('democrat') >= 0 and x.find('other') >= 0 \
                                else 'republican_other' if x.find('republican') >= 0 and x.find('other') >= 0 \
                                else 'democrat_republican' if x.find('democrat') >= 0 and x.find('republican') >= 0 \
                                else x)

In [817]:
# remove individual political view columns
couple_data.drop(labels=['partner_politic_view','pppartyid3'], axis=1, inplace=True)

In [818]:
# couple evangelical / born again or not, combinations
couple_data.partner_christ_type = couple_data.partner_christ_type.map(lambda x: 'evang or born again' if x == 'yes' else x)
couple_data.papevangelical = couple_data.papevangelical.map(lambda x: 'evang or born again' if x == 'yes' else x)

In [819]:
couple_evang_comb = couple_data.partner_christ_type.astype('object') + '_' + couple_data.papevangelical.astype('object')
couple_data['couple_evang_comb'] = couple_evang_comb.map(lambda x: 'no_evang or born again' if x.find('evang or born again') >= 0 and x.find('no') >= 0 else x)

In [820]:
# remove individual evang columns
couple_data.drop(labels=['partner_christ_type','papevangelical'], axis=1, inplace=True)

In [821]:
# couples are assumed to have not changed their religion
# group no religion, jewish and neiether christian nor jewish together as other
couple_data.respondent_relig_16_cat = couple_data.respondent_relig_16_cat.map(lambda x: 'other' if x != 'Protestant or oth Christian' and \
                                       x != 'Catholic' else x)
couple_data.partner_relig_16_cat = couple_data.partner_relig_16_cat.map(lambda x: 'other' if x != 'Protestant or oth Christian' and \
                                       x != 'Catholic' else x)

In [822]:
couple_relig_comb = couple_data.respondent_relig_16_cat.astype('object') + '_' + couple_data.partner_relig_16_cat.astype('object')
couple_data['couple_relig_comb'] = couple_relig_comb.map(lambda x: 'Protestant or oth Christian_Catholic' if x.find('Protestant or oth Christian') >= 0 and \
                     x.find('Catholic') >= 0 else 'Protestant or oth Christian_other' if x.find('Protestant or oth Christian') >= 0 and \
                     x.find('other') >= 0 else 'Catholic_other' if x.find('Catholic') >= 0 and x.find('other') >= 0 \
                      else x)

In [823]:
# remove individual religion columns
couple_data.drop(labels=['respondent_relig_16_cat','partner_relig_16_cat'], axis=1, inplace=True)

In [824]:
# it seems like age when married column does not tally with married or not column,
# since couple is not married, there should not be an age when they were married
couple_data[couple_data.married == 'not married'].loc[:,['age_when_married']].age_when_married.value_counts()

25.0    692
Name: age_when_married, dtype: int64

In [825]:
# dropping age when married column
couple_data.drop(labels=['age_when_married'], axis=1, inplace=True)

In [826]:
# trim leading and trailing white spaces
couple_data.respondent_race = couple_data.respondent_race.map(lambda x: x.strip())
couple_data.partner_race = couple_data.partner_race.map(lambda x: x.strip())
print(couple_data.respondent_race.value_counts())
print(couple_data.partner_race.value_counts())

NH white                 1283
Hispanic                  175
NH black                  124
NH Asian Pac Islander      37
NH Other                   14
NH Amer Indian             14
Name: respondent_race, dtype: int64
NH white                 1330
Hispanic                  132
NH black                  125
NH Asian Pac Islander      27
NH Other                   22
NH Amer Indian             11
Name: partner_race, dtype: int64


In [827]:
# To make race columns more useful, we will reduce it to just two unique values, NH white & others
couple_data.respondent_race = couple_data.respondent_race.map(lambda x: 'other' if x != 'NH white' else x)
couple_data.partner_race = couple_data.partner_race.map(lambda x: 'other' if x != 'NH white' else x)

In [828]:
# Get couple race combinations
couple_race_comb = couple_data.respondent_race.astype('object') + '_' + couple_data.partner_race.astype('object')
couple_data['couple_race_comb'] = couple_race_comb.map(lambda x: 'NH white_other' if x.find('NH white') >= 0 and x.find('other') >= 0 else x)

In [829]:
# Drop individual race columns
couple_data.drop(labels=['respondent_race','partner_race'], axis=1, inplace=True)

In [830]:
# drop ppgender, only pertains to individual
couple_data.drop(labels=['ppgender'], axis=1, inplace=True)

In [831]:
# Create new feature called partner to respondent years of education
couple_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1647 entries, 33536 to 4621371
Data columns (total 46 columns):
higher_income_earner            1647 non-null object
same_high_school                1647 non-null object
same_college_uni                1647 non-null object
grow_up_same_city_town          1647 non-null object
both_parents_knew_before_met    1647 non-null object
parent_alive                    1647 non-null object
met_partner_work                1647 non-null object
fam_intro_partner               1647 non-null object
friend_intro_partner            1647 non-null object
colleague_intro_partner         1647 non-null object
self_intro_partner              1647 non-null object
q24_met_online                  1647 non-null category
q24_school                      1647 non-null category
q24_college                     1647 non-null category
q24_church                      1647 non-null category
q24_vol_org                     1647 non-null category
q24_customer                    1

In [832]:
# pickle cleaned data
couple_data.to_pickle('./couple_data')