In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from sklearn import preprocessing as pp
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [330]:
# Features are mainly demographic data from onset of survey,
# excluding 1 relationship centric feature - 'relationship quality'

# Goal is to only use data gathered at the start of the survey, Wave 1,
# to try to predict the outcome of partnered respondents, based on respondent and partner demographic data
# This allows us to investigate and discover what type of individuals 
# have better chance of surviving a 6 year relationship

In [331]:
df_w5 = pd.read_stata('/Users/david.yan/Downloads/HCMST_wave_5_supplement_ver_1.dta')
df_w4 = pd.read_stata('/Users/david.yan/Downloads/wave_4_supplement_v1_2.dta')
df_w123 = pd.read_stata('/Users/david.yan/Downloads/HCMST_ver_3.04.dta')

In [332]:
w5_cols = df_w5.columns.tolist()
w4_cols = df_w4.columns.tolist()
w123_cols = df_w123.columns.tolist()

In [333]:
print(len(w5_cols), len(w4_cols), len(w123_cols))

78 62 387


In [334]:
print(df_w123.shape, df_w4.shape, df_w5.shape)

(4002, 387) (4002, 62) (4002, 78)


In [335]:
wave_1_couples = df_w123.loc[(df_w123['qflag'] == 'partnered') & (df_w123['partner_deceased'] == 'not deceased'), 'caseid_new':]

In [336]:
wave_1_couples.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2981 entries, 0 to 4001
Columns: 387 entries, caseid_new to w3_nonmbtiming_month
dtypes: category(325), float32(16), float64(39), int32(4), int8(2), object(1)
memory usage: 2.1+ MB


In [337]:
# Must make sure all couples in dataset are partnered right from the start
cp_index = np.concatenate((w4_breakup.index,w5_breakup.index,survived_couples.index))
wave_1_couples.loc[cp_index].head()

Unnamed: 0,caseid_new,weight1,weight2,ppage,ppagecat,ppagect4,ppeduc,ppeducat,ppethm,ppgender,...,w3_mbtiming_year,w3_mbtiming_month,w3_q5,w3_q6,w3_q7,w3_q8,w3_q9,w3_q10,w3_nonmbtiming_year,w3_nonmbtiming_month
22,44486,3581,3581.0,48,45-54,45-59,"some college, no degree",some college,"black, non-hispanic",female,...,,,,,,,,,,
73,97195,43431,43431.0,19,18-24,18-29,"some college, no degree",some college,"white, non-hispanic",male,...,,,yes,no,"no, did not marry [xNameP]",,,,,
97,121515,39909,39909.0,51,45-54,45-59,bachelors degree,bachelor's degree or higher,"white, non-hispanic",female,...,,,yes,no,"no, did not marry [xNameP]",,,,,
119,144545,109536,109536.0,54,45-54,45-59,professional or doctorate degree,bachelor's degree or higher,"other, non-hispanic",male,...,,,yes,no,"no, did not marry [xNameP]",,,,,
122,147245,69278,69278.0,24,18-24,18-29,high school graduate - high school diploma or ...,high school,"white, non-hispanic",male,...,,,yes,yes,"no, did not marry [xNameP]",,,,,


In [338]:
wave_2_couples_broke_up = wave_1_couples.loc[wave_1_couples['w2_broke_up']=='broke up']
wave_3_couples_broke_up = wave_1_couples.loc[wave_1_couples['w3_broke_up']=='broke up']
w4_breakup = df_w4.loc[df_w4['w4_broke_up']=='broke up']
w5_breakup = df_w5.loc[df_w5['w5_broke_up']=='broke up']
survived_couples=df_w5.loc[df_w5['w5_broke_up']=='still together']

In [339]:
w2_breakup_list=wave_2_couples_broke_up['caseid_new'].tolist()
w3_breakup_list=wave_3_couples_broke_up['caseid_new'].tolist()
w4_breakup_list=w4_breakup['caseid_new'].tolist()
w5_breakup_list=w5_breakup['caseid_new'].tolist()
survived_couples_list=survived_couples['caseid_new'].tolist()

In [340]:
print(len(w5_breakup_list)+len(w4_breakup_list)+len(w3_breakup_list)+len(w2_breakup_list))
print(len(survived_couples_list))

503
1066


In [341]:
breakup_list = np.concatenate((w5_breakup_list,w4_breakup_list,w3_breakup_list,w2_breakup_list), axis=0)
together_list = survived_couples_list

In [342]:
features = wave_1_couples.loc[:,:'coresident'].columns
couple_data = wave_1_couples[features]

In [343]:
couple_data.set_index(keys=['caseid_new'], inplace=True)

In [344]:
# dropping obviously irrelevant columns
cols_to_drop = couple_data.loc[w2_breakup_list,'pphhcomp11_member2_age':'weight_couples_coresident'].columns

In [345]:
couple_data.drop(labels=cols_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [346]:
couple_breakup = couple_data.loc[breakup_list,:]
couple_together = couple_data.loc[together_list,:]

In [347]:
print(couple_breakup.shape)
print(couple_together.shape)

(503, 231)
(1066, 231)


In [348]:
couple_breakup['relationship_outcome_6yrs'] = [1 for x in range(couple_breakup.shape[0])]
couple_together['relationship_outcome_6yrs'] = [0 for x in range(couple_together.shape[0])]

In [349]:
print(couple_breakup.relationship_outcome_6yrs.unique())
print(couple_together.relationship_outcome_6yrs.unique())

[1]
[0]


In [350]:
couple_data = pd.concat([couple_breakup, couple_together], axis=0)

In [351]:
# drop more unnecessary columns that will not be useful features
couple_data.drop(labels=['weight1','weight2'], axis=1, inplace=True)

In [352]:
# drop age category columns, already have age continuous column
couple_data.drop(labels=['ppagecat','ppagect4'], axis=1, inplace=True)

In [353]:
# use education category instead of education
couple_data.drop(labels=['ppeduc'], axis=1, inplace=True)

In [354]:
# use continuous household income instead of categorical household income
couple_data.drop(labels=['ppincimp'], axis=1, inplace=True)

In [355]:
# Drop MSA metropolitan statistical area, irrelevancy
couple_data.drop(labels=['ppmsacat'], axis=1, inplace=True)

In [356]:
# drop ppt01, ppt1317, ppt25, ppt612, features aggregated in children_in_hh
couple_data.drop(labels=['ppt01','ppt1317','ppt25','ppt612'], axis=1, inplace=True)

In [357]:
# drop ppq14arace, redundant column, there are individual race columns
couple_data.drop(labels=['ppq14arace'], axis=1, inplace=True)

In [358]:
# drop ppppcmdate_yrmo, pppadate_yrmo, date of survey is irrelevant
couple_data.drop(labels=['ppppcmdate_yrmo','pppadate_yrmo'], axis=1, inplace=True)

In [359]:
# drop HCMST_main_interview_yrmo, date of interview is irrelevant
# drop interview duration, qflag - all partnered
couple_data.drop(labels=['HCMST_main_interview_yrmo','duration','qflag'], axis=1, inplace=True)

In [360]:
# drop papglb_status, same as glbstatus
# drop recsource, source of recruitment is irrelevant
# drop s1a,s2,q3_codes,q5,q15a1_compressed,q17c,q17d redundant or too many NAs, 
couple_data.drop(labels=['glbstatus','papglb_status','recsource','s1','s1a','s2','q3_codes','q5','q15a1_compressed','q17c','q17d'], axis=1, inplace=True)

In [361]:
# drop q18a_1, q18a_2, q18a_3, q18b_codes, q18c_codes, low variance and too many NAs
couple_data.drop(labels=['q18a_1','q18a_2','q18a_3','q18b_codes','q18c_codes','q18a_refused'], axis=1, inplace=True)

In [362]:
# drop q20, q21a_refusal, q21b_refusal,q21c_refusal,q21d_refusal,q21e,q21e_refusal,q24_codes
# 'q31_9','q31_other_text_entered','q33_7','q33_other_text_entered',not usable column
# q35_codes, q35_text_entered, summary_q24_total
couple_data.drop(labels=['q20','q21a_refusal','q21b_refusal','q21c_refusal','q21d_refusal',
                        'q21e','q21e_refusal','q24_codes','q31_9','q31_other_text_entered',
                        'q33_7','q33_other_text_entered','q35_codes','q35_text_entered',
                        'summary_q24_total'], axis=1, inplace=True)

In [363]:
# drop marrynotreally, marrycountry, civilnotreally, partner_deceased
# partner_religion_reclassified, partner_religion_child_reclass, own_religion_child_reclass
# q32_internet, how_met_online, potential_partner_gender_recodes, how_long_ago_first_met_cat
# duplicated representation of previous columns, too many NA
couple_data.drop(labels=['marrynotreally','marrycountry','civilnotreally','partner_deceased',
                        'partner_religion_reclassified','partner_religion_child_reclass',
                        'own_religion_child_reclass','q32_internet','how_met_online',
                        'either_internet','either_internet_adjusted','potential_partner_gender_recodes',
                        'how_long_ago_first_met_cat'], axis=1, inplace=True)

In [364]:
# drop q24_R_friend, q24_P_friend, q24_R_family, q24_P_family, q24_R_neighbor, q24_P_neighbor
# q24_R_cowork, q24_P_cowork,papreligion,q13b,respondent_religion_at_16,partner_religion_at_16
# q7b, q8b,q30
# columns are aggregated into other columns
couple_data.drop(labels=['q24_R_friend','q24_P_friend','q24_R_family','q24_P_family','q24_R_neighbor','q24_P_neighbor',
                        'q24_R_cowork','q24_P_cowork','papreligion','q13b','respondent_religion_at_16',
                        'partner_religion_at_16','q7b','q8b','q30'], axis=1, inplace=True)

In [365]:
# drop home_country_recode, too many NAs
couple_data.drop(labels=['home_country_recode'], axis=1, inplace=True)

In [368]:
couple_data.shape

(1569, 154)

In [426]:
# Iterate through each series in the dataframe, impute nulls with highest mode, binary values
import random
sr = random.SystemRandom()

def impute_bin_cols(cols):
    
    for col in cols:
        # get number of keys
        greater_key = (list(couple_data[col].value_counts().sort_values(ascending=False).to_dict().keys())[0])
        greater_counts = couple_data[col].value_counts().to_dict()[greater_key]

        if len(couple_data[col].value_counts().sort_values(ascending=False).to_dict().keys()) > 1:
            lesser_key = (list(couple_data[col].value_counts().sort_values(ascending=False).to_dict().keys())[1])
            lesser_counts = couple_data[col].value_counts().to_dict()[lesser_key]

            if greater_counts > lesser_counts:
                couple_data[col].fillna(greater_key, inplace=True)
            else:
                couple_data[col].fillna(sr.choice([greater_key,lesser_key]), inplace=True)
        else:
            couple_data[col].fillna(greater_key, inplace=True)
    
    return couple_data[cols].isnull().sum()

In [515]:
couple_data.isnull().sum().sort_values(ascending=False)

relationship_outcome_6yrs           0
q17b                                0
q23                                 0
q21d                                0
q21c                                0
q21b                                0
q21a                                0
q19                                 0
gender_attraction                   0
q16                                 0
coresident                          0
q14                                 0
q13a                                0
q12                                 0
q11                                 0
q10                                 0
q9                                  0
q8a                                 0
q25                                 0
q26                                 0
q27                                 0
q28                                 0
q33_5                               0
q33_4                               0
q33_3                               0
q33_2                               0
q33_1       

In [370]:
# investigate null features
# drop features with more than 70% null
couple_data.drop(labels=['q22'], axis=1, inplace=True)

In [371]:
# drop q17a, use q17b instead, current marriage already taken into account in 'married' column
couple_data.drop(labels=['q17a'], axis=1, inplace=True)

In [372]:
# fill null and 'refused' values
couple_data.q17b.fillna('never married', inplace=True)
couple_data.q17b = couple_data.q17b.map(lambda x: 'never married' if x == 'refused' else x)

couple_data.q26 = couple_data.q26.map(lambda x: 'did not attend same college or university' if x == 'refused' else x)

couple_data.parental_approval.fillna("don't approve or don't know", inplace=True)

In [375]:
# fill null values for continuous variable
couple_data.q21d.fillna(couple_data.q21d.median(), inplace=True)
couple_data.q21c.fillna(couple_data.q21c.median(), inplace=True)
couple_data.how_long_ago_first_cohab.fillna(couple_data.how_long_ago_first_cohab.median(), inplace=True)

In [389]:
# q24 columns 
cols = [x for x in couple_data.columns if 'q24_' in x]
impute_bin_cols(cols)

In [514]:
# more binary columns
cols = ['met_through_as_coworkers','met_through_friends','met_through_family',
        'papevangelical','met_through_as_neighbors','US_raised','coresident']
impute_bin_cols(cols)

met_through_as_coworkers    0
met_through_friends         0
met_through_family          0
papevangelical              0
met_through_as_neighbors    0
US_raised                   0
coresident                  0
dtype: int64

In [431]:
# partner mum years of education
couple_data.partner_mom_yrsed.fillna(couple_data.partner_mom_yrsed.median(), inplace=True)

In [435]:
# distancemoved_10mi
couple_data.distancemoved_10mi.fillna(couple_data.distancemoved_10mi.median(), inplace=True)

In [441]:
# how_long_ago_first_romantic
couple_data.how_long_ago_first_romantic.fillna(couple_data.how_long_ago_first_romantic.median(), inplace=True)

In [444]:
# how_long_relationship
couple_data.how_long_relationship.fillna(couple_data.how_long_relationship.median(), inplace=True)

In [447]:
# q21b how old were you when your romantic relationship began with your partner
couple_data.q21b.fillna(couple_data.q21b.median(), inplace=True)

In [450]:
# respondent_mom_yrsed
couple_data.respondent_mom_yrsed.fillna(couple_data.respondent_mom_yrsed.median(), inplace=True)

In [455]:
# fill null and convert relationship quality to integers
couple_data.relationship_quality.fillna('fair', inplace=True)
couple_data.relationship_quality = couple_data.relationship_quality.map(lambda x: 5 if x == 'excellent' else 4
                                if x == 'good' else 3 if x == 'fair' else 2 if x == 'poor' else 1)

In [461]:
# Filling null values for gender attraction
couple_data[['gender_attraction','same_sex_couple']].loc[couple_data.gender_attraction.isnull()]

Unnamed: 0_level_0,gender_attraction,same_sex_couple
caseid_new,Unnamed: 1_level_1,Unnamed: 2_level_1
4528265,,different sex couple
799451,,different sex couple
1601511,,different sex couple
1674621,,different sex couple
2780461,,different sex couple
4346371,,different sex couple
4489454,,different sex couple


In [478]:
couple_data.gender_attraction.fillna('opposite gender only', inplace=True)

In [482]:
# q21a how old were you when you first met
couple_data.q21a.fillna(couple_data.q21a.median(), inplace=True)

In [479]:
# how_long_ago_first_met
couple_data.how_long_ago_first_met.fillna(couple_data.how_long_ago_first_met.median(), inplace=True)

In [476]:
# q16 how many of your relatives do you see in person at least once a month
couple_data.q16.fillna(np.median([int(i) for i in couple_data.q16.unique() if np.isnan(i) == False]), inplace=True)
couple_data.q16 = couple_data.q16.astype('float64')

In [485]:
# age difference
couple_data.age_difference.fillna(couple_data.age_difference.median(), inplace=True)

In [488]:
# q9 how old is your partner
couple_data.q9.fillna(couple_data.q9.median(), inplace=True)

In [512]:
# respondent race 
couple_data.respondent_race.fillna('NH white', inplace=True)

In [505]:
# partner religion at 16 years old
couple_data.partner_relig_16_cat.fillna('No religion', inplace=True)

In [502]:
# respondent religion at 16 years old
couple_data.respondent_relig_16_cat.fillna('No religion', inplace=True)

In [492]:
# papglb_friend
# do you have any friends or relatives who you know to be gay, lesbian, or bis 
couple_data.papglb_friend.fillna('i would prefer to not answer this question', inplace=True)

In [498]:
# partner years of education
couple_data.partner_yrsed.fillna(couple_data.partner_yrsed.median(), inplace=True)

In [495]:
# partner race
couple_data.partner_race.fillna('NH white', inplace=True)