In [415]:
import numpy as np
import pandas as pd
# Load the raw HCMST 2017 survey responses from the Stata file.
raw_data = pd.read_stata(
    './HCMST_2017_public_data_v1.1_stata/HCMST_2017_draft_v1.1.dta'
)
# Load the cleaned schema; the first CSV column becomes the index.
data_schema = pd.read_csv(
    './cleaned_HCMST_2017_schema.csv',
    header=0,
    index_col=0,
)
# The variable labels double as a glossary of survey terms, looked up by schema index.
data_glossary = data_schema.loc[:, 'variable_label']

In [416]:
# Sanity check: print the glossary entry for one variable, then show the raw table's shape.
print(data_glossary['ppincimp'])
raw_data.shape

Household Income 


(3510, 285)

In [417]:
# Inspect the raw values stored in the 'w6_q12' column.
raw_data.loc[:, 'w6_q12']

0                  Leans Republican
1                  Leans Republican
2                    Leans Democrat
3       Undecided/Independent/Other
4                   Strong Democrat
5       Undecided/Independent/Other
6                  Leans Republican
7                    Leans Democrat
8                               NaN
9                    Leans Democrat
10                   Leans Democrat
11      Undecided/Independent/Other
12                  Strong Democrat
13                   Leans Democrat
14              Not Strong Democrat
15                  Strong Democrat
16      Undecided/Independent/Other
17                  Strong Democrat
18                 Leans Republican
19                  Strong Democrat
20      Undecided/Independent/Other
21                  Strong Democrat
22              Not Strong Democrat
23                   Leans Democrat
24      Undecided/Independent/Other
25      Undecided/Independent/Other
26                 Leans Republican
27                 Leans Rep

## Analysis Goals

The following are the questions we are trying to answer with the data set:

1) What are the top five places where couples met that led to the longest-lasting marriages?

2) Does the difference in income between partners affect the length of marriage?

3) Does religiosity affect the length of marriage?

4) Do political leanings affect the length of marriage?

5) Does ethnicity affect the length of marriage?

6) Does the age at which couples met affect the length of marriage?

**During Data Cleaning, we should only collect features which help us answer these questions**


In [418]:
# Show every row and allow up to 1000 characters per cell so the full label text is visible.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)
# Move the schema index into a regular 'variable_name' column so that rows can be
# addressed by position. The guard keeps the cell idempotent: running reset_index()
# again would add a spurious 'index'/'level_0' column on each re-run.
if 'variable_name' not in data_schema.columns:
    data_schema = data_schema.reset_index()
data_schema

Unnamed: 0,variable_name,storage_type,display_format,value_label,variable_label
0,CaseID,int,%8.0g,,Case ID
1,CASEID_NEW,long,%12.0g,,Longitudnal CaseID
2,qflag,byte,%8.0g,QFLAG,DOV: Qualification Flag
3,weight1,double,%12.0g,,Post-Stratification weight for Genpop (n=2994)
4,weight1_freqwt,float,%9.0g,,"wgt to CPS adult pop, scaled down by 2994/3110=round(weight1*2436295 > 95/3110)"
5,weight2,double,%12.0g,,Post-Stratification weight for LGB (n=551)
6,weight1a,double,%12.0g,,Post-Stratification weight for total consented Genpop respondents (n=3110)
7,weight1a_freqwt,float,%9.0g,,"weighted up to CPS adult population, =round(weight1a*243629595/3110)"
8,weight_combo,float,%9.0g,,"weight that combines all LGB subjects weighted down, with gen pop"
9,weight_combo_freqwt,float,%9.0g,,frequency weight version of weight_combo


In [419]:
# Raw survey column -> readable column name, in the order the columns should appear.
column_renames = {
    'S1': 'Married',
    'Q21A_Year': 'FirstMet_Year',
    'Q21D_2_Year': 'Married_Year',
    'ppp20072': 'Religious_Attendance',
    'interracial_5cat': 'Interracial_Couple',
    'ppage': 'Age',
    'w6_q9': 'P_Age',
    'ppeducat': 'Education',
    'w6_q10': 'P_Education',
    'ppgender': 'Gender',
    'ppincimp': 'Household_Income',
    'ppethm': 'Ethnicity',
    'w6_q6b': 'P_Ethnicity',
    'partyid7': 'Politics',
    'w6_q12': 'P_Politics',
}
# Parallel lists: raw column names to keep, and the new label for each.
cat_to_keep = list(column_renames.keys())
cat_new_labels = list(column_renames.values())

In [420]:
# Add the schema rows at positions 152-166: keep each variable and use its
# variable label as the new column name.
extra_rows = list(range(152, 167))
cat_to_keep.extend(data_schema['variable_name'].iloc[extra_rows].tolist())
cat_new_labels.extend(data_schema['variable_label'].iloc[extra_rows].tolist())

In [421]:
# Add the schema rows at positions 251-271. Each new label is the part of the
# variable name after the first underscore found at or beyond the name's midpoint
# (the whole name is used when no such underscore exists).
for row in range(251, 272):
    var_name = data_schema['variable_name'].iloc[row]
    midpoint = len(var_name) // 2
    cat_to_keep.append(var_name)
    cat_new_labels.append(var_name[var_name.find('_', midpoint) + 1:])

In [422]:
# Remove the 'Q14: ' question prefix from any label that carries it.
# Slice assignment updates the existing list object in place.
cat_new_labels[:] = [
    label.replace('Q14: ', '') if 'Q14: ' in label else label
    for label in cat_new_labels
]

In [423]:
# Build the working table: an independent copy of the selected raw columns,
# relabelled with the readable names collected above.
data_set = raw_data.loc[:, cat_to_keep].copy()
data_set.columns = cat_new_labels

In [424]:
# Preview the first rows only: display.max_rows is set to None earlier in the
# notebook, so a bare `data_set` would render every row of the table.
data_set.head(10)

Unnamed: 0,Married,FirstMet_Year,Married_Year,Religious_Attendance,Interracial_Couple,Age,P_Age,Education,P_Education,Gender,...,game,chat,org,public,blind_date,vacation,serve_nonint,trip,neighbors,met_online
0,"No, I am not Married",,,Never,no,30,26.0,Some college,HS graduate or GED,Male,...,no,no,no,no,no,no,no,no,no,yes
1,"Yes, I am Married",1983,,Never,no,55,52.0,Bachelor's degree or higher,Masters degree,Female,...,no,no,no,no,no,no,no,no,no,no
2,"Yes, I am Married",2006,,Once or twice a month,no,47,45.0,Bachelor's degree or higher,Associate degree,Male,...,no,no,no,no,no,no,no,no,no,yes
3,"No, I am not Married",,,Never,no,28,26.0,Less than high school,HS graduate or GED,Female,...,yes,no,no,no,no,no,no,no,no,yes
4,"Yes, I am Married",1983,,Once a year or less,no,59,59.0,Bachelor's degree or higher,Bachelors degree,Female,...,no,no,no,no,yes,no,no,no,no,no
5,"Yes, I am Married",1981,,Once a week,no,59,60.0,High school,Bachelors degree,Male,...,no,no,no,no,no,no,yes,no,no,no
6,"Yes, I am Married",1966,,Once a week,no,66,67.0,High school,"Some college, no degree",Female,...,no,no,no,no,yes,no,no,no,no,no
7,"Yes, I am Married",1967,,Never,no,65,65.0,Some college,Professional or Doctorate degree,Female,...,no,no,no,yes,no,no,no,no,no,no
8,"No, I am not Married",,,,,27,,Bachelor's degree or higher,,Male,...,,,,,,,,,,
9,"No, I am not Married",,,More than once a week,no,65,72.0,High school,12th grade no diploma,Female,...,no,no,no,no,no,no,no,no,no,no


In [425]:
# Summarise each column's dtype and non-null count to see where values are missing.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3510 entries, 0 to 3509
Data columns (total 51 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Married                            3510 non-null   category
 1   FirstMet_Year                      2856 non-null   category
 2   Married_Year                       132 non-null    category
 3   Religious_Attendance               3394 non-null   category
 4   Interracial_Couple                 3365 non-null   category
 5   Age                                3510 non-null   category
 6   P_Age                              3374 non-null   float64 
 7   Education                          3510 non-null   category
 8   P_Education                        3394 non-null   category
 9   Gender                             3510 non-null   category
 10  Household_Income                   3510 non-null   category
 11  Ethnicity                          3510 non

In [426]:
# Preview the first rows only: display.max_rows is set to None earlier in the
# notebook, so a bare `data_set` would render every row of the table.
data_set.head(10)

Unnamed: 0,Married,FirstMet_Year,Married_Year,Religious_Attendance,Interracial_Couple,Age,P_Age,Education,P_Education,Gender,...,game,chat,org,public,blind_date,vacation,serve_nonint,trip,neighbors,met_online
0,"No, I am not Married",,,Never,no,30,26.0,Some college,HS graduate or GED,Male,...,no,no,no,no,no,no,no,no,no,yes
1,"Yes, I am Married",1983,,Never,no,55,52.0,Bachelor's degree or higher,Masters degree,Female,...,no,no,no,no,no,no,no,no,no,no
2,"Yes, I am Married",2006,,Once or twice a month,no,47,45.0,Bachelor's degree or higher,Associate degree,Male,...,no,no,no,no,no,no,no,no,no,yes
3,"No, I am not Married",,,Never,no,28,26.0,Less than high school,HS graduate or GED,Female,...,yes,no,no,no,no,no,no,no,no,yes
4,"Yes, I am Married",1983,,Once a year or less,no,59,59.0,Bachelor's degree or higher,Bachelors degree,Female,...,no,no,no,no,yes,no,no,no,no,no
5,"Yes, I am Married",1981,,Once a week,no,59,60.0,High school,Bachelors degree,Male,...,no,no,no,no,no,no,yes,no,no,no
6,"Yes, I am Married",1966,,Once a week,no,66,67.0,High school,"Some college, no degree",Female,...,no,no,no,no,yes,no,no,no,no,no
7,"Yes, I am Married",1967,,Never,no,65,65.0,Some college,Professional or Doctorate degree,Female,...,no,no,no,yes,no,no,no,no,no,no
8,"No, I am not Married",,,,,27,,Bachelor's degree or higher,,Male,...,,,,,,,,,,
9,"No, I am not Married",,,More than once a week,no,65,72.0,High school,12th grade no diploma,Female,...,no,no,no,no,no,no,no,no,no,no


In [427]:
# Show both year columns for the rows that have a 'Married_Year' value.
has_married_year = data_set['Married_Year'].notna()
data_set.loc[has_married_year, ['FirstMet_Year', 'Married_Year']]

Unnamed: 0,FirstMet_Year,Married_Year
22,,2015
31,,1989
62,,1972
86,,1965
122,,1992
148,,1968
153,,1981
178,,1988
188,,1992
254,,1991


In [428]:
# Combine the two year columns into 'First_Together': take 'FirstMet_Year' where it
# is present and fall back to 'Married_Year' where it is missing. In the rows
# inspected above, entries that have a 'Married_Year' have no 'FirstMet_Year'.
# NOTE(review): some married respondents do report 'FirstMet_Year', so confirm in the
# survey codebook which respondents are actually asked for 'Married_Year'.
#
# This is done as one vectorised step instead of a row loop that wrote through
# data_set[col].iloc[i] = ... (chained assignment, which is not guaranteed to reach
# the frame). Both inputs are cast to object first so the fallback cannot fail on a
# year that is absent from the other column's categories; the combined column is
# therefore object-typed until it is converted to numbers later on.
first_met_year = data_set['FirstMet_Year'].astype(object)
married_year = data_set['Married_Year'].astype(object)
data_set.insert(
    loc=1,
    column='First_Together',
    value=first_met_year.where(first_met_year.notna(), married_year),
)

In [429]:
# Verify that 'First_Together' picked up the 'Married_Year' values:
# the two columns should match on every row that has a 'Married_Year'.
married_year_present = data_set['Married_Year'].notna()
data_set.loc[married_year_present, ['First_Together', 'Married_Year']]

Unnamed: 0,First_Together,Married_Year
22,2015,2015
31,1989,1989
62,1972,1972
86,1965,1965
122,1992,1992
148,1968,1968
153,1981,1981
178,1988,1988
188,1992,1992
254,1991,1991


In [430]:
# The two source columns are no longer needed once 'First_Together' combines them.
data_set = data_set.drop(columns=['FirstMet_Year', 'Married_Year'])
# Treat "Refused" answers as missing. The result is assigned back to the column
# because an in-place replace on data_set['First_Together'] operates on an
# intermediate object and is not guaranteed to update the frame.
data_set['First_Together'] = data_set['First_Together'].replace({'Refused': np.nan})
# Keep only the rows where 'First_Together' is known. The .copy() makes the result an
# independent frame so later column assignments do not raise SettingWithCopyWarning.
data_set = data_set[data_set['First_Together'].notna()].copy()

In [431]:
# List the distinct partner ages to spot missing values and sentinel codes.
data_set.loc[:, 'P_Age'].unique()

array([52., 45., 59., 60., 67., 65., 43., 35., 42., 37., 40., 72., 25.,
       55., 22., 26., 63., 75., 32., 24., 62., 53., 58., 28., 48., 34.,
       64., 23., 54., 36., 70., 29., 41., 74., 56., 21., 38., 57., 30.,
       71., 73., 49., 50., 68., 61., 46., 69., 27., 80., 47., 39., 76.,
       77., 66., 51., 83., 44., 95., -1., nan, 31., 85., 78., 20., 33.,
       79., 81., 87., 86., 84., 18., 82., 89., 19., 90., 17., 91., 10.,
       14., 16., 94.])

In [432]:
# Convert the year and age columns to numeric dtypes. assign() builds a new frame
# rather than writing into data_set, so this does not raise SettingWithCopyWarning
# even when data_set is itself a filtered selection of another frame.
numeric_columns = ['First_Together', 'Age', 'P_Age']
data_set = data_set.assign(
    **{column: pd.to_numeric(data_set[column]) for column in numeric_columns}
)
# Keep only rows with a positive partner age. This removes missing values (NaN > 0
# is False) and non-positive codes such as the -1 seen in the values listed above;
# it does not remove +inf. The .copy() keeps the result independent of the frame
# it was filtered from.
data_set = data_set[data_set['P_Age'] > 0].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [433]:
# Re-check dtypes and non-null counts after the numeric conversion and row filter.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2928 entries, 1 to 3509
Data columns (total 50 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Married                            2928 non-null   category
 1   First_Together                     2928 non-null   int64   
 2   Religious_Attendance               2928 non-null   category
 3   Interracial_Couple                 2906 non-null   category
 4   Age                                2928 non-null   int64   
 5   P_Age                              2928 non-null   float64 
 6   Education                          2928 non-null   category
 7   P_Education                        2928 non-null   category
 8   Gender                             2928 non-null   category
 9   Household_Income                   2928 non-null   category
 10  Ethnicity                          2928 non-null   category
 11  P_Ethnicity                        2928 non

In [434]:
# Store partner age as a 64-bit integer, matching 'Age'. The earlier P_Age > 0 filter
# removed the missing values, which an integer column cannot hold. astype() returns a
# new frame, so nothing is written into a possibly-filtered selection.
data_set = data_set.astype({'P_Age': 'int64'})

In [435]:
# Confirm that 'P_Age' is now stored as int64.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2928 entries, 1 to 3509
Data columns (total 50 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   Married                            2928 non-null   category
 1   First_Together                     2928 non-null   int64   
 2   Religious_Attendance               2928 non-null   category
 3   Interracial_Couple                 2906 non-null   category
 4   Age                                2928 non-null   int64   
 5   P_Age                              2928 non-null   int64   
 6   Education                          2928 non-null   category
 7   P_Education                        2928 non-null   category
 8   Gender                             2928 non-null   category
 9   Household_Income                   2928 non-null   category
 10  Ethnicity                          2928 non-null   category
 11  P_Ethnicity                        2928 non

In [436]:
# Years together = survey year (2017) minus the year stored in 'First_Together'.
# The new column is placed second in the table.
years_together = 2017 - data_set['First_Together']
data_set.insert(loc=1, column='Years_Together', value=years_together)

In [437]:
# Absolute age gap between respondent and partner, inserted at column position 5.
age_gap = (data_set['Age'] - data_set['P_Age']).abs()
data_set.insert(loc=5, column='Age_Diff', value=age_gap.tolist())

In [438]:
#export cleaned dataset as clean_data.csv (the row index is not written to the file)
data_set.to_csv('clean_data.csv', index=False)