In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split as tts


#load the cleaned data from 'clean_data.csv' (path is relative to the working directory);
#header=0 means the first row of the file supplies the column names
data_set = pd.read_csv('clean_data.csv', header=0)

# #import data schema
# data_schema = pd.read_csv('./cleaned_HCMST_2017_schema.csv', header=0, index_col=0)
# data_set

In [57]:
#define functions to convert the data set into the format used for EDA
def create_met_places(df):
    """Add a 'places_met' column naming where each couple met.

    Every column from 'R_cowork' to the end of the frame is treated as a
    'yes'/'no' indicator (an already-encoded 1 is accepted as 'yes').
    'places_met' is the first indicator column, in column order, that is
    set for the row. Labels starting with 'R_', plus 'customer', are then
    collapsed to 'R_soc_circle'; labels starting with 'P_' are collapsed to
    'P_soc_circle'; every other label is kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'R_cowork' column followed by the other
        meeting-place indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'places_met' appended as the last column (or
        overwritten in place if a column of that name already exists).
        The input frame is not modified.

    Raises
    ------
    ValueError
        If 'R_cowork' is not one of the columns.
    """
    columns = list(df.columns)
    # a pre-existing 'places_met' column is not an indicator, so leave it out
    place_cols = [c for c in columns[columns.index('R_cowork'):] if c != 'places_met']

    # idxmax on a boolean frame returns the first True column of each row
    # NOTE(review): a row with no 'yes' at all falls back to the first indicator
    # column ('R_cowork') and therefore ends up as 'R_soc_circle' - confirm this
    # is acceptable for rows without any recorded meeting place
    places_met = df[place_cols].isin(['yes', 1]).idxmax(axis=1)

    # build one label -> group mapping instead of walking the respondent and
    # partner label lists in lockstep (they need not have the same length)
    collapse = {'customer': 'R_soc_circle'}
    for label in places_met.unique():
        if label.startswith('R_'):
            collapse[label] = 'R_soc_circle'
        elif label.startswith('P_'):
            collapse[label] = 'P_soc_circle'

    return df.assign(places_met=places_met.replace(collapse))

def fix_household_inc(df):
    """Encode 'Household_Income' brackets as ordinal ranks and add their square.

    Brackets of the form '$X to $Y' are ranked 1, 2, ... by their lower bound
    X among the brackets present in the data. 'Less than $5,000' is coded 0
    and '$250,000 or more' is coded one above the highest '$X to $Y' rank.
    A 'Household_Income2' column holding the squared rank is inserted at
    column position 6.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Household_Income' column of bracket labels and at
        least six columns in total (required by the fixed insert position).

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Household_Income' replaced by its rank and
        'Household_Income2' added. The input frame is not modified.

    Raises
    ------
    ValueError
        If a label cannot be converted to a number (i.e. it is none of the
        recognised bracket forms), or if 'Household_Income2' already exists.
    """
    income = df['Household_Income']

    # lower bound of every '$X to $Y' bracket, e.g. '$5,000 to $7,499' -> 5000
    lower_bounds = {}
    for label in income.unique():
        if isinstance(label, str) and ' to ' in label:
            lower = label.split(' to ', 1)[0]
            lower_bounds[label] = int(lower.replace('$', '').replace(',', ''))

    sorted_bounds = sorted(lower_bounds.values())
    inc_dict = {label: sorted_bounds.index(bound) + 1 for label, bound in lower_bounds.items()}
    inc_dict['Less than $5,000'] = 0
    # derive the top code from the middle brackets only, so it never collides
    # with the highest '$X to $Y' rank when one of the end labels is absent
    inc_dict['$250,000 or more'] = len(sorted_bounds) + 1

    # unknown labels are passed through so that to_numeric reports them loudly
    encoded = pd.to_numeric(income.map(lambda label: inc_dict.get(label, label)))

    df = df.copy()
    df['Household_Income'] = encoded
    #create feature 'Household_Income2'
    df.insert(loc=6, column='Household_Income2', value=encoded ** 2)
    return df

def fix_rel_attendance(df):
    """Encode 'Religious_Attendance' as an ordinal code from 0 to 5.

    0 is 'More than once a week' and 5 is 'Never'. A 'Refused' answer is
    treated as 'A few times a year' and therefore receives the same code.
    Returns a new frame; other columns are left untouched.
    """
    attendance_codes = {
        'More than once a week': 0,
        'Once a week': 1,
        'Once or twice a month': 2,
        'A few times a year': 3,
        'Once a year or less': 4,
        'Never': 5,
    }
    #recategorize "Refused": it shares the code of 'A few times a year'
    attendance_codes['Refused'] = attendance_codes['A few times a year']
    return df.replace({'Religious_Attendance': attendance_codes})


def fix_politics(df):
    """Encode 'Politics' and 'P_Politics' on a 0-6 scale and add 'Pol_Diff'.

    0 is 'Strong Republican' and 6 is 'Strong Democrat'. A 'Refused' answer
    in 'P_Politics' is coded 3, the same code as
    'Undecided/Independent/Other'; 'Refused' is not remapped in 'Politics'.
    'Pol_Diff', the absolute difference between the two codes, is inserted
    at column position 8. Returns a new frame.
    """
    party_scale = {
        'Strong Republican': 0,
        'Leans Republican': 1,
        'Not Strong Republican': 2,
        'Undecided/Independent/Other': 3,
        'Not Strong Democrat': 4,
        'Leans Democrat': 5,
        'Strong Democrat': 6,
    }
    #recategorize "Refused" for the partner column only
    partner_scale = dict(party_scale)
    partner_scale['Refused'] = 3

    df = df.replace({'Politics': party_scale})
    df = df.replace({'P_Politics': partner_scale})

    gap = (df['Politics'] - df['P_Politics']).abs()
    df.insert(loc=8, column='Pol_Diff', value=gap)
    return df
    
def fix_education(df):
    """Collapse 'P_Education' into four levels, encode both education columns, add 'Edu_Diff'.

    The raw 'P_Education' labels are grouped by their position in the order
    of first appearance in the column: position 0 becomes 'High school',
    positions 2 and 4 become 'Some college', positions 1, 3 and 5 become
    "Bachelor's degree or higher", and '12th grade no diploma' plus every
    label first seen after it become 'Less than high school'.
    'Education' and 'P_Education' are then coded 0 ("Bachelor's degree or
    higher") to 3 ('Less than high school'), and their absolute difference
    is inserted as 'Edu_Diff' at column position 8. Returns a new frame.
    """
    # NOTE(review): the grouping depends on the row order of the data, because
    # unique() lists labels in order of first appearance - verify against the
    # actual file before reusing this on re-sorted or different data
    raw_levels = list(df['P_Education'].unique())
    cutoff = raw_levels.index('12th grade no diploma')

    collapse = dict.fromkeys(raw_levels[cutoff:], 'Less than high school')
    collapse[raw_levels[0]] = 'High school'
    collapse.update({raw_levels[pos]: 'Some college' for pos in (2, 4)})
    collapse.update({raw_levels[pos]: "Bachelor's degree or higher" for pos in (1, 3, 5)})
    df = df.replace({'P_Education': collapse})

    #convert educational categories to ordinal form, create a new feature called 'Edu_Diff'
    ordinal = {
        "Bachelor's degree or higher": 0,
        'Some college': 1,
        'High school': 2,
        'Less than high school': 3,
    }
    for column in ('Education', 'P_Education'):
        df = df.replace({column: ordinal})

    gap = (df['Education'] - df['P_Education']).abs()
    df.insert(loc=8, column='Edu_Diff', value=gap)
    return df

def fix_age(df):
    """Return only the rows of df whose 'Age_Diff' value is not missing.

    The original index labels of the surviving rows are kept.
    """
    #rows without an 'Age_Diff' (NaN) are dropped
    has_age_diff = df['Age_Diff'].notna()
    return df.loc[has_age_diff]

In [58]:
#master function: chains the auxiliary cleaning functions into one pipeline
def data_preprocess(df):
    """Apply every auxiliary cleaning function, in order, to a copy of df.

    Each step receives the frame returned by the previous one; the result
    of the last step is returned. The pipeline starts from df.copy(), so
    the steps never operate on the frame that was passed in.
    """
    result = df.copy()
    for step in (create_met_places, fix_household_inc, fix_rel_attendance,
                 fix_politics, fix_education, fix_age):
        result = step(result)
    return result

In [59]:
#run the full preprocessing pipeline and keep the result as a new dataframe;
#data_preprocess works on a copy, so data_set still holds the data as loaded
data_set2 = data_preprocess(data_set)

In [60]:
#show every column when a frame is displayed (the option stays set for the rest of the session)
pd.set_option('display.max_columns', None)
#preview the processed data
data_set2

Unnamed: 0,Married,Years_Together,First_Together,End_Year,Religious_Attendance,Interracial_Couple,Household_Income2,Age_Diff,Edu_Diff,Pol_Diff,Age,P_Age,Education,P_Education,Gender,Household_Income,Ethnicity,P_Ethnicity,Politics,P_Politics,White,Black or African American,American Indian or Alaska Native,Asian Indian,Chinese,Filipino,Japanese,Korean,Vietnamese,Other Asian,Hawaiian,Guamanian,Samoan,Other Pacific Islander,Some other race,R_cowork,R_friend,R_family,R_sig_other,R_neighbor,P_cowork,P_friend,P_family,P_sig_other,P_neighbor,btwn_I_cowork,btwn_I_friend,btwn_I_family,btwn_I_sig_other,btwn_I_neighbor,school,college,mil,church,vol_org,customer,bar_restaurant,party,internet_other,internet_dating,internet_soc_network,internet_game,internet_chat,internet_org,public,blind_date,vacation,single_serve_nonint,business_trip,work_neighbors,met_online,places_met
0,"Yes, I am Married",34.0,1983.0,,5,no,289,3.0,2,1,55,52.0,0,2,Female,17,"White, Non-Hispanic",White,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,yes,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,R_soc_circle
1,"Yes, I am Married",11.0,2006.0,,2,no,361,2.0,0,0,47,45.0,0,0,Male,19,"White, Non-Hispanic",White,5,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,yes,no,no,no,no,no,no,no,no,no,no,no,yes,bar_restaurant
2,"Yes, I am Married",34.0,1983.0,,4,no,324,0.0,1,0,59,59.0,0,1,Female,18,"White, Non-Hispanic",White,6,6,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,yes,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,R_soc_circle
3,"Yes, I am Married",36.0,1981.0,,1,no,169,1.0,1,3,59,60.0,2,1,Male,13,"White, Non-Hispanic",White,6,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,bar_restaurant
4,"Yes, I am Married",51.0,1966.0,,1,no,121,1.0,2,3,66,67.0,2,0,Female,11,"White, Non-Hispanic",White,4,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,yes,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,R_soc_circle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,"Yes, I am Married",2.0,2015.0,,5,no,144,2.0,1,1,27,29.0,0,1,Female,12,"White, Non-Hispanic",White,6,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,yes,no,no,no,no,no,no,no,no,no,no,yes,bar_restaurant
2920,"No, I am not Married",1.0,2016.0,,3,no,400,3.0,1,1,24,21.0,0,1,Male,20,"White, Non-Hispanic",White,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,yes,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,yes,R_soc_circle
2921,"Yes, I am Married",5.0,2012.0,,3,yes,256,2.0,1,1,41,39.0,0,1,Male,16,"White, Non-Hispanic",Black or African American,4,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,no,no,no,no,no,no,no,no,yes,internet_dating
2922,"No, I am not Married",3.0,2014.0,,4,yes,144,4.0,2,0,32,28.0,0,2,Male,12,"White, Non-Hispanic",Asian or Pacific Islander,5,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,no,yes,no,no,yes,no,no,no,no,no,no,no,no,no,no,yes,bar_restaurant


In [61]:
#hold out the last 10% of the rows as out-of-sample (OOS) data and keep the rest as in-sample data,
#so that a machine learning model can be built on the in-sample part and then used to classify the OOS part
oos_size = int(0.10 * len(data_set2))
ins_size = len(data_set2) - oos_size
#positional split: the first ins_size rows are in-sample, the remaining rows are OOS
data = data_set2.iloc[:ins_size]
oos_data = data_set2.iloc[ins_size:]

In [62]:
#isolate the numeric features and prediction target we will be using for the machine learning model
num_features = ['Household_Income', 'Religious_Attendance', 'Household_Income2', 'Pol_Diff', 'Edu_Diff', 'Age_Diff']
X = data[num_features].copy()
y = data.Years_Together.copy()

#create train/test splits; random_state=0 fixes the shuffle seed so the split is reproducible
#(it does not disable shuffling), and 25% of the in-sample rows are held out as the test set
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.25, random_state=0)

In [63]:
#let's try things out with a few regression models, and record model scores
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

#one entry per model, in the order: linear regression, decision tree, random forest
scores = []  #R^2 on the TRAINING split, i.e. model.score(X_train, y_train)
rmses = []   #root-mean-squared error on the held-out TEST split

#random_state pins the randomised tree / forest fits so that reruns give the same numbers
lin_model = LinearRegression()
dt_model = DecisionTreeRegressor(random_state=0)
rf_model = RandomForestRegressor(random_state=0)

for model in (lin_model, dt_model, rf_model):
    model.fit(X_train, y_train)
    scores.append(model.score(X_train, y_train))
    y_hat = model.predict(X_test)
    #mse() returns the mean squared error by default; the `squared` keyword is
    #deprecated in newer scikit-learn releases, so it is omitted and the root is taken here
    rmses.append(sqrt(mse(y_test, y_hat)))

print(f'Scores: {scores}')
print(f'RMSE: {rmses}')

Scores: [0.07730163473950524, 0.8770115181501104, 0.7612180973640528]
RMSE: [15.71185353000026, 23.00023769020978, 17.25169106593022]
