In [1]:
# Imports

import pandas as pd
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn import tree, metrics
from sklearn import preprocessing
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load data set, drop col with numbers, drop high collinearity columns
# Don't want to drop religion but unsure how to scale Nulls
data = pd.read_csv("marital_satisfaction_data_wrangled_final.csv")
data = data.drop(['Unnamed: 0','spouse_satisfaction','relationship_satisfaction'], axis=1)
data = pd.get_dummies(data, prefix=['rel','cnty'], columns=['religion','country'])

        
data = data.drop(['rel_Catholic','cnty_Croatia'], axis=1)
data.describe()
print(data.columns)

# Dummy vars for categorical, drop Croatia +

Index(['sex', 'age', 'marriage_duration_years', 'num_children_total',
       'num_children_inhome', 'edu_level', 'material_situation', 'religiosity',
       'pension', 'enjoy_spouse_company', 'happiness', 'spouse_attraction',
       'spouse_enjoy_doing_things_together', 'spouse_enjoy_cuddling',
       'spouse_respect', 'spouse_pride', 'spouse_romance', 'spouse_love',
       'marital_satisfaction', 'natl_pride_in_parents',
       'natl_pride_in_children', 'natl_aging_parents_live_with_children',
       'natl_children_live_at_home_marraige', 'indv_pride_in_parents',
       'indv_pride_in_children', 'indv_aging_parents_live_with_children',
       'indv_children_live_at_home_marraige', 'rel_999.0', 'rel_Buddhist',
       'rel_Evangelic', 'rel_Hindu', 'rel_Jehovah', 'rel_Jewish', 'rel_Muslim',
       'rel_None', 'rel_Orthodox', 'rel_Other', 'rel_Protestant',
       'rel_Spiritualism', 'cnty_Brazil ', 'cnty_Bulgaria', 'cnty_Canada',
       'cnty_China', 'cnty_Estonia', 'cnty_Germany', 'cnty_

In [3]:
X = data.drop(['marital_satisfaction'], axis=1)
y = data.marital_satisfaction

In [4]:
cols= list(X.columns)
scaler = preprocessing.RobustScaler()
Xsc = scaler.fit_transform(X)
Xsc = pd.DataFrame(Xsc, columns=cols)
Xsc

# Perform same thing w/o standardizing

Unnamed: 0,sex,age,marriage_duration_years,num_children_total,num_children_inhome,edu_level,material_situation,religiosity,pension,enjoy_spouse_company,...,cnty_Romania,cnty_Russia,cnty_Saudi Arabia,cnty_Slovakia,cnty_South Korea,cnty_Spain,cnty_Switzerland,cnty_Turkey,cnty_U.K.,cnty_Uganda
0,-1.0,-1.058824,-0.555556,-2.0,-0.5,0.0,0.0,0.0,-0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.0,-0.588235,-0.500000,-1.0,-0.5,0.0,0.0,1.0,-0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.0,-0.529412,-0.277778,-2.0,-0.5,0.0,0.0,0.0,-0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.0,-0.529412,-0.277778,-1.0,0.0,0.0,0.0,1.0,-1.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.0,-0.647059,-0.166667,-2.0,-0.5,-1.0,1.0,0.5,-1.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7173,-1.0,-0.411765,-0.277778,0.0,1.0,0.0,2.0,0.0,0.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7174,-1.0,-0.176471,-0.333333,2.0,1.5,0.0,0.0,0.0,0.666667,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7175,-1.0,-0.411765,-0.555556,-1.0,0.0,0.0,1.0,1.5,-0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7176,-1.0,-0.705882,-0.500000,0.0,0.5,0.0,1.0,1.5,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
vif = pd.DataFrame()
vif["feature"] = X.columns
vif["VIF"]=[variance_inflation_factor(X.values, i) for i in range(len(X.columns))]
vif.sort_values(by='VIF', ascending=False).head(n=25)

# Is this done correctly? If so, do I basically keep everything? (except possibly duration)

Unnamed: 0,feature,VIF
1,age,47.345479
5,edu_level,21.95383
14,spouse_respect,17.276199
15,spouse_pride,12.882883
2,marriage_duration_years,12.56975
17,spouse_love,12.305951
9,enjoy_spouse_company,10.493723
11,spouse_attraction,10.159857
0,sex,10.051635
23,indv_pride_in_children,9.39928


In [6]:
vifsc = pd.DataFrame()
vifsc["feature"] = Xsc.columns
vifsc["VIF"]=[variance_inflation_factor(Xsc.values, i) for i in range(len(Xsc.columns))]
vifsc.sort_values(by='VIF', ascending=False).head(n=10)

# Is this done correctly? If so, do I basically keep everything? (except possibly duration)

Unnamed: 0,feature,VIF
32,rel_Muslim,9.137914
48,cnty_India,6.354239
29,rel_Hindu,6.078286
2,marriage_duration_years,5.720534
1,age,5.092878
50,cnty_Iran,4.949703
34,rel_Orthodox,4.612692
23,indv_pride_in_children,4.218453
19,natl_pride_in_children,3.595851
67,cnty_Turkey,3.363938


In [7]:
y.replace(to_replace=[-3,-2,-1,0,1,2], value=0, inplace=True)
y.replace(to_replace=[3], value=1, inplace=True)
y.value_counts()

0    4442
1    2736
Name: marital_satisfaction, dtype: int64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.25)

In [9]:
log_reg = sm.Logit(y_train, X_train).fit()
print(log_reg.params)

         Current function value: 0.505528
         Iterations: 35
sex                       -0.564316
age                       -0.047431
marriage_duration_years    0.035044
num_children_total         0.072426
num_children_inhome       -0.024770
                             ...   
cnty_Spain                -0.641313
cnty_Switzerland          -0.763027
cnty_Turkey                0.137168
cnty_U.K.                 -0.134327
cnty_Uganda               -1.801779
Length: 70, dtype: float64




In [10]:
log_reg.summary()

0,1,2,3
Dep. Variable:,marital_satisfaction,No. Observations:,5383.0
Model:,Logit,Df Residuals:,5313.0
Method:,MLE,Df Model:,69.0
Date:,"Tue, 23 Mar 2021",Pseudo R-squ.:,0.2391
Time:,20:38:27,Log-Likelihood:,-2721.3
converged:,False,LL-Null:,-3576.3
Covariance Type:,nonrobust,LLR p-value:,1.505e-311

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sex,-0.5643,0.066,-8.559,0.000,-0.694,-0.435
age,-0.0474,0.006,-8.202,0.000,-0.059,-0.036
marriage_duration_years,0.0350,0.007,5.323,0.000,0.022,0.048
num_children_total,0.0724,0.042,1.745,0.081,-0.009,0.154
num_children_inhome,-0.0248,0.034,-0.728,0.467,-0.091,0.042
edu_level,-0.3161,0.037,-8.449,0.000,-0.389,-0.243
material_situation,0.0840,0.045,1.848,0.065,-0.005,0.173
religiosity,-0.0691,0.023,-3.040,0.002,-0.114,-0.025
pension,0.0152,0.019,0.809,0.419,-0.022,0.052


In [11]:
y_pred = log_reg.predict(X_test)

In [12]:
rf=ensemble.RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(rf.score(X_test, y_test))
#roc + auc for each classifier

0.7598885793871867


In [13]:
def calc_TP_FP_rate(y_test, y_pred_f):
    
    y_pred_f = pd.Series(y_pred_f, index=y_test.index)
    
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in y_test.index: 
        if y_test[i]==y_pred_f[i]==1:
           TP += 1
        if y_pred_f[i]==1 and y_test[i]!=y_pred_f[i]:
           FP += 1
        if y_test[i]==y_pred_f[i]==0:
           TN += 1
        if y_pred_f[i]==0 and y_test[i]!=y_pred_f[i]:
           FN += 1
    
    tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)

    return tpr, fpr

# Test function

calc_TP_FP_rate(y_test, y_pred)

(0.6550218340611353, 0.17509025270758122)

In [14]:
_ _ _
feature_importances = pd.DataFrame(rf.feature_importances_, index=X_train.columns, columns=['importance'])
feature_importances.sort_values(by=['importance'], ascending=False)

SyntaxError: invalid syntax (<ipython-input-14-9516138f5c8f>, line 1)

In [None]:
X_unsc = data.drop(['marital_satisfaction'], axis=1)
y_unsc = data.marital_satisfaction
y_unsc.value_counts()

In [None]:
X_u_train, X_u_test, y_u_train, y_u_test = train_test_split(X_unsc,y_unsc,test_size=.25)

In [None]:
log_reg = sm.Logit(y_u_train, X_u_train).fit()
print(log_reg.params)

#caclc % change given 1 unit increase in each feature


In [None]:
rf_u=ensemble.RandomForestClassifier()
rf_u.fit(X_u_train, y_u_train)
print(rf_u.score(X_u_test, y_u_test))