# Input Tests

In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve

In [38]:
# Load the dataset from the local CSV file; the file's first column is used as the row index.
df = pd.read_csv('marriage_eng.csv', index_col=0)

### Feature/Class data split

In [41]:
# Show the column labels of the loaded frame before choosing features and target.
df.columns

Index(['age', 'education', 'education_num', 'marital_status', 'occupation',
       'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week',
       'native_country', 'income'],
      dtype='object')

In [43]:
# Columns used as model inputs. 'education_num' is left out, and
# 'marital_status' is held back as the prediction target.
feature_columns = [
    'age', 'education', 'occupation', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week',
    'native_country', 'income',
]
X = df[feature_columns]
y = df['marital_status']

### Dummy variables for categorical features

In [67]:
# One-hot encode the feature frame. drop_first=False keeps an indicator column
# for every category level instead of dropping the first level of each feature.
df_dummies = pd.get_dummies(X, drop_first=False)

In [68]:
# Copy each indicator column whose name contains '<' or '>' to a new column
# with a plain name. New columns are appended in the order listed here.
safe_names = {
    'education_<HS': 'education_inc_HS',
    'income_>85k': 'income_high',
    'income_<=85k': 'income_low',
}
for original_name, safe_name in safe_names.items():
    df_dummies[safe_name] = df_dummies[original_name]


In [69]:
# Feature names containing '<' or '>' seem to give XGBoost problems, so the
# columns carrying those names are discarded from the encoded frame.
unsafe_columns = ['education_<HS', 'income_>85k', 'income_<=85k']
df_dummies = df_dummies.drop(columns=unsafe_columns)

### Train/Test split

In [70]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(df_dummies, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Model fit and predictions

In [71]:
# Train a classifier with default hyperparameters (fit returns the estimator itself).
xgb = XGBClassifier().fit(X_train, y_train)

# Predict labels for both partitions so training and test scores can be compared.
train_preds, test_preds = (xgb.predict(features) for features in (X_train, X_test))

### Training Performance

In [72]:
# Score the predictions made on the training partition. Labels are left-padded
# so the colons line up in the printed report.
training_metrics = (
    ('       F1', f1_score),
    ('Precision', precision_score),
    ('   Recall', recall_score),
    (' Accuracy', accuracy_score),
)
for label, metric in training_metrics:
    print(f'{label}: {metric(y_train, train_preds):.2f}')


       F1: 0.90
Precision: 0.85
   Recall: 0.95
 Accuracy: 0.85


### Testing Performance

In [73]:
# Score the predictions made on the held-out test partition. Labels are
# left-padded so the colons line up in the printed report.
testing_metrics = (
    ('       F1', f1_score),
    ('Precision', precision_score),
    ('   Recall', recall_score),
    (' Accuracy', accuracy_score),
)
for label, metric in testing_metrics:
    print(f'{label}: {metric(y_test, test_preds):.2f}')


       F1: 0.89
Precision: 0.85
   Recall: 0.94
 Accuracy: 0.84


# Frontend User input

Create a single-row DataFrame of zeros with the same columns as our training data

In [74]:
# One row of zeros with exactly the training feature columns, to be filled
# in with a user's answers.
feature_names = X_train.columns
zero_data = np.zeros((1, feature_names.size))
X_test1 = pd.DataFrame(zero_data, columns=feature_names)

Create a dictionary for user data (to be input by anyone)

In [76]:
# User-supplied answers, keyed by model feature name. Numeric features take the
# raw value; each one-hot group (income, education, occupation, race, sex,
# native country) takes 0/1 flags. Every entry starts at 0 as a blank template.
# (Fix: the comma after 'income_low' was missing, making the cell a SyntaxError.)
response_dict = {
    # Numeric features
    'age': 0,

    # Income bracket indicators
    'income_high': 0,
    'income_low': 0,

    'capital_gain': 0,
    'capital_loss': 0,

    'hours_per_week': 0,

    # Education indicators
    'education_Assoc-acdm': 0,
    'education_Assoc-voc': 0,
    'education_Bachelors': 0,
    'education_Doctorate': 0,
    'education_HS-grad': 0,
    'education_Masters': 0,
    'education_Prof-school': 0,
    'education_Some-college': 0,
    'education_inc_HS': 0,

    # Occupation indicators
    'occupation_Blue': 0,
    'occupation_White': 0,

    # Race indicators
    'race_Amer-Indian-Eskimo': 0,
    'race_Asian-Pac-Islander': 0,
    'race_Black': 0,
    'race_Other': 0,
    'race_White': 0,

    # Sex indicators
    'sex_Female': 0,
    'sex_Male': 0,

    # Native-country indicators
    'native_country_Canada': 0,
    'native_country_China': 0,
    'native_country_England': 0,
    'native_country_France': 0,
    'native_country_Germany': 0,
    'native_country_Greece': 0,
    'native_country_Hungary': 0,
    'native_country_India': 0,
    'native_country_Iran': 0,
    'native_country_Ireland': 0,
    'native_country_Italy': 0,
    'native_country_Jamaica': 0,
    'native_country_Japan': 0,
    'native_country_Latin-America': 0,
    'native_country_Outlying-US(Guam-USVI-etc)': 0,
    'native_country_Poland': 0,
    'native_country_Portugal': 0,
    'native_country_SE-Asia': 0,
    'native_country_Scotland': 0,
    'native_country_Trinadad&Tobago': 0,
    'native_country_United-States': 0,
    'native_country_Yugoslavia': 0,
}

Fill each column from the response dictionary, defaulting to zero for any column (such as a dummy variable) that the dictionary does not supply

In [77]:
# Copy the user's answers into the prediction row one training column at a
# time; any column the dictionary does not mention is set to 0.
for feature in X_train.columns:
    answer = response_dict.get(feature, 0)
    X_test1[feature] = answer

Print out results

In [78]:
# Run the fitted classifier on the user's feature row.
user_prediction = xgb.predict(X_test1)

In [79]:
# Reduce the prediction to one plain scalar label before branching. Testing the
# truth value of the array itself raises for multi-row predictions and left the
# fallback branch unreachable for a single-row prediction.
predicted_labels = np.asarray(user_prediction).ravel()
predicted_label = predicted_labels[0] if predicted_labels.size else None

if predicted_label == 1:
    print('Based on these criteria, your marriage is likely to continue successfully for many years.')
elif predicted_label == 0:
    print('Based on these criteria, your marriage may experience extreme stress which could lead to divorce.')
else:
    # No prediction at all, or a label other than 0/1: no verdict can be given.
    print('More information is needed for this model work properly.')

# Repeat the held-out test scores next to the verdict.
print('\n')
print(f'       F1: {f1_score(y_test, test_preds):.2f}')
print(f'Precision: {precision_score(y_test, test_preds):.2f}')
print(f'   Recall: {recall_score(y_test, test_preds):.2f}')
print(f' Accuracy: {accuracy_score(y_test, test_preds):.2f}')


Based on these criteria, your marriage is likely to continue successfully for many years.


       F1: 0.89
Precision: 0.85
   Recall: 0.94
 Accuracy: 0.84
