# Input Tests

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve

import pickle

In [2]:
# Load the raw dataset from the CSV file sitting next to this notebook.
df = pd.read_csv(filepath_or_buffer='marriage.csv')

In [5]:
# Snapshot the loaded frame to disk as a binary pickle so it can be reloaded
# later without re-parsing the CSV.
with open('marriage.pickle', mode='wb') as pickle_file:
    pickle.dump(df, pickle_file)

### Feature/Class data split

In [6]:
# Feature matrix: the columns used as model inputs.
X = df[[
    'age',
    'workclass',
    'education_num',
    'occupation',
    'race',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
    'native_country',
    'income',
]]

# Target vector: the column the classifier learns to predict.
y = df['marital_status']

### Dummy Variables for discrete variable features

In [7]:
# One-hot encode the non-numeric feature columns. drop_first=True drops the
# first level of each encoded column, so that level is represented by all of
# the column's dummies being zero.
df_dummies = pd.get_dummies(data=X, drop_first=True)

### Train/Test split

In [8]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_dummies,
    y,
    test_size=0.3,
    random_state=42,
)

### Model fit and predictions

In [9]:
# Train a classifier with default hyper-parameters on the training split
# (fit returns the fitted estimator itself, so the calls can be chained).
xgb = XGBClassifier().fit(X_train, y_train)

# Predicted labels for both splits, used by the performance reports below.
train_preds, test_preds = xgb.predict(X_train), xgb.predict(X_test)

### Training Performance

In [10]:
# Report each metric on the training split, right-aligned and rounded to two
# decimals, one metric per line.
print('\n'.join(
    f'{label}: {score_fn(y_train, train_preds):.2f}'
    for label, score_fn in (
        ('       F1', f1_score),
        ('Precision', precision_score),
        ('   Recall', recall_score),
        (' Accuracy', accuracy_score),
    )
))


       F1: 0.86
Precision: 0.79
   Recall: 0.94
 Accuracy: 0.77


### Testing Performance

In [11]:
# Report each metric on the held-out test split, right-aligned and rounded to
# two decimals, one metric per line.
print('\n'.join(
    f'{label}: {score_fn(y_test, test_preds):.2f}'
    for label, score_fn in (
        ('       F1', f1_score),
        ('Precision', precision_score),
        ('   Recall', recall_score),
        (' Accuracy', accuracy_score),
    )
))


       F1: 0.85
Precision: 0.78
   Recall: 0.93
 Accuracy: 0.77


In [12]:
# Show the exact feature names the model was trained on; user input must use
# these names verbatim.
X_train.columns.tolist()

['age',
 'education_num',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'workclass_Local-gov',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Prof-specialty',
 'occupation_Protective-serv',
 'occupation_Sales',
 'occupation_Tech-support',
 'occupation_Transport-moving',
 'race_Asian-Pac-Islander',
 'race_Black',
 'race_Other',
 'race_White',
 'native_country_Canada',
 'native_country_China',
 'native_country_Columbia',
 'native_country_Cuba',
 'native_country_Dominican-Republic',
 'native_country_Ecuador',
 'native_country_El-Salvador',
 'native_country_England',
 'native_country_France',
 'native_country_Germany',
 'native_country_Greece',
 'n

# Frontend User input

Create a single-row, zero-filled DataFrame with the same columns as our training data

In [13]:
# A 1 x n_features block of zeros, where n_features is the number of training
# columns.
zero_data = np.zeros((1, X_train.shape[1]))

# Wrap it in a frame labelled with the training column names so it lines up
# with what the model was fitted on.
X_test1 = pd.DataFrame(zero_data, columns=X_train.columns)

Create a dictionary for user data (to be input by anyone)

In [24]:
# User-supplied answers, keyed by the model's feature names.
# Every key must match a training column name exactly (see the column listing
# printed earlier in this notebook). Keys that do not match are never copied
# into the model input, so the corresponding feature stays at zero.
response_dict = {
    # Numeric features
    'age': 37,
    'education_num': 13,
    'capital_gain': 0,
    'capital_loss': 0,
    'hours_per_week': 80,
    # One-hot (dummy) features: 1 means the category applies
    'workclass_Private': 1,
    # The training column is spelled with a hyphen ('Other-service'); the
    # previous underscore spelling matched no column and was silently ignored.
    'occupation_Other-service': 1,
    'native_country_United-States': 1,
    'income_ >50K': 0
}

Fill each column from the user dictionary; any column the user did not supply defaults to zero (i.e. that dummy variable does not apply)

In [25]:
# Surface keys that match no training column instead of dropping them
# silently: a misspelled key would otherwise leave its feature at zero and
# change the prediction without any visible sign.
unknown_keys = sorted(set(response_dict) - set(X_train.columns))
if unknown_keys:
    print(f'Warning: ignoring unrecognised input keys: {unknown_keys}')

# Copy each supplied value into the matching column; columns the user did not
# supply fall back to 0.
for col in X_train.columns:
    X_test1[col] = response_dict.get(col, 0)

In [26]:
# Display the assembled input row to confirm the values landed in the
# intended columns.
X_test1.head(n=5)

Unnamed: 0,age,education_num,capital_gain,capital_loss,hours_per_week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia,income_ >50K
0,37,13,0,0,80,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0


Print out results

In [27]:
# Run the fitted classifier on the user-supplied frame built above.
# NOTE(review): presumably this yields one predicted label per row of
# X_test1 (a single label here) — confirm against the xgboost version in use.
user_prediction = xgb.predict(X_test1)

In [28]:
# Pull the single predicted label out of the prediction array before
# branching. Testing the array itself for truthiness only works while it holds
# exactly one element (NumPy raises ValueError for longer arrays), and it made
# the final fallback branch unreachable.
predicted_label = user_prediction[0]

# NOTE(review): the messages assume label 1 means the marriage continues and
# label 0 means it does not — confirm against how marital_status is encoded.
if predicted_label == 1:
    print('Based on these criteria, your marriage is likely to continue successfully for many years.')
elif predicted_label == 0:
    print('Based on these criteria, your marriage may experience extreme stress which could lead to divorce.')
else:
    # Reached only if the model emits a label other than 0 or 1.
    print('More information is needed for this model work properly.')

# Follow the verdict with the model's scores on the held-out test split, so
# the reader can judge how much weight to give the prediction.
print('\n')
print(f'       F1: {f1_score(y_test, test_preds):.2f}')
print(f'Precision: {precision_score(y_test, test_preds):.2f}')
print(f'   Recall: {recall_score(y_test, test_preds):.2f}')
print(f' Accuracy: {accuracy_score(y_test, test_preds):.2f}')


Based on these criteria, your marriage is likely to continue successfully for many years.


       F1: 0.85
Precision: 0.78
   Recall: 0.93
 Accuracy: 0.77
