In [2]:
import pandas as pd
import numpy as np
import math
import chart_studio.plotly as py
import plotly.tools as tls
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, mean_squared_error, f1_score



In [4]:
# Read the scored HMDA results CSV (not a model object) into a dataframe.
# Later cells use the 'high_priced' column as the actual label and the
# column named '0' as the predicted score.
xnn_dir = '/home/kimm/article-information-2019/data/xnn_output/hmda_results/'
filename = 'hmda_ann_results.csv'
TEST = pd.read_csv(xnn_dir + filename)

# Feature column names, one per line (presumably the model inputs -- TODO confirm;
# this list is not used in the cells visible here).
Feature_names = [
    'term_360',
    'conforming',
    'debt_to_income_ratio_missing',
    'loan_amount_std',
    'loan_to_value_ratio_std',
    'no_intro_rate_period_std',
    'intro_rate_period_std',
    'property_value_std',
    'income_std',
    'debt_to_income_ratio_std',
]

In [5]:
# Show the column labels of the loaded results frame.
TEST.columns

Index(['Unnamed: 0', 'high_priced', 'term_360', 'conforming', 'black', 'asian',
       'white', 'amind', 'hipac', 'hispanic', 'non_hispanic', 'male', 'female',
       'agegte62', 'agelt62', 'debt_to_income_ratio_missing',
       'loan_amount_std', 'loan_to_value_ratio_std',
       'no_intro_rate_period_std', 'intro_rate_period_std',
       'property_value_std', 'income_std', 'debt_to_income_ratio_std', '0'],
      dtype='object')

In [6]:
def get_prauc(frame, y, yhat, pos=1, neg=0, res=0.01):

    """ Calculates precision, recall, and f1 for a pandas dataframe of y
        and yhat values.

    For every cutoff in np.arange(0, 1 + res, res) a row is predicted as
    `pos` when its yhat value is strictly greater than the cutoff and as
    `neg` otherwise; precision, recall, and f1 are then computed from the
    resulting true-positive, false-positive, and false-negative counts.

    Args:
        frame: Pandas dataframe of actual (y) and predicted (yhat) values.
               The frame is only read, never modified.
        y: Name of actual value column.
        yhat: Name of predicted value column.
        pos: Primary target value, default 1.
        neg: Secondary target value, default 0.
        res: Resolution by which to loop through cutoffs, default 0.01.

    Returns:
        Pandas dataframe with one row per cutoff and the columns
        'cutoff', 'recall', 'precision', and 'f1'.

    """

    eps = 1e-20  # for safe numerical operations (avoids division by zero)

    # The label masks do not depend on the cutoff, so compute them once.
    # Rows whose label is neither pos nor neg are counted in neither mask.
    actual_pos = (frame[y] == pos).to_numpy()
    actual_neg = (frame[y] == neg).to_numpy()
    scores = frame[yhat]

    # Collect plain records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and row-wise growth is quadratic.
    records = []

    # loop through cutoffs to create p-r roc frame
    for cutoff in np.arange(0, 1 + res, res):

        # Binarize the decision. A score above the cutoff means "predicted pos",
        # whatever value pos/neg actually take (previously hard-coded to 1/0,
        # which produced zero tp/fp for any non-default label encoding).
        predicted_pos = (scores > cutoff).to_numpy()
        predicted_neg = ~predicted_pos

        # confusion matrix values needed for precision and recall
        tp = int(np.count_nonzero(predicted_pos & actual_pos))
        fp = int(np.count_nonzero(predicted_pos & actual_neg))
        fn = int(np.count_nonzero(predicted_neg & actual_pos))

        # calculate precision, recall, and f1
        recall = (tp + eps)/((tp + fn) + eps)
        precision = (tp + eps)/((tp + fp) + eps)
        f1 = 2/((1/(recall + eps)) + (1/(precision + eps)))

        records.append({'cutoff': cutoff,
                        'recall': recall,
                        'precision': precision,
                        'f1': f1})

    return pd.DataFrame(records, columns=['cutoff', 'recall', 'precision', 'f1'])

# Build the per-cutoff recall/precision/F1 table from the results frame
# (actual labels in 'high_priced', predicted scores in '0') and render it
# with a caption.
prauc_frame = get_prauc(TEST, y='high_priced', yhat='0')
prauc_frame.style.set_caption('Recall and Precision')

Unnamed: 0,cutoff,recall,precision,f1
0,0.0,0.999741,0.0976121,0.177859
1,0.01,0.982161,0.143283,0.250082
2,0.02,0.964064,0.172415,0.292516
3,0.03,0.944674,0.192956,0.320456
4,0.04,0.922182,0.21104,0.343476
5,0.05,0.900724,0.226337,0.361767
6,0.06,0.881851,0.241812,0.379548
7,0.07,0.860651,0.255585,0.394128
8,0.08,0.838935,0.268204,0.406463
9,0.09,0.819286,0.280915,0.418377


In [7]:
# Locate the row label where F1 is largest, then read that row's cutoff.
best_f1_row = prauc_frame['f1'].idxmax()
xnn_cut = prauc_frame.loc[best_f1_row, 'cutoff']
print('Best F1 threshold: %.2f' % xnn_cut)

Best F1 threshold: 0.20


In [8]:
# Calculate test statistics
# Predicted scores come from column '0'; actual labels from 'high_priced'.
Prediction = list(TEST['0'])

# Classify with the F1-optimal cutoff computed in the previous cell (xnn_cut)
# instead of a hard-coded 0.20, so the threshold cannot go stale when the data
# changes. The strict '>' matches the decision rule get_prauc used when the
# cutoff was selected.
Classification = list((TEST['0'] > xnn_cut).astype(int))
Actual = list(TEST['high_priced'].apply(int))

# AUC, log loss, and RMSE are computed on the raw scores; accuracy is computed
# on the thresholded classifications.
test_statistics = {
    'AUC': roc_auc_score(Actual, Prediction),
    'accuracy_score': accuracy_score(Actual, Classification),
    'log_loss': log_loss(Actual, Prediction),
    'rmse': math.sqrt(mean_squared_error(Actual, Prediction)),
}

print(test_statistics)

{'AUC': 0.8717675106762534, 'accuracy_score': 0.8604709797791337, 'log_loss': 0.22976372677364865, 'rmse': 0.2619966665423728}
