In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_score as precision
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
import holidays
import xgboost as xgb
from xgboost import plot_importance

## Data Preparation

Before training a predictive model, we go through several preparation steps. After checking for nulls and determining the types of variables present in the data, we find that many of the variables are binary. We therefore standardize the continuous features and one-hot encode the categorical features. Moreover, we extract features such as holiday and day of week to capture any signal the date may carry. We do not treat this as a time series task: we have plenty of data and see no reason to believe its chronological order is significant.

## Model & Feature Selection

After evaluating the models (Naive Bayes, Logistic Regression, and XGBoost) on several metrics and running 5-fold cross-validation, XGBoost performed best on this task. Once the model was chosen, we used an embedded method for feature selection to find the features that work best for the XGBoost model. We compute the XGBoost feature importances, use each distinct importance value as a threshold (each higher threshold keeps fewer features), and report the feature set that gives the best precision score. We choose precision because it seems the most relevant metric for optimizing the ratio of successes.

## Autopay

Using the precision-recall curve and the proposed cost function, in which a success is worth +5 and a failure is worth -1, we build a function that chooses the threshold from the precision-recall curve that optimizes the ratio of success. Specifically, we optimize the output of the cost function, trying to make it as high as possible.

## Variable X

Although I was not able to experiment with variable X much, I did notice that once it was added to the pipeline, the number of required features decreased dramatically. This suggests that it captures much of the information carried by the continuous variables and the unknown binary variables.

## Results

I built a run function that prints a report in the Jupyter notebook when it is called. The run function can optionally include variable X. Make sure the data file is in the same directory as the Jupyter notebook.

In [55]:
# US holiday calendar built for the year 2017; `holiday` below checks dates against it.
# NOTE(review): presumably every submission date falls in 2017 — confirm against the data.
us_holidays = holidays.UnitedStates(years = 2017)
def holiday(date):
    """
    Binary holiday flag for a single date: 1 when the date is found in `us_holidays`, 0 when it is not.
    Intended to be mapped over a date column to build a new binary feature.
    """
    return 1 if date in us_holidays else 0

In [56]:
def linear_interpolation(data, column):
    """
    Return a copy of `data` in which the null values of `column` have been filled in.

    Nulls are filled by linear interpolation between neighbouring values. Any null that interpolation leaves
    behind (for example a missing first value, which has no earlier neighbour) is replaced by the mean of the
    values originally present in the column.

    The input frame is left untouched: the filled column is written to a copy, so the caller's data is never
    modified as a side effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `column`.
    column : hashable
        Label of the column to fill.

    Returns
    -------
    pandas.DataFrame
        A new frame, identical to `data` except for the filled `column`.
    """
    original = data[column]
    # The fallback mean is taken from the observed (pre-interpolation) values.
    filled = original.interpolate(method = 'linear').fillna(original.mean())
    result = data.copy()
    result[column] = filled
    return result

In [57]:
def data_processing(data, without_x = True):
    """
    Turn the raw pandas dataframe into a model-ready feature frame. The caller's frame is not modified.

    Steps:
    1) Optionally drop `variable_x` (when `without_x` is True).
    2) One-hot encode `region_code`.
    3) Linearly interpolate null values in model_score, consumer_reporting_score, and weekly_income.
    4) Standardize those three non-binary columns.
    5) Parse `date_submitted` (format "%m/%d/%y") and derive one-hot day-of-week columns and a binary
       `holiday` column from it.
    6) Drop the columns that were only needed to derive features (`date_submitted`, `region_code`).

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data containing the columns named above.
    without_x : bool, default True
        When True, `variable_x` is removed before any other processing.

    Returns
    -------
    pandas.DataFrame
        The processed frame: remaining original columns, then day-of-week dummies, region dummies and `holiday`.
    """
    continuous_columns = ['model_score', 'consumer_reporting_score', 'weekly_income']

    # Both branches yield a new frame, so the column assignments below never touch the caller's data.
    data = data.drop(columns=['variable_x']) if without_x else data.copy()

    regions = pd.get_dummies(data.region_code)
    for column in continuous_columns:
        data = linear_interpolation(data, column)

    # NOTE(review): the scaler is fit on the whole frame; callers that split into train/test afterwards
    # (as `run` does) let test-set statistics influence the scaling.
    standardize = StandardScaler()
    data[continuous_columns] = standardize.fit_transform(data[continuous_columns])

    data['date_submitted'] = pd.to_datetime(data['date_submitted'], format="%m/%d/%y")
    day_of_week = pd.get_dummies(data['date_submitted'].dt.day_name())
    # `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in concat/drop.
    data = pd.concat([data, day_of_week, regions], axis=1)
    data['holiday'] = data['date_submitted'].map(holiday)
    data = data.drop(columns=['date_submitted', 'region_code'])

    return data
    

In [58]:
def predictive_model(train_data, train_labels, test_data, test_labels, model):
    """
    Fit `model` on the training split, print an evaluation report, and hand the fitted model back.

    The report lists the model's score on the test split and on the training split (labelled as accuracy),
    followed by the confusion matrix, precision, recall and F1-score of the test-split predictions.
    """
    model.fit(train_data, train_labels)
    test_predictions = model.predict(test_data)
    test_accuracy = model.score(test_data, test_labels)
    train_accuracy = model.score(train_data, train_labels)

    matrix = cm(test_labels, test_predictions)
    # Each entry pairs an output template with the value that fills it.
    prediction_metrics = [
        ('Precision: {}', precision(test_labels, test_predictions)),
        ('Recall: {}', recall(test_labels, test_predictions)),
        ('F1-Score: {}', f1(test_labels, test_predictions)),
    ]

    for template, value in [('Accuracy: {}', test_accuracy), ('Train Accuracy: {}', train_accuracy)]:
        print(template.format(value))
    print('Confusion Matrix')
    print(matrix)
    for template, value in prediction_metrics:
        print(template.format(value))

    return model

In [59]:
def _fit_on_selected_features(model, threshold, train_data, train_labels, test_data, test_labels):
    """
    Keep the features whose importance in the prefit `model` meets `threshold`, fit a fresh XGBoost classifier
    on them, print its test-split precision, and return (selector, fitted model, train array, test array, precision).
    """
    selection = SelectFromModel(model, threshold=threshold, prefit=True)
    select_train = selection.transform(train_data)
    selection_model = xgb.XGBClassifier()
    selection_model.fit(select_train, train_labels)
    select_test = selection.transform(test_data)
    test_precision = precision(test_labels, selection_model.predict(select_test))
    print("Thresh={}, n={}, Precision:{}".format(threshold, select_train.shape[1], test_precision*100.0))
    return selection, selection_model, select_train, select_test, test_precision


def feature_selection(model, train_data, train_labels, test_data, test_labels):
    """
    Choose the feature-importance threshold that gives the best test-split precision.

    Every distinct positive value in `model.feature_importances_` is tried as a threshold: the features at or
    above it are kept, a new XGBoost classifier is fit on them, and its precision on the test split is printed.
    The threshold with the highest precision wins (on ties, the highest threshold, i.e. the fewest features).
    A classifier is then refit with the winning threshold and reported.

    NOTE(review): the threshold is picked using the test split, so the reported precision is optimistic.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`
    train_data, test_data : pandas.DataFrame
    train_labels, test_labels : array-like

    Returns
    -------
    (selection_model, select_train, select_test)
        The classifier fit on the selected features, and the train/test frames reduced to those features.

    Raises
    ------
    ValueError
        If `model` has no feature with a positive importance, so no threshold can be tried.
    """
    thresholds = sorted(set(model.feature_importances_))
    scored_thresholds = []
    for thresh in thresholds:
        if thresh > 0:
            test_precision = _fit_on_selected_features(
                model, thresh, train_data, train_labels, test_data, test_labels)[-1]
            scored_thresholds.append((test_precision, thresh))

    if not scored_thresholds:
        raise ValueError('model has no feature with a positive importance to select from')

    print('\n Optimal Threshold on Feature Importance')
    # Stable sort over ascending thresholds: among equal precisions the last (highest) threshold wins.
    threshold = sorted(scored_thresholds, key = lambda tup: tup[0])[-1][1]
    # The helper prints the report line using the chosen threshold (not the last one tried in the loop).
    selection, selection_model, select_train, select_test, _ = _fit_on_selected_features(
        model, threshold, train_data, train_labels, test_data, test_labels)
    selection_support = selection.get_support()
    print(train_data.columns[selection_support])
    select_train = pd.DataFrame(select_train, columns = train_data.columns[selection_support])
    select_test = pd.DataFrame(select_test, columns = test_data.columns[selection_support])

    return selection_model, select_train, select_test

In [60]:
def defining_threshold(model, test_data, test_labels):
    """
    Score the test split with `model`, take the candidate thresholds from the precision recall curve of the
    positive-class probabilities, and return whatever `optimal_solution` reports for those candidates.
    """
    positive_probabilities = model.predict_proba(test_data)[:,1]
    # Only the thresholds of the curve are needed; its precision and recall arrays are discarded.
    _, _, candidate_thresholds = precision_recall_curve(list(test_labels), positive_probabilities)
    return optimal_solution(candidate_thresholds, positive_probabilities, test_labels)

In [61]:
def optimal_solution(thresholds, test_probabilties, test_labels, success_reward = 5, failure_penalty = 1):
    """
    Pick the probability threshold that maximizes the autopay payoff and describe it.

    For each candidate threshold, every row whose positive-class probability is at or above the threshold is
    flagged as a predicted success. Each flagged row that truly is a success adds `success_reward` to the cost
    and each flagged row that is not subtracts `failure_penalty`; rows that are not flagged contribute nothing.
    The threshold with the highest cost wins (on ties, the one appearing last in `thresholds`).

    Accuracy, precision, recall and F1 are computed only for the winning threshold, since only those are
    reported.

    Parameters
    ----------
    thresholds : iterable of float
        Candidate thresholds, e.g. those returned by the precision recall curve.
    test_probabilties : array-like of float
        Predicted probability of the positive class for each test row.
    test_labels : array-like
        True labels (1 marks a success); a pandas Series, numpy array or list all work.
    success_reward : number, default 5
        Payoff for a flagged row that is a success.
    failure_penalty : number, default 1
        Amount subtracted for a flagged row that is not a success.

    Returns
    -------
    str
        Report with the cost, ratio of success (successes / flagged rows, nan when nothing is flagged),
        threshold, accuracy, precision, recall and F1 of the winning threshold.

    Raises
    ------
    ValueError
        If `thresholds` is empty.
    """
    labels = np.asarray(test_labels)
    probabilities = np.asarray(test_probabilties)
    is_success = labels == 1

    best = None
    for threshold in thresholds:
        flagged = probabilities >= threshold
        num_flagged = int(flagged.sum())
        num_success = int((flagged & is_success).sum())
        cost = success_reward * num_success - failure_penalty * (num_flagged - num_success)
        # `>=` lets a later threshold replace an equally good earlier one.
        if best is None or cost >= best[0]:
            best = (cost, num_success, num_flagged, threshold, flagged)

    if best is None:
        raise ValueError('thresholds must contain at least one value')

    cost, num_success, num_flagged, threshold, flagged = best
    pred_labels = flagged.astype(int)
    # A threshold above every probability flags nothing; the ratio is then undefined rather than an error.
    success_ratio = num_success / num_flagged if num_flagged else float('nan')

    report = 'Cost: {} \n Ratio of Success: {} \n Threshold: {} \n Accuracy: {} \n Precision: {} \n Recall: {} \n F1: {}'
    return report.format(cost,
                         success_ratio,
                         threshold,
                         accuracy(labels, pred_labels),
                         precision(labels, pred_labels),
                         recall(labels, pred_labels),
                         f1(labels, pred_labels))

In [62]:
def run(without_x = True, data_path = '20200130_v_ 20200123_0318.csv'):
    """
    Run the whole analysis and print the report.

    Steps:
    1) Read the CSV at `data_path` and process it with `data_processing`.
    2) Make a stratified 75/25 train/test split on the `success` column (fixed random_state).
    3) Fit and report Logistic Regression, XGBoost and Naive Bayes on that split.
    4) Report the 5-fold cross-validated precision and F1 of the same three classifiers on the full data.
    5) Select features by XGBoost importance with `feature_selection`.
    6) Fit and report a final XGBoost model on the selected features, then print the autopay threshold report
       from `defining_threshold`.

    Parameters
    ----------
    without_x : bool, default True
        Passed to `data_processing`; when True, `variable_x` is excluded.
    data_path : str, default '20200130_v_ 20200123_0318.csv'
        Location of the CSV file to analyse.
    """
    data = pd.read_csv(data_path)
    processed_data = data_processing(data, without_x)
    train_data, test_data = train_test_split(processed_data,
                                             test_size = .25,
                                             random_state = 28,
                                             stratify = processed_data.success)
    # `columns=` is used instead of a positional axis, which pandas 2.0 no longer accepts.
    train_labels = train_data.success
    train_data = train_data.drop(columns=['success'])
    test_labels = test_data.success
    test_data = test_data.drop(columns=['success'])

    print('Checking Models \n')
    print('Logistic Regression')
    predictive_model(train_data, train_labels, test_data, test_labels, LogisticRegression(solver = 'lbfgs'))
    print('\n XGBoost')
    xgboost = predictive_model(train_data, train_labels, test_data, test_labels, xgb.XGBClassifier())
    print('\n Naive Bayes')
    predictive_model(train_data, train_labels, test_data, test_labels, GaussianNB())

    print('\n')
    print('Cross Validation \n')
    classifiers = [LogisticRegression(solver = 'lbfgs'), xgb.XGBClassifier(), GaussianNB()]
    scores = ['f1', 'precision']
    cv_data = processed_data.drop(columns=['success'])
    cv_labels = processed_data.success
    for classifier in classifiers:
        cv = cross_validate(classifier, cv_data, cv_labels, cv = 5, scoring = scores)
        print(classifier)
        # Precision was requested from cross_validate but never shown before; report it next to F1.
        print('Precision: {}'.format(cv['test_precision'].mean()))
        print('F1-Score: {} \n'.format(cv['test_f1'].mean()))

    print('\n')
    print('Finding Optimal Feature Importance Threshold Using XGBoost')
    # Only the reduced frames are used below; the final model is refit by predictive_model so it is reported.
    _, select_train, select_test = feature_selection(xgboost,
                                                     train_data,
                                                     train_labels,
                                                     test_data,
                                                     test_labels)
    print('\n')
    print('Final model: XGBoost')
    new_xgb = predictive_model(select_train, train_labels, select_test, test_labels, xgb.XGBClassifier())
    print('\n')
    print('Accuracy when processing autopay.')
    print('\n')
    print(defining_threshold(new_xgb, select_test, test_labels))

In [63]:
# Print the full report with variable X excluded (`without_x` defaults to True).
run()

Checking Models 

Logistic Regression
Accuracy: 0.7893602144272229
Train Accuracy: 0.7873708336570585
Confusion Matrix
[[3979 2635]
 [ 980 9568]]
Precision: 0.7840694911087438
Recall: 0.90709139173303
F1-Score: 0.8411058854555843

 XGBoost
Accuracy: 0.7936137979256497
Train Accuracy: 0.7918382410069148
Confusion Matrix
[[4058 2556]
 [ 986 9562]]
Precision: 0.789074104637729
Recall: 0.9065225635191505
F1-Score: 0.8437306979617047

 Naive Bayes
Accuracy: 0.7803286330264538
Train Accuracy: 0.777348302385207
Confusion Matrix
[[4147 2467]
 [1303 9245]]
Precision: 0.7893613387978142
Recall: 0.8764694728858551
F1-Score: 0.8306379155435759


Cross Validation 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
F1-Score: 0.8

In [64]:
# Print the same report with variable X kept in the data (`without_x=False`).
run(False)

Checking Models 

Logistic Regression
Accuracy: 0.7948374315347861
Train Accuracy: 0.7926151814155854
Confusion Matrix
[[4108 2506]
 [1015 9533]]
Precision: 0.7918431763435502
Recall: 0.9037732271520668
F1-Score: 0.8441138708106434

 XGBoost
Accuracy: 0.7989162102319077
Train Accuracy: 0.7974904824799938
Confusion Matrix
[[4214 2400]
 [1051 9497]]
Precision: 0.7982684710431202
Recall: 0.9003602578687903
F1-Score: 0.8462463800400981

 Naive Bayes
Accuracy: 0.7802703647593521
Train Accuracy: 0.7777173490793257
Confusion Matrix
[[4160 2454]
 [1317 9231]]
Precision: 0.789987163029525
Recall: 0.8751422070534699
F1-Score: 0.8303872621778438


Cross Validation 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
F1-Score: 