In [None]:
# Install packages not preinstalled in jupyter notebook (RUN ONCE)

!pip install xgboost
!pip install seaborn
!pip install yellowbrick
!pip install category_encoders

In [None]:
#import various libraries and toolkits

import pandas as pd
import numpy as np
import seaborn as sns
from numpy import loadtxt
import xgboost as xgb
import matplotlib.pyplot as plt
import category_encoders as ce
import warnings
%matplotlib inline

from sklearn.utils.multiclass import unique_labels
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ClassBalance, ROCAUC, ClassificationReport, ClassPredictionError
warnings.simplefilter('ignore')

In [None]:
# function to plot confusion matrix (almost all of this was prebuilt. I made edits to fit my use-case)

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : array-like
        Display names for the classes; it is indexed with the label values
        returned by ``unique_labels(y_true, y_pred)``, so those labels must be
        valid integer positions into ``classes``.
    normalize : bool, default False
        If True, each row is divided by its row total so every cell shows the
        fraction of that true class.
    title : str, optional
        Plot title; a default is chosen from ``normalize`` when omitted.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the matrix image.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Compute confusion matrix (rows = true labels, columns = predicted labels)
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data; asarray lets callers pass a list
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]
    if normalize:
        # A class that never occurs in y_true has a row total of 0; leave that
        # row at 0.0 instead of producing NaN from a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Size the figure that is actually drawn on (a separate plt.figure() call
    # would only leave an empty extra figure behind).
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           # imshow draws matrix rows along the y axis, and rows are true labels
           ylabel='True label',
           xlabel='Predicted label')

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            # White text on dark cells, black text on light cells
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")

    return ax

In [None]:
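# Dataset file names (presumably the values entered at the prompts in the next cell);
# kept as comments so that this cell runs without errors:
#   PASS_NOpass_10k_0923.csv
#   2000 claims balanced.csv
#   BAC_118659_1108.csv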

In [None]:
# Prompt for the three dataset file names, in order: training, validation, prediction.
trainset, validationset, predictionset = [
    input(prompt)
    for prompt in (
        "What is the name of your training dataset?: ",
        'What is the name of your validation dataset?: ',
        'What is the name of the prediction dataset?: ',
    )
]

In [None]:
# Create function to read in testing dataset and predicting dataset

def readinformattarget(trainset, predictionset):
    """
    Read the training and prediction CSV files and format the target column.

    Parameters
    ----------
    trainset : str or path-like
        CSV file containing the training data, including 'Deviation Code F'.
    predictionset : str or path-like
        CSV file containing the records to predict.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training frame with 'Deviation Code F' as its first column,
        relabelled to 'Compliant' / 'Non-Compliant' and stored as a category,
        and the prediction frame exactly as read from disk.

    Raises
    ------
    ValueError
        If the training file has no 'Deviation Code F' column.
    """
    target = 'Deviation Code F'

    df = pd.read_csv(trainset, encoding="ISO-8859-1")
    predictionset = pd.read_csv(predictionset, encoding='ISO-8859-1')

    # Move the target to the front of the column list and actually apply the
    # new order (the reordered list was previously built but never used).
    cols = list(df)
    cols.insert(0, cols.pop(cols.index(target)))
    df = df[cols].copy()

    # 'P' means compliant; every other value (including missing) is non-compliant.
    labels = df[target].eq('P').map({True: 'Compliant', False: 'Non-Compliant'})

    # astype returns a new Series, so the result has to be assigned back.
    df[target] = labels.astype('category')

    return df, predictionset

# Assign training set and 'BAC to predict' to df and predictionset with initial formatting.
# Note: this rebinds `predictionset` from the file name entered at the prompt to the loaded
# DataFrame, so re-running this cell on its own passes a DataFrame to pd.read_csv and fails;
# re-run the input cell first.
df, predictionset = readinformattarget(trainset, predictionset)

In [None]:
# Create function to make a bar chart to examine distribution of deviation codes within the training set (WILL NOT BE IN PRODUCTION SCRIPT)
def targetdistributionviz(df):
    """
    Draw a bar chart of the 'Deviation Code F' value counts and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Deviation Code F' column.

    Side effects
    ------------
    Writes the figure to 'Pass_fail_count.png' in the working directory.
    """
    # Pass the series by keyword: newer seaborn releases take `data` as the
    # first positional argument, so a positional series is not treated as `x`.
    ax = sns.countplot(x=df['Deviation Code F'], label='Count')

    # formatting (set on the Axes itself rather than through pyplot global state)
    ax.set_title('distribution of compliant and non-compliant claims')
    ax.set_xlabel('Target Column: Deviation Code F')
    ax.figure.savefig('Pass_fail_count.png')

# Execute
targetdistributionviz(df)

In [None]:
# Create function to split the training set into X and y variables.

def trainingsetsplit(trainfile):
    """
    Read the training CSV, clean its columns and split it into features and target.

    Cleaning steps, in order: recode the target, drop columns with duplicated
    names, drop columns whose values duplicate an earlier column, and drop
    feature columns that hold a single distinct value.

    Parameters
    ----------
    trainfile : str or path-like
        CSV file containing the training data, including 'Deviation Code F'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``X`` with the remaining feature columns, and ``y`` as a one-column
        integer frame where 1 = compliant ('P') and 0 = non-compliant.
    """
    target = 'Deviation Code F'

    # Re-initialize the training set as df for formatting purposes
    df = pd.read_csv(trainfile, encoding="ISO-8859-1")

    # 'P' -> '1' (compliant), anything else (including missing) -> '0'.
    # The codes stay strings here, as before, and are cast to int at the end.
    df[target] = df[target].eq('P').map({True: '1', False: '0'})

    # Drop columns with duplicate names from our training dataset
    df = df.loc[:, ~df.columns.duplicated()]

    # Drop columns where all values are duplicates of another column.
    # Transposing a mixed-type frame generally leaves the columns as object dtype.
    df = df.T.drop_duplicates().T

    # Drop columns where all values are the same. The result of drop() must be
    # assigned back (it was previously discarded, so nothing was removed).
    # The target is excluded so a single-class file cannot remove it here.
    nunique = df.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index.difference([target])
    df = df.drop(cols_to_drop, axis=1)

    # Split training set into feature set and response set
    X = df.drop(target, axis=1)
    y = df[target].to_frame().astype('int')
    return X, y

# Build the feature frame X and the integer target frame y from the training file
X, y = trainingsetsplit(trainset)

In [None]:
# Running list of columns that offer no statistical significance (will be updated as needed).
# The drops are applied to predictionset because the final list of usable features must come
# from predictionset, as these are provided by our Audit Team partner, Cathy Snyder.
#
# errors='ignore' keeps this cell re-runnable: it rebinds predictionset, so on a second run
# the columns are already gone and a plain drop would raise KeyError.
predictionset = predictionset.drop(
    columns=['CLAIM_NO', 'CLAIM_VERSION', 'Submitting_BAC'],
    errors='ignore',
)

# Number of columns left in the prediction set
len(predictionset.dtypes)

In [None]:
# Create a function to ensure that we are only using columns in our prediction set to train with

def colmatching(original, predictionset, X):
    """
    Restrict X and predictionset to the columns they can both be trained/scored on.

    Parameters
    ----------
    original : pandas.DataFrame
        Training frame whose columns define the candidate feature names.
    predictionset : pandas.DataFrame
        Frame to predict on; only columns it contains may be used as features.
    X : pandas.DataFrame
        Cleaned training features; may hold fewer columns than ``original``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``X`` and ``predictionset`` holding the same columns in the
        same order.
    """
    # Columns present in the training data and in the prediction set ...
    col_list = original.columns.intersection(predictionset.columns)
    # ... and still present in X, which may have lost columns during cleaning;
    # selecting a missing column from X would raise KeyError.
    col_list = col_list.intersection(X.columns)

    # Copies, so later column assignments do not act on slices of the inputs
    X = X[col_list].copy()
    predictionset = predictionset[col_list].copy()
    return X, predictionset

# Execute function: `df` supplies the candidate column names; X and predictionset
# are rebound to the column-matched frames it returns.
X, predictionset = colmatching(df, predictionset, X)

In [None]:
# Recheck that both datasets have identical columns: fail loudly on a mismatch
# instead of relying on someone reading the printed shapes.
assert list(X.columns) == list(predictionset.columns), "X and predictionset columns differ"
print(X.shape, predictionset.shape)

In [None]:
# Examine the column types of X; this is a reference for the manual dtype
# conversion of columns that is done in the next cell.

X.dtypes

In [None]:
# Manual feature engineering: cast each group of columns in X to its proper
# datatype for further processing.

cat_columns = [
    'Trouble_Code', 'Labor_Operation_cd', 'Model_Year', 'LABOR_OP_DEP_TYPE_KEY',
    'REPAIR_GROUP_KEY', 'CUSTOMER_COMPLAINT_CD_KEY', 'FI_WARRANTY_TYPE_KEY',
]
int_columns = [
    'Odometer_Reading', 'TOTAL_LABOR_ITEM_COUNT', 'TOTAL_MATERIAL_QUANTITY',
    'TOTAL_MESSAGE_COUNT_E',
]
float_columns = [
    'Net_Item_Amt_Exc_Tax', 'Total_Tax', 'Other_Hours', 'Total_Hours',
    'TOTAL_DEDUCTIBLE_AMT', 'TOTAL_NET_ITEM_TAX_AMT', 'TOTAL_TAX_AMOUNT',
    'TOTAL_LABOR_HOURS', 'OTHER_LABOR_HOURS', 'TOTAL_SUPP_LABOR_HOURS',
]

# Same order as before: categories first, then integers, then floats
for group, target_dtype in (
    (cat_columns, 'category'),
    (int_columns, 'int'),
    (float_columns, 'float'),
):
    X[group] = X[group].astype(target_dtype)



In [None]:
# Create function to perform hashing trick

def hashingtrick(X):
    """Fit a default ``ce.HashingEncoder`` on X and return the encoded frame."""
    encoder = ce.HashingEncoder()
    return encoder.fit_transform(X)

# Apply the hashing trick to X, then list the resulting column dtypes for confirmation
hashed_X = hashingtrick(X)
hashed_X.dtypes

In [None]:
# Process hashed_X and y through xgb.DMatrix so xgboost can ingest the data
data_dmatrix = xgb.DMatrix(data=hashed_X, label=y)

# Conduct another split (train/test split) in order to perform cross-validation techniques.
# 25% of the rows are held out; the fixed random_state makes the split (and every
# metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    hashed_X, y, test_size=0.25, random_state=42
)

In [None]:
# Display the column dtypes of the training split produced by train_test_split
X_train.dtypes

In [None]:
# Instantiate the base classifier used for parameter tuning with cross validation.
# Fixed settings: exact tree method, CPU predictor, verbosity 1, binary logistic objective.
xgb_clf = xgb.XGBClassifier(tree_method = "exact", predictor = "cpu_predictor", verbosity = 1,
                            objective = "binary:logistic")

# Create parameter grid
# NOTE(review): the lists below multiply out to 403,200 combinations
# (3 * 7 * 4 * 4 * 5 * 3 * 5 * 4 * 4); confirm the search that consumes this grid
# samples from it rather than evaluating every combination.
parameters = {"learning_rate": [0.1, 0.01, 0.001],
               "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
               "max_depth": [2, 4, 7, 10],
               "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
               "subsample": [0.2, 0.4, 0.5, 0.6, 0.7], # NOTE(review): open question from the author - is a minimum of 0.8 a best practice here?
                                                       # The XGBoost parameter docs suggest subsample >= 0.5 for uniform sampling; confirm before changing these values.
               "reg_alpha": [0, 0.5, 1],
               "reg_lambda": [1, 1.5, 2, 3, 4.5],
               "min_child_weight": [1, 3, 5, 7],
               "n_estimators": [100, 250, 500, 1000]}


