In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing

In [2]:
# Load the dataset
data = pd.read_csv('shared/complaints_25Nov21.csv')
data

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2016-10-26,Money transfers,International money transfer,Other transaction issues,,"To whom it concerns, I would like to file a fo...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,,,Consent provided,Web,2016-10-29,Closed with explanation,Yes,No,2180490
1,2015-03-27,Bank account or service,Other bank product/service,"Account opening, closing, or management",,My name is XXXX XXXX XXXX and huband name is X...,Company chooses not to provide a public response,"CITIBANK, N.A.",PA,151XX,Older American,Consent provided,Web,2015-03-27,Closed with explanation,Yes,No,1305453
2,2015-04-20,Bank account or service,Other bank product/service,"Making/receiving payments, sending money",,XXXX 2015 : I called to make a payment on XXXX...,Company chooses not to provide a public response,U.S. BANCORP,PA,152XX,,Consent provided,Web,2015-04-22,Closed with monetary relief,Yes,No,1337613
3,2013-04-29,Mortgage,Conventional fixed mortgage,"Application, originator, mortgage broker",,,,JPMORGAN CHASE & CO.,VA,22406,Servicemember,,Phone,2013-04-30,Closed with explanation,Yes,Yes,393900
4,2013-05-29,Mortgage,Other mortgage,"Loan modification,collection,foreclosure",,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",GA,30044,,,Referral,2013-05-31,Closed with explanation,Yes,No,418647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207255,2015-05-24,Debt collection,Credit card,Taking/threatening an illegal action,Sued w/o proper notification of suit,,,JPMORGAN CHASE & CO.,FL,33133,,Consent not provided,Web,2015-05-24,Closed with explanation,Yes,No,1390395
207256,2012-01-10,Mortgage,Conventional fixed mortgage,"Loan modification,collection,foreclosure",,,,JPMORGAN CHASE & CO.,NY,10312,,,Referral,2012-01-11,Closed without relief,Yes,Yes,12192
207257,2012-07-17,Student loan,Non-federal student loan,Repaying your loan,,,,"BANK OF AMERICA, NATIONAL ASSOCIATION",NH,032XX,,,Web,2012-07-18,Closed with explanation,Yes,No,118351
207258,2016-09-29,Bank account or service,Checking account,"Account opening, closing, or management",,Near the end of XXXX 2016 I opened a Citigold ...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",CA,900XX,,Consent provided,Web,2016-09-29,Closed with non-monetary relief,Yes,No,2138969


In [3]:
# Handle missing values
data.fillna('missing', inplace=True)

In [4]:
# Set X (predictor) and y (predicted) variables
X = data[['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?']]
y = data['Consumer disputed?']

In [5]:
# Convert 'Consumer disputed?' to binary values
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

In [6]:
# Convert categorical features to numerical using get_dummies
X = pd.get_dummies(X, columns=['Product', 'Sub-product', 'Issue', 'State', 'Tags', 'Submitted via', 'Company response to consumer', 'Timely response?'], drop_first=True)

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [8]:
# Check the proportion of disputed complaints in the training dataset
proportion_disputed = sum(y_train) / len(y_train)

In [9]:
# Balance the dataset using random undersampling if needed
if proportion_disputed < 0.3:
    undersampler = RandomUnderSampler(random_state=123)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

In [10]:
# Train a predictive model (XGBoost Classifier)
model_xgb = XGBClassifier(random_state=123)
model_xgb.fit(X_train, y_train)

In [11]:
# Evaluate the model
y_pred = model_xgb.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.71      0.55      0.59     41452

[[17244 15260]
 [ 3346  5602]]


In [12]:
# Define cost structure
cost_per_non_disputed = 100  # Cost for non-disputed complaints
cost_per_disputed = 600      # Cost for disputed complaints
cost_per_extra_diligence = 90  # Cost for extra diligence to avoid disputes

In [13]:
# Calculate the proportion of consumers who raised a dispute in the test set
proportion_disputed_in_test = sum(y_test == 1) / len(y_test)

print(proportion_disputed_in_test)

0.21586413200810575


In [14]:
# Calculate the proportion of consumers who raised a dispute in the training dataset after random undersampling
proportion_disputed_in_training = sum(y_train == 1) / len(y_train)

print(proportion_disputed_in_training)

0.5


In [15]:
# Calculate the recall for 'Consumer disputed?' = 'Yes' on the test set

from sklearn.metrics import recall_score
recall = recall_score(y_test, y_pred, pos_label=1)  # Assuming '1' represents 'Yes'

print(recall)

0.6260616897630755


In [16]:
# Calculate the total cost without using the model
total_cost_without_model = (sum(y_test == 0) * cost_per_non_disputed) + (sum(y_test == 1) * cost_per_disputed)

print(total_cost_without_model)

8619200


In [17]:
# Assuming that the model predicts a complaint will be disputed, the banks spend $90 for extra diligence
extra_diligence_cost = (y_pred == 1).sum() * 90

# Assuming that the model predicts a complaint will not be disputed, the cost depends on the actual dispute status
predicted_not_disputed = (y_pred == 0)
actual_disputed = (y_test == 1)
actual_not_disputed = (y_test == 0)

# Calculate the cost for complaints predicted not to be disputed
# If the prediction is correct (True Negative), the cost is $100 per complaint
# If the prediction is incorrect (False Negative), the cost is $600 per complaint
not_disputed_cost = (predicted_not_disputed & actual_not_disputed).sum() * 100 + (predicted_not_disputed & actual_disputed).sum() * 600

# Calculate the total cost
total_cost = extra_diligence_cost + not_disputed_cost

print(total_cost)

5609580


In [18]:
# Create an array of threshold values to test
thresholds = np.arange(0.1, 1.0, 0.01)

# Initialize variables to keep track of the lowest cost and the corresponding threshold
lowest_cost = float('inf')
optimal_threshold = 0

# Iterate over the threshold values and calculate costs
for threshold in thresholds:
    y_pred_adjusted = (model_xgb.predict_proba(X_test)[:, 1] >= threshold).astype(int)
    
    # Calculate the cost for the current threshold
    extra_diligence_cost = (y_pred_adjusted == 1).sum() * 90
    not_disputed_cost = (y_pred_adjusted == 0).sum() * 100 + (y_pred_adjusted == 0).sum() * 600
    total_cost = extra_diligence_cost + not_disputed_cost

    # Update the lowest cost and optimal threshold if a lower cost is found
    if total_cost < lowest_cost:
        lowest_cost = total_cost
        optimal_threshold = threshold

lowest_cost, optimal_threshold

(3766670, 0.1)