In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
# Load the training table (it includes the loan_paid label) from a CSV in the working directory.
darsh_data = pd.read_csv('lending_train.csv')

In [9]:
# Size of the raw training table as (rows, columns).
darsh_data.shape

(1000000, 25)

In [8]:
# Class balance of the target: number of rows per loan_paid value.
darsh_data['loan_paid'].value_counts()

1    799870
0    200130
Name: loan_paid, dtype: int64

In [2]:
def preprocessv2(data):
    """Turn a raw lending table into a feature matrix and a target column.

    Steps, in order:
      1. Convert ``employment_length`` labels (e.g. ``'10+ years'``) to numbers.
      2. Parse the leading integer out of ``loan_duration``.
      3. Drop ``employment``, ``extended_reason``, ``zipcode`` and ``ID``.
      4. Impute numeric feature columns with their mean and text columns
         with their mode. The target column is never imputed.
      5. Merge the ``'NONE'`` / ``'ANY'`` home-ownership levels into ``'OTHER'``.
      6. One-hot encode the categorical columns, dropping the first level.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing the columns named above. The caller's frame is
        not modified.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is the feature DataFrame; ``y`` is the ``loan_paid``
        column, or ``None`` when the input has no ``loan_paid`` column.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.

    Notes
    -----
    Imputation values and dummy columns are derived from ``data`` itself, so
    two different inputs can yield different column sets; callers that score a
    second file must make sure its columns match the training matrix.
    """
    employment_years = {
        '< 1 year': 0.5, '1 year': 1, '2 years': 2, '3 years': 3,
        '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7,
        '8 years': 8, '9 years': 9, '10+ years': 10,
    }
    dropped_columns = ['employment', 'extended_reason', 'zipcode', 'ID']
    # Order matters: it fixes the order of the generated dummy columns.
    categorical_columns = ['race', 'reason_for_loan', 'employment_verified',
                           'type_of_application', 'state']
    target = 'loan_paid'

    # Work on a private copy so the caller's frame is never touched.
    data = data.copy()

    # Map the known labels, keep values that are already numeric, and turn any
    # unrecognised text into NaN so the column is always numeric afterwards.
    data['employment_length'] = pd.to_numeric(
        data['employment_length'].map(lambda v: employment_years.get(v, v)),
        errors='coerce',
    )

    # Make loan_duration numeric: take the first run of digits rather than a
    # fixed three-character slice, and let missing values become NaN instead
    # of raising.
    data['loan_duration'] = pd.to_numeric(
        data['loan_duration'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )

    data = data.drop(columns=dropped_columns)

    # Fill numeric feature columns with their mean (the target is excluded so
    # that labels are never fabricated).
    numeric_cols = data.select_dtypes('number').columns.drop(target, errors='ignore')
    if len(numeric_cols) > 0:
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())

    # Fill text columns with their most frequent value; a column with no
    # observed values has no mode and is left as-is.
    text_cols = [
        col for col in data.columns
        if col != target
        and (pd.api.types.is_object_dtype(data[col])
             or pd.api.types.is_string_dtype(data[col]))
    ]
    for col in text_cols:
        modes = data[col].mode()
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

    # Home ownership is encoded without a prefix, so its dummy columns are
    # named after the levels themselves.
    ownership = data['home_ownership_status'].replace(['NONE', 'ANY'], 'OTHER')
    ownership_dummies = pd.get_dummies(ownership, drop_first=True)
    other_dummies = pd.get_dummies(data[categorical_columns], drop_first=True)

    base = data.drop(columns=['home_ownership_status'] + categorical_columns)
    y = base[target] if target in base.columns else None

    # Column order: remaining raw columns, ownership dummies, other dummies.
    X = pd.concat(
        [base.drop(columns=[target], errors='ignore'), ownership_dummies, other_dummies],
        axis=1,
    )

    return (X, y)

In [3]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score

In [4]:
# Build the feature matrix X and the loan_paid target y from the raw training table.
X, y = preprocessv2(darsh_data) # preprocessing also drops the employment, extended_reason, zipcode and ID columns

In [5]:
# Hold out 25% of the rows for testing; stratify=y keeps the loan_paid class ratio equal in both splits.
# NOTE(review): the model-training cell re-runs preprocessing and re-splits with random_state=8,
# overwriting X, y and these four variables.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5, stratify=y)

In [6]:
X_train.shape # training split is 75% of the full 1,000,000-row dataset; the other 25% is held out for testing

(750000, 87)

In [7]:
# General imports
import pandas as pd
from sklearn import datasets
from collections import Counter
# Generate datasets
from sklearn.datasets import make_classification
from imblearn.datasets import make_imbalance
# Train, test, splits and gridsearch optimization
from sklearn.model_selection import train_test_split, GridSearchCV
# Class weights
from sklearn.utils import class_weight
# Performance
from sklearn.metrics import classification_report
# Modeling
import xgboost
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation warnings from the libraries used here; consider narrowing
# the filter to a specific category or module.
warnings.filterwarnings('ignore')

In [8]:
# https://stackoverflow.com/questions/67303447/how-to-use-downsampling-and-configure-class-weight-parameter-when-using-xgboost
from math import sqrt

# Rebuild the features and make a fresh stratified split
# (train_test_split defaults to a 25% test share when no size is given).
X, y = preprocessv2(darsh_data) #including feature drop off
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8, stratify=y)

# Get the counts of the training data per XGBoost documentation
# (scale_pos_weight is typically sum(negative) / sum(positive)).
counts = Counter(y_train)

# Label 1 is the majority class, so it is down-weighted: 0.25 is approximately
# counts[0] / counts[1]. Alternatives tried before settling on this value were
# sqrt(counts[0] / counts[1]) and the exact ratio counts[0] / counts[1].
# ^ mess with hyperparameters here ^
model_sqrt = xgboost.XGBClassifier(
    scale_pos_weight=.25,
    learning_rate=0.3874,
    max_depth=5,
    n_estimators=39,
    reg_alpha=0.03764,
    colsample_bytree=0.9068,
    gamma=5.295,
    min_child_weight=7.685,
)

model_sqrt.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9068,
              enable_categorical=False, gamma=5.295, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.3874, max_delta_step=0, max_depth=5,
              min_child_weight=7.685, missing=nan, monotone_constraints='()',
              n_estimators=39, n_jobs=8, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0.03764, reg_lambda=1,
              scale_pos_weight=0.25, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [9]:
# Training-split class counts: label 0 first, then label 1.
print(counts[0],counts[1])
# The ratio of 0s to 1s is ~0.25, which is the value passed as scale_pos_weight.
# Label 0 is the negative class; label 1 is the positive class.

150098 599902


In [10]:
# Predict labels for the held-out split and compute plain accuracy.
# NOTE(review): in the recorded run this accuracy (0.64586) is lower than the ~0.80
# share of label 1 in the data, i.e. lower than always predicting 1 would score;
# accuracy alone is a weak yardstick for this imbalanced target.
y_pred = model_sqrt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.64586

In [11]:
# scikit-learn layout: rows are true labels and columns are predicted labels, both ordered 0 then 1.
confusion_matrix(y_test,y_pred)

array([[ 31906,  18126],
       [ 70409, 129559]])

In [12]:
# Score the file to predict and assemble a two-column (ID, loan_paid) table.
testing_data_final = pd.read_csv('lending_topredict.csv')

# Same preprocessing as for training; the second return value is not used below.
final_dat, loan_paid = preprocessv2(testing_data_final)
final_pred = model_sqrt.predict(final_dat)

columns_titles = ["ID","loan_paid"]

# Predictions land in column 0 of a fresh frame; name that column, then attach
# the IDs from the raw file (aligned on the row index).
df = (
    pd.DataFrame(final_pred)
    .rename(columns={0: 'loan_paid'})
    .assign(ID=testing_data_final['ID'])
)

# Put ID first for the output file.
df2 = df.reindex(columns=columns_titles)

In [13]:
# Class balance of the training labels (the same check already appears earlier in the notebook).
darsh_data['loan_paid'].value_counts()

1    799870
0    200130
Name: loan_paid, dtype: int64

In [14]:
# Write the ID / loan_paid predictions to disk; index=False leaves the row index out of the file.
df2.to_csv('darsh_predsv16.csv', index = False)

In [15]:
# NOTE(review): neither frame is used anywhere else in the visible notebook, and the
# contents of these files are not described here; confirm they are still needed.
last = pd.read_csv("last.csv")
loss = pd.read_csv("loss.csv")

In [16]:
# Load another predictions file; presumably an earlier submission (v14, judging by the file name) - TODO confirm.
fourteen = pd.read_csv("darsh_predsv14.csv")

In [17]:
# Number of rows per loan_paid value in that file.
fourteen['loan_paid'].value_counts()

1    201107
0    144203
Name: loan_paid, dtype: int64