In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# logistic regression
from sklearn.linear_model import LogisticRegression

# decision tree ;)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

# random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

import xgboost as xgb

In [None]:
# Install the Kaggle CLI; %pip (unlike !pip) installs into the running kernel's environment.
%pip install kaggle
# The Kaggle CLI looks for its API token at ~/.kaggle/kaggle.json.
# -p keeps this cell re-runnable when the directory already exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c caltech-cs155-2022-mp1
# -o overwrites previously extracted files instead of prompting, so a re-run does not stall.
! unzip -o LOANS_TRAIN.csv.zip
! unzip -o LOANS_TEST.csv.zip

Downloading sample_submission.csv to /content
  0% 0.00/535k [00:00<?, ?B/s]
100% 535k/535k [00:00<00:00, 71.7MB/s]
Downloading LOANS_TEST.csv.zip to /content
  0% 0.00/2.31M [00:00<?, ?B/s]
100% 2.31M/2.31M [00:00<00:00, 129MB/s]
Downloading LOANS_TRAIN.csv.zip to /content
 68% 7.00M/10.3M [00:00<00:00, 70.9MB/s]
100% 10.3M/10.3M [00:00<00:00, 66.0MB/s]
Downloading LCDataDictionary.xlsx to /content
  0% 0.00/12.5k [00:00<?, ?B/s]
100% 12.5k/12.5k [00:00<00:00, 12.9MB/s]
Archive:  LOANS_TRAIN.csv.zip
  inflating: LOANS_TRAIN.csv         
Archive:  LOANS_TEST.csv.zip
  inflating: LOANS_TEST.csv          


In [None]:
# PROCESSING TRAINING DATA SET

def float_percent(x):
    """Convert a percentage string such as ``'50%'`` into a fraction (``0.5``).

    Any leading or trailing ``%`` characters are stripped before parsing, so a
    bare numeric string like ``'0'`` is accepted as well.
    """
    numeric_text = x.strip('%')
    percent_value = float(numeric_text)
    return percent_value / 100

def int_emplength(x):
    """Extract the number of years from an employment-length label.

    ``'< 1 year'`` and the numeric placeholder ``0`` both map to 0. For any
    other label, every digit character is concatenated and parsed as an int
    (``'10+ years'`` -> 10). A label containing no digits also yields 0.
    """
    if x in ('< 1 year', 0):
        return 0
    digits = ''.join(ch for ch in x if ch.isdigit())
    return int(digits) if digits else 0

# Ordinal grade encoding: 'A' -> 7 down to 'G' -> 1 (assumes A is the best grade).
def convert_grade_to_numeric(grade):
  """Map a single-letter grade to its ordinal score; each letter after 'A' costs one point."""
  steps_below_a = ord(grade) - ord('A')
  return 7 - steps_below_a

# Ordinal sub-grade encoding: 'A1' -> 35 down to 'G5' -> 1 (assumes A1 is the best).
def convert_subgrade_to_numeric(grade):
  """Map a two-character sub-grade (letter + digit) to an ordinal score.

  Each letter grade spans five consecutive scores; within a letter, a lower
  digit gives a higher score.
  """
  letter = grade[0]
  step = int(grade[1])
  return convert_grade_to_numeric(letter) * 5 + 1 - step

def one_hot_encode_home_ownership(X):
  """One-hot encode the home-ownership value stored at column index 8 of each row.

  Returns three parallel lists (rent, mortgage, own), each with one 0/1 entry
  per row of ``X``. A row whose value is none of 'RENT', 'MORTGAGE' or 'OWN'
  gets 0 in all three lists.
  """
  rent_flags, mortgage_flags, own_flags = [], [], []
  for row_idx in range(len(X)):
    status = X[row_idx][8]
    rent_flags.append(1 if status == 'RENT' else 0)
    mortgage_flags.append(1 if status == 'MORTGAGE' else 0)
    own_flags.append(1 if status == 'OWN' else 0)
  return rent_flags, mortgage_flags, own_flags

def one_hot_encode_verification_status(X):
  """One-hot encode the verification status stored at column index 10 of each row.

  Returns three parallel lists (verified, source verified, not verified),
  each with exactly one 0/1 entry per row of ``X``.

  A row whose value is none of 'Verified', 'Source Verified' or
  'Not Verified' (for example a filled-in missing value) gets 0 in all three
  lists. Previously such a row appended nothing, leaving the lists shorter
  than ``X`` and out of step with every other feature column.
  """
  verified_flags, source_flags, not_verified_flags = [], [], []
  for i in range(len(X)):
    status = X[i][10]
    # Always append to all three lists so they stay aligned with the rows of X.
    verified_flags.append(1 if status == 'Verified' else 0)
    source_flags.append(1 if status == 'Source Verified' else 0)
    not_verified_flags.append(1 if status == 'Not Verified' else 0)
  return verified_flags, source_flags, not_verified_flags


def one_hot_encode_initial_list_status(X):
  """One-hot encode the initial list status stored at column index 23 of each row.

  Returns two parallel lists (``f`` flags, ``w`` flags), each with exactly one
  0/1 entry per row of ``X``.

  A row whose value is neither 'f' nor 'w' gets 0 in both lists. Previously
  such a row appended nothing, so the lists could end up shorter than ``X``
  and misaligned with the other feature columns.
  """
  f = []
  w = []
  for i in range(len(X)):
    status = X[i][23]
    # Always append to both lists so they stay aligned with the rows of X.
    f.append(1 if status == 'f' else 0)
    w.append(1 if status == 'w' else 0)
  return f, w

In [None]:
def process(X, boolean):
  """Build the feature DataFrame from a positionally indexed raw loan table.

  Args:
    X: 2-D array of raw loan rows. Columns are read by position; the label
      branch uses ``X[:, -1]`` and ``X.shape``, so a NumPy array is expected
      when ``boolean`` is truthy.
    boolean: when truthy, the last column of ``X`` is read as the loan status
      and a binary ``loan_status`` column is appended
      (0 for 'Fully Paid', 1 for anything else).

  Returns:
    pandas.DataFrame with one row per row of ``X`` and the feature columns in
    the order listed in ``data`` below.
  """
  n_rows = len(X)

  def column(idx, convert=None):
    """Return column ``idx`` of X as a list, optionally converting every cell."""
    if convert is None:
      return [X[r][idx] for r in range(n_rows)]
    return [convert(X[r][idx]) for r in range(n_rows)]

  def percent(cell):
    # Cells are stringified first so non-string values can be parsed too.
    return float_percent(str(cell))

  rent, mortgage, own = one_hot_encode_home_ownership(X)
  verified, source, not_verified = one_hot_encode_verification_status(X)
  # NOTE(review): the initial-list-status encoding is computed but never added
  # to the frame below -- confirm whether it was meant to be a feature.
  initial_status_f, initial_status_w = one_hot_encode_initial_list_status(X)

  data = {
      'loan_amnt': column(0),
      'term (months)': column(1),
      'int_rate': column(2, percent),
      'installment': column(3),
      'grade': column(4, convert_grade_to_numeric),
      'subgrade': column(5, convert_subgrade_to_numeric),
      'emp_length': column(7, int_emplength),
      'home_ownership_rent': rent,
      'home_ownership_mortgage': mortgage,
      'home_ownership_own': own,
      'annual_inc': column(9),
      'verification_status_verified': verified,
      'verification_status_source': source,
      'verification_status_not_verified': not_verified,
      'dti': column(16),
      'open_acc': column(18),
      'pub_rec': column(19),
      'revol_bal': column(20),
      'revol_util': column(21, percent),
      'pub_rec_bank': column(26),
  }

  if boolean:
    # Binary target: 0 = 'Fully Paid', 1 = any other status.
    data['loan_status'] = [
        0 if X[r, -1] == 'Fully Paid' else 1 for r in range(X.shape[0])
    ]

  return pd.DataFrame(data=data)

In [None]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

def normalize(train, test):
  """Standardize ``train`` and ``test`` column-wise using statistics from ``train`` only.

  Each column is shifted by the training mean and divided by the training
  standard deviation, so no information from ``test`` leaks into the scaling.

  Args:
    train: 2-D array-like of shape (n_train, n_features).
    test: 2-D array-like with the same number of columns as ``train``.

  Returns:
    (train_scaled, test_scaled): new float arrays. The inputs are left
    untouched, so re-running a cell cannot normalize the same data twice.

  Notes:
    - Columns that are constant in ``train`` (std == 0) are divided by 1
      instead of 0, so they become all zeros rather than NaN/inf.
    - Inputs are converted to float first; integer arrays are therefore no
      longer truncated by an in-place assignment.
  """
  train = np.asarray(train, dtype=float)
  test = np.asarray(test, dtype=float)

  X_mean = np.mean(train, axis=0)
  X_std = np.std(train, axis=0)
  # Guard against division by zero for constant features.
  safe_std = np.where(X_std == 0, 1.0, X_std)

  # Broadcasting applies the per-column statistics to every row at once.
  return (train - X_mean) / safe_std, (test - X_mean) / safe_std

In [None]:
# Load the labelled training CSV and replace every missing value with 0.
train_df = pd.read_csv('/content/LOANS_TRAIN.csv')
# train_df = train_df.dropna(subset=['emp_length'])
train_data = train_df.fillna(0)
# Drop the first CSV column (presumably a row id -- confirm) so the positional
# indices used inside process() line up; True asks process() to append loan_status.
train = process(train_data.to_numpy()[:,1:], True)
full_train_data = train.to_numpy()
print(len(train_df))

# Same pipeline for the unlabelled test CSV; False means no loan_status column is built.
test_df = pd.read_csv('/content/LOANS_TEST.csv')
# test_df = test_df.dropna(subset=['emp_length'])
test_data = test_df.fillna(0)
test = process(test_data.to_numpy()[:,1:], False)
# NOTE: test_data is rebound here from the filled DataFrame to the processed ndarray.
test_data = test.to_numpy()
print(len(test_df))

# loan_status is the last column of the processed training frame: split it off as the target.
X_train = full_train_data[:,:-1]
Y_train = full_train_data[:,-1]
X_test = test_data
# Standardize both sets with the training set's per-column mean and std.
X_train, X_test = normalize(X_train, X_test)


FileNotFoundError: ignored

In [None]:
# training_data = train[['loan_amnt', 'term (months)', 'installment', 'grade', 'subgrade', 'annual_inc', 'dti', 'open_acc', 'revol_bal', 'loan_status']]
# testing_data = test[['loan_amnt', 'term (months)', 'installment', 'grade', 'subgrade', 'annual_inc', 'dti', 'open_acc', 'revol_bal']]
# X_train = training_data.to_numpy()[:,:-1]
# Y_train = training_data.to_numpy()[:,-1]
# X_test = testing_data.to_numpy()

In [None]:
# from sklearn.metrics import precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
from tqdm import tqdm


# Hold out 30% of the labelled rows for validation (seeded so re-runs are reproducible).
# X_train was already standardized together with X_test in the data-loading cell, so the
# split is deliberately NOT normalized a second time: re-scaling it with its own statistics
# would put the model's training features on a different scale from the X_test rows scored
# at the end of this cell.
x_train, x_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.3, random_state=0)

# Booster parameters for the native xgb.train API.
param = {
    'eta': 0.15,
    'max_depth': 20,
    'min_child_weight': 5,
    'nthread': 30,
    'verbosity': 1,
    'objective': 'multi:softprob',
    'num_class': 2
    }

# 'n_estimators' belongs to the scikit-learn wrapper and is not used by xgb.train, whose
# number of trees is set by num_boost_round (default 10). Pass the intended 300 explicitly.
NUM_BOOST_ROUND = 300


def report_predictions(class_probs, labels=None):
  """Print summary statistics for a (n_rows, 2) matrix of class probabilities.

  Args:
    class_probs: per-class probabilities as returned by the 'multi:softprob' objective.
    labels: optional ground-truth 0/1 labels; when given, accuracy and ROC AUC are printed.

  Returns:
    The hard 0/1 predictions (argmax over the class axis).
  """
  hard_preds = np.argmax(class_probs, axis=1)
  print("mean of preds: " + str(np.mean(hard_preds)))
  print("0: " + str(len(hard_preds) - np.sum(hard_preds)) + "   1: " + str(np.sum(hard_preds)))
  if labels is not None:
    print("Accuracy: " + str(np.mean(hard_preds == labels)))
    # roc_auc_score expects (y_true, y_score); score with the positive-class probability
    # rather than hard labels so the ranking quality is actually measured.
    print("roc_auc: " + str(roc_auc_score(labels, class_probs[:, 1])))
  return hard_preds


D_train = xgb.DMatrix(x_train, label=y_train)
D_test = xgb.DMatrix(x_test, label=y_test)

model = xgb.train(param, D_train, num_boost_round=NUM_BOOST_ROUND)

# Training split (evaluated against y_train, not y_test).
preds = model.predict(D_train)
best_preds = report_predictions(preds, y_train)

print()

# Held-out validation split.
preds = model.predict(D_test)
best_preds = report_predictions(preds, y_test)

print()

# Unlabelled competition test set: only the prediction distribution can be reported.
D_test = xgb.DMatrix(X_test)
preds = model.predict(D_test)
best_preds = report_predictions(preds)

In [None]:

# submission = {'id': [i for i in range(200000, 242480)],
#               'loan_status': p_final
#               }

# preds_df = pd.DataFrame(submission)
# preds_df.to_csv('/content/submission.csv')

In [None]:
 
# https://towardsdatascience.com/a-beginners-guide-to-xgboost-87f5d4c30ed7

In [None]:
15091