In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# logistic regression
from sklearn.linear_model import LogisticRegression

# decision tree ;)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

# random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

import xgboost as xgb

In [None]:
# ! pip install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download -c caltech-cs155-2022-mp1
# ! unzip LOANS_TRAIN.csv.zip
# ! unzip LOANS_TEST.csv.zip

In [None]:
# PROCESSING TRAINING DATA SET

def float_percent(x):
    if type(x) == int:
      return x
    return float(x.strip('%'))/100

def int_emplength(x):
    if x == -1:
      return -1
    if x == '< 1 year' or x == 0:
        return 0
    nums = []
    for c in x:
        if c.isdigit():
            nums.append(c)
    num = ''.join(nums)
    if num == '':
        return 0
    return int(num)

# Ordinal with A = 7, G = 1 (Assumption A = best)
def convert_grade_to_numeric(grade):
  return 7 - int(ord(grade) - ord('A'))

# Ordinal with A = 7, G = 1 (Assumption A1 = best)
def convert_subgrade_to_numeric(grade):
  tier = convert_grade_to_numeric(grade[0])
  number = int(grade[1])
  return int((tier * 5) - number + 1)

def target_encode(train_data, test_data):
  train_states = train_data['addr_state']
  test_states = test_data['addr_state']
  vals = train_data['loan_status']

  state_count = {}
  for state in train_states:
    if state in state_count:
      state_count[state] += 1
    else:
      state_count[state] = 1

  state_dict = {}
  for state in range(len(train_states)):
    if train_states[state] in state_dict:
      state_dict[train_states[state]] += (vals[state] / state_count[train_states[state]])
    else:
      state_dict[train_states[state]] = (vals[state] / state_count[train_states[state]])

  train_data['addr_state'] = [state_dict[i] for i in train_data['addr_state']]
  test_data['addr_state'] = [state_dict[i] if i in state_dict else -1 for i in test_data['addr_state']]
  return train_data, test_data

def normalize(train, test):

  X_std = np.std(train, axis=0)
  X_mean = np.mean(train, axis=0)

  for k in range(len(train[0])):
    if X_std[k] != 0:
      train[:,k] = (train[:,k] - X_mean[k]) / X_std[k]
      test[:,k] = (test[:,k] - X_mean[k]) / X_std[k]

  return train, test

In [None]:
months = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4,
          'May':5, 'Jun':6, 'Jul':7, 'Aug':8,
          'Sep':9, 'Oct':10,'Nov':11,'Dec':12}

def preprocess_data(train_data, value):
  processed = {}
  train_data = train_data.fillna(-1)

  processed['loan_amnt'] = [i for i in train_data['loan_amnt']]
  processed['term (months)'] = [1 if i == 36 else -1 for i in train_data['term_(months)']]
  processed['int_rate'] = [float_percent(i) for i in train_data['int_rate']]
  processed['installment'] = [i for i in train_data['installment']]
  processed['sub_grade'] = [convert_subgrade_to_numeric(i) for i in train_data['sub_grade']]
  processed['emp_length'] = [int_emplength(i) for i in train_data['emp_length']]
  processed['home_ownership_mortgage'] = [1 if i == 'MORTGAGE' else 0 for i in train_data['home_ownership']]
  processed['home_ownership_rent'] = [1 if i == 'RENT' else 0 for i in train_data['home_ownership']]
  processed['home_ownership_own'] = [1 if i == 'OWN' else 0 for i in train_data['home_ownership']]
  processed['home_ownership_other'] = [1 if (i != 'OWN' and i != 'RENT' and i != 'MORTGAGE') else 0 for i in train_data['home_ownership']]
  processed['annual_inc'] = [i for i in train_data['annual_inc']]
  processed['verification_status_not_verified'] = [1 if i == 'Not Verified' else 0 for i in train_data['verification_status']]
  processed['verification_status_verified'] = [1 if i == 'Verified' else 0 for i in train_data['verification_status']]
  processed['verification_status_source_verified'] = [1 if i == 'Source Verified' else 0 for i in train_data['verification_status']]
  processed['verification_status_other'] = [1 if (i != 'OWN' and i != 'RENT' and i != 'MORTGAGE') else 0 for i in train_data['verification_status']]
  processed['purpose_debt_consolidation'] = [1 if i == 'debt_consolidation' else 0 for i in train_data['purpose']]
  processed['purpose_car'] = [1 if i == 'car' else 0 for i in train_data['purpose']]
  processed['purpose_moving'] = [1 if i == 'credit_card' else 0 for i in train_data['purpose']]
  processed['purpose_home_improvement'] = [1 if i == 'home_improvement' else 0 for i in train_data['purpose']]
  processed['purpose_credit_card'] = [1 if i == 'moving' else 0 for i in train_data['purpose']]
  processed['purpose_wedding'] = [1 if i == 'wedding' else 0 for i in train_data['purpose']]
  processed['purpose_major_purpose'] = [1 if i == 'major_purpose' else 0 for i in train_data['purpose']]
  processed['purpose_vacation'] = [1 if i == 'vacation' else 0 for i in train_data['purpose']]
  processed['purpose_small_business'] = [1 if i == 'small_business' else 0 for i in train_data['purpose']]
  processed['purpose_medical'] = [1 if i == 'medical' else 0 for i in train_data['purpose']]
  processed['purpose_renewable_energy'] = [1 if i == 'renewable_energy' else 0 for i in train_data['purpose']]
  processed['purpose_educational'] = [1 if i == 'educational' else 0 for i in train_data['purpose']]
  processed['purpose_house'] = [1 if i == 'house' else 0 for i in train_data['purpose']]
  processed['purpose_other'] = [1 if i == 'other' else 0 for i in train_data['purpose']]
  processed['zip_code'] = [int(i.strip('xx')) for i in train_data['zip_code']]
  processed['dti'] = [i for i in train_data['dti']]
  processed['earliest_cr_line_month'] = [int(months[i.split('-')[0]]) for i in train_data['earliest_cr_line']]
  processed['earliest_cr_line_year'] = [int(i.split('-')[1]) for i in train_data['earliest_cr_line']]
  processed['open_acc'] = [i for i in train_data['open_acc']]
  processed['pub_rec'] = [i for i in train_data['open_acc']]
  processed['revol_bal'] = [i for i in train_data['revol_bal']]
  processed['revol_util'] = [float_percent(i) for i in train_data['revol_util']]
  processed['total_acc'] = [i for i in train_data['total_acc']]
  processed['mort_acc'] = [i for i in train_data['mort_acc']]
  processed['pub_rec_bankruptcies'] = [i for i in train_data['pub_rec_bankruptcies']]
  processed['addr_state'] = [i for i in train_data['addr_state']]

  if value:
    processed['loan_status'] = [0 if i == 'Fully Paid' else 1 for i in train_data['loan_status']]

  df = pd.DataFrame(data=processed)

  return df


In [None]:
train_df = pd.read_csv('/content/LOANS_TRAIN.csv')
test_df = pd.read_csv('/content/LOANS_TEST.csv')
print(len(train_df))
print(len(test_df))

train = preprocess_data(train_df.iloc[:,1:], True)
test = preprocess_data(test_df.iloc[:,1:], False)

train, test = target_encode(train, test)
train.fillna(-1)
test.fillna(-1)

full_train_data = train.to_numpy()
test_data = test.to_numpy()

X_train = full_train_data[:,:-1]
Y_train = full_train_data[:,-1]
X_test = test_data
X_train, X_test = normalize(X_train, X_test)

197250
42480


In [None]:
features = ['loan_amnt', 'term (months)','int_rate','installment','subgrade','emp_length',
          'home_ownership_mortgage','home_ownership_rent','home_ownership_own','home_ownership_other',
          'annual_inc','verification_status_not_verified','verification_status_verified',
          'verification_status_source_verified','verification_status_other','purpose_debt_consolidation',
          'purpose_car','purpose_moving','purpose_home_improvement','purpose_credit_card',
          'purpose_wedding','purpose_major_purpose','purpose_vacation','purpose_small_business',
          'purpose_medical','purpose_renewable_energy','purpose_educational','purpose_house',
          'purpose_other','zip_code','dti','earliest_cr_line_month','earliest_cr_line_year',
          'open_acc','pub_rec','revol_bal','revol_util','total_acc','mort_acc',
          'pub_rec_bankruptcies','addr_state']   

logistic = SGDClassifier(loss='log',penalty='l1',alpha=1)
logistic.fit(X_train, Y_train)

plt.figure(figsize=(20,10))
plt.bar(features,logistic.coef_)
plt.xlabel('Feature')
plt.ylabel('Weight')
plt.ylim(bottom=-1000,top=1000)
plt.title('L1 Regularized Logistic Regression Weights')
plt.show()

NameError: ignored

In [None]:
from tqdm import tqdm

# self-implemented grid-search
eta = [i*0.01 for i in range(5,36,5)]
max_depth = [i for i in range(3,9)]
min_child_weight = [i for i in range(3,9)]
gamma = [0.1*i for i in range(5, 35, 5)]
n_estimators = [i*100 for i in range(11)]

errors = []

train_error = []
test_error = []
# for i in tqdm(params):
for a in eta:
  print("eta: " + str(a))
  for b in max_depth:
    print("   max_depth: " + str(b))
    for c in min_child_weight:
      print("       min_child_weight: " + str(c))
      for d in n_estimators:
          print("           n_estimators: " + str(d))
          x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3)

          param = {
            'eta': a, 
            'max_depth': b,  
            'min_child_weight': c,
            'subsample':0.6,
            'gamma':1,
            'colsample_bytree':0.8,
            'n_estimators': d,
            'objective': 'binary:logistic',  
            'tree_method':'exact',
            'grow_policy':'lossguide'
          } 


          clf = xgb.XGBModel(**param)

          clf.fit(x_train, y_train,
                  eval_set=[(x_train, y_train), (x_test, y_test)],
                  eval_metric='auc',
                  verbose=True)

          # evals_result = clf.evals_result()

          # train_pred = model.predict(D_train)
          train_pred = clf.predict(x_train)
          test_pred = clf.predict(x_test)

          vals = [a,b,c,d,roc_auc_score(y_train, train_pred),roc_auc_score(y_test, test_pred)]
          errors.append(vals)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[326]	validation_0-auc:0.739825	validation_1-auc:0.691523
[327]	validation_0-auc:0.739931	validation_1-auc:0.691485
[328]	validation_0-auc:0.740021	validation_1-auc:0.691432
[329]	validation_0-auc:0.740073	validation_1-auc:0.691427
[330]	validation_0-auc:0.740247	validation_1-auc:0.691378
[331]	validation_0-auc:0.740343	validation_1-auc:0.691347
[332]	validation_0-auc:0.74046	validation_1-auc:0.691341
[333]	validation_0-auc:0.740554	validation_1-auc:0.691286
[334]	validation_0-auc:0.740535	validation_1-auc:0.691262
[335]	validation_0-auc:0.740659	validation_1-auc:0.691275
[336]	validation_0-auc:0.74079	validation_1-auc:0.691385
[337]	validation_0-auc:0.74085	validation_1-auc:0.691314
[338]	validation_0-auc:0.740933	validation_1-auc:0.691293
[339]	validation_0-auc:0.741111	validation_1-auc:0.691332
[340]	validation_0-auc:0.741167	validation_1-auc:0.691346
[341]	validation_0-auc:0.741235	validation_1-auc:0.691363
[342]	vali

In [None]:
# Return the best parameters
print(errors(np.argmax(errors[:][-1]))

In [None]:
# plt.figure()
# plt.plot(params, train_error, label='training')
# plt.plot(params, test_error, label='testing')
# plt.xlabel('max_depth')
# plt.ylabel('mean')
# plt.legend(loc='upper right')
# plt.show()
# plt.savefig('max_depth vs roc_auc')

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train)

param = {
    'eta': 0.1, 
    'max_depth': 3,  
    'min_child_weight': 6,
    'subsample':0.6,
    'gamma':1,
    'colsample_bytree':0.8,
    'n_estimators': 1000,
    'objective': 'binary:logistic',  
    'tree_method':'exact',
    'grow_policy':'lossguide'
} 


clf = xgb.XGBModel(**param)

clf.fit(x_train, y_train,
        eval_set=[(x_train, y_train), (x_test, y_test)],
        eval_metric='auc',
        verbose=True)

evals_result = clf.evals_result()

[0]	validation_0-auc:0.663257	validation_1-auc:0.664094
[1]	validation_0-auc:0.668916	validation_1-auc:0.669078
[2]	validation_0-auc:0.675345	validation_1-auc:0.675537
[3]	validation_0-auc:0.675826	validation_1-auc:0.67566
[4]	validation_0-auc:0.676452	validation_1-auc:0.676797
[5]	validation_0-auc:0.676847	validation_1-auc:0.677676
[6]	validation_0-auc:0.677439	validation_1-auc:0.677866
[7]	validation_0-auc:0.678201	validation_1-auc:0.678787
[8]	validation_0-auc:0.678593	validation_1-auc:0.679486
[9]	validation_0-auc:0.678641	validation_1-auc:0.679775
[10]	validation_0-auc:0.679017	validation_1-auc:0.680358
[11]	validation_0-auc:0.679661	validation_1-auc:0.681219
[12]	validation_0-auc:0.680106	validation_1-auc:0.681672
[13]	validation_0-auc:0.680364	validation_1-auc:0.681871
[14]	validation_0-auc:0.680792	validation_1-auc:0.682456
[15]	validation_0-auc:0.681102	validation_1-auc:0.682946
[16]	validation_0-auc:0.681436	validation_1-auc:0.683358
[17]	validation_0-auc:0.681638	validation_

In [None]:
best_preds = clf.predict(X_test)
print(best_preds)
print("mean of preds: " + str(np.mean(best_preds)))

submission = {'id': [i for i in range(200000, 242480)],
              'loan_status': best_preds
              }

preds_df = pd.DataFrame(submission)
preds_df.to_csv('/content/submission.csv', index=False)

[0.01337957 0.10283218 0.02990655 ... 0.33296025 0.13993134 0.20638372]
mean of preds: 0.14960569
