In [33]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

In [34]:
# Load the labelled training set and keep the target column aside as `y`.
train = pd.read_csv('train_fNxu4vz.csv')
y = train['Interest_Rate']

In [35]:
# Load the test set; its Loan_ID column is reused later to build the output file.
test = pd.read_csv('test_fjtUOL8.csv')

In [36]:
# Feature-only view of the training data: `drop` returns a new frame, so
# `train` itself keeps its Interest_Rate column.
train_2 = train.drop(columns=['Interest_Rate'])

In [37]:
# Stack the training features on top of the test rows so that every cleaning
# and encoding step below is applied to both at once; Loan_ID is removed from
# the combined frame.
data = pd.concat([train_2, test], axis=0).drop(columns=['Loan_ID'])

In [38]:
# Pairwise correlations of the numeric columns. At this point `data` still
# contains string columns, so the numeric subset is selected explicitly rather
# than relying on `corr()` to discard them (pandas >= 2.0 raises instead).
data.select_dtypes(include='number').corr()

Unnamed: 0,Annual_Income,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts
Annual_Income,1.0,-0.180756,0.058453,-0.055736,0.149062,0.211464
Debt_To_Income,-0.180756,1.0,-0.005079,0.006626,0.303119,0.228669
Inquiries_Last_6Mo,0.058453,-0.005079,1.0,0.011673,0.099949,0.132369
Months_Since_Deliquency,-0.055736,0.006626,0.011673,1.0,-0.050122,-0.060633
Number_Open_Accounts,0.149062,0.303119,0.099949,-0.050122,1.0,0.681922
Total_Accounts,0.211464,0.228669,0.132369,-0.060633,0.681922,1.0


## Missing Values

In [39]:
# Replace missing values with explicit placeholders, one per column.
# A single frame-level fillna is used instead of `data[col].fillna(..., inplace=True)`:
# the in-place call on a column selection is a chained assignment, which newer
# pandas versions warn about and which does nothing under Copy-on-Write.
#   * 'other' in Length_Employed is turned into 9999 later by extract_num.
#   * The numeric placeholders are deliberately far outside the visible value
#     ranges so that filled rows remain distinguishable.
data = data.fillna({
    'Length_Employed': 'other',
    'Home_Owner': 'Other',
    'Annual_Income': 999999999999,
    'Months_Since_Deliquency': 999,
})

## Data Cleanup

In [52]:
 def extract_num(a_string):
    """Convert an employment-length label into a number of years.

    Only the first whitespace-separated token of the label is inspected:

    * a purely numeric token is returned as an int;
    * '10+' is returned as 10;
    * 'other' (the placeholder used for missing values) is returned as 9999;
    * any other token is returned as 1.

    None, an empty string, or a whitespace-only string has no first token;
    it is treated like the missing-value placeholder and returns 9999
    instead of raising.
    """
    tokens = a_string.split() if a_string is not None else []
    if not tokens:
        return 9999

    first = tokens[0]
    if first.isdigit():
        return int(first)
    if first == '10+':
        return 10
    if first == 'other':
        return 9999
    return 1

In [41]:
# Turn the employment-length labels into numbers, row by row.
data['Length_Employed'] = data['Length_Employed'].apply(extract_num)

In [42]:
def no_commas(string):
    """Parse an integer written with comma separators, e.g. '12,500' -> 12500."""
    pieces = string.split(',')
    return int(''.join(pieces))

In [43]:
# Requested amounts are stored as comma-separated strings; convert them to ints.
data['Loan_Amount_Requested'] = data['Loan_Amount_Requested'].apply(no_commas)

In [44]:
def frequency_encoding(data, col):
    """Replace a categorical column with the relative frequency of each value.

    Adds a column named 'enc_' + col holding, for every row, the share of rows
    in `data` that carry the same value in `col`, then removes `col`.
    `data` is modified in place.

    Returns the value -> frequency mapping as a Series indexed by category.
    """
    share = data.groupby(col).size() / len(data)
    data['enc_' + col] = data[col].map(share)
    data.drop(columns=col, inplace=True)
    return share

In [45]:
# Swap Home_Owner for its frequency-encoded column (enc_Home_Owner); the
# returned value -> frequency mapping is displayed as the cell output.
frequency_encoding(data, 'Home_Owner')

Home_Owner
Mortgage    0.428227
None        0.000073
Other       0.153876
Own         0.076242
Rent        0.341581
dtype: float64

In [46]:
# Swap Income_Verified for its frequency-encoded column (enc_Income_Verified);
# the returned value -> frequency mapping is displayed as the cell output.
frequency_encoding(data, 'Income_Verified')

Income_Verified
VERIFIED - income           0.361789
VERIFIED - income source    0.322268
not verified                0.315943
dtype: float64

In [47]:
# Swap Purpose_Of_Loan for its frequency-encoded column (enc_Purpose_Of_Loan);
# the returned value -> frequency mapping is displayed as the cell output.
frequency_encoding(data, 'Purpose_Of_Loan')

Purpose_Of_Loan
car                   0.011499
credit_card           0.224188
debt_consolidation    0.589385
educational           0.000727
home_improvement      0.056436
house                 0.004806
major_purchase        0.020858
medical               0.009615
moving                0.006116
other                 0.050561
renewable_energy      0.000752
small_business        0.014840
vacation              0.005306
wedding               0.004911
dtype: float64

In [48]:
# Swap Gender for its frequency-encoded column (enc_Gender); the returned
# value -> frequency mapping is displayed as the cell output.
frequency_encoding(data, 'Gender')

Gender
Female    0.287957
Male      0.712043
dtype: float64

## Feature Generation

In [49]:
# Derived features:
#   Monthly_Income  - annual income spread over twelve months
#   Debt            - debt-to-income figure multiplied by monthly income
#   Latent_Accounts - accounts on file that are not currently open
data['Monthly_Income'] = data['Annual_Income'] / 12
data['Debt'] = data['Debt_To_Income'] * data['Monthly_Income']
data['Latent_Accounts'] = data['Total_Accounts'] - data['Number_Open_Accounts']

In [50]:
# Binary flag: 1 for applicants with six or more years of employment, else 0.
# The column is built in one assignment rather than `data.Experienced[mask] = 1`,
# which is a chained assignment (SettingWithCopyWarning; no effect under
# Copy-on-Write).
# 9999 is the placeholder extract_num returns for a missing employment length;
# it is excluded here, otherwise every row with an unknown length would be
# flagged as experienced simply because 9999 >= 6.
employed_years = data['Length_Employed']
data['Experienced'] = ((employed_years >= 6) & (employed_years != 9999)).astype(int)

In [51]:
# Requested amount relative to income, on a monthly and on an annual basis.
# Both are computed before the transforms below, i.e. on the raw income values.
data['LoanRequest_to_MonthlyIncome_Ratio'] = (
    data['Loan_Amount_Requested'] / data['Monthly_Income']
)
data['LoanRequest_to_AnnualIncome_Ratio'] = (
    data['Loan_Amount_Requested'] / data['Annual_Income']
)

## Data Transformation

In [19]:
# Compress the value ranges: square root for amount/count-like columns,
# natural log for the income columns. Each column is transformed independently.
sqrt_columns = [
    'Loan_Amount_Requested',
    'Number_Open_Accounts',
    'Total_Accounts',
    'Debt',
    'Latent_Accounts',
]
log_columns = ['Annual_Income', 'Monthly_Income']

for column in sqrt_columns:
    # A negative input (possible for Latent_Accounts, which is a difference)
    # becomes NaN here.
    data[column] = np.sqrt(data[column])

for column in log_columns:
    data[column] = np.log(data[column])

## Data Scaling

In [20]:
# Standardise every feature. The scaler is fitted on the combined
# train + test frame, so both parts share the same mean and variance.
idx = data.columns

scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
# fit_transform returns a bare array; restore the column names.
scaled_data = pd.DataFrame(scaled_data, columns=idx)

copy_scaled_data = scaled_data.copy()

# Split back by position. The training rows were concatenated first, so the
# boundary is the number of training rows; it is taken from `train` instead of
# being hard-coded, so the split stays correct if the input file changes.
# Independent copies are taken so that later in-place edits of either part
# never act on a slice of `copy_scaled_data`.
n_train = len(train)
train_final = copy_scaled_data.iloc[:n_train].copy()
test_final = copy_scaled_data.iloc[n_train:].copy()

## Final Model

In [21]:
# Base classifier. Its max_depth=6 is only a starting value: the grid below
# overrides max_depth for every candidate it fits.
xgb = XGBClassifier(random_state=96, colsample_bytree=0.7, max_depth=6, gamma=.01)

# Single-candidate grid, i.e. one cross-validated fit of a fixed configuration.
optimization_dict = {'max_depth': [4], 'n_estimators': [200]}

# cv is pinned to 3 folds, matching the recorded run ("Fitting 3 folds ...").
# Left unspecified, the fold count depends on the installed scikit-learn
# version (the default changed from 3 to 5).
model = GridSearchCV(xgb, optimization_dict, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

In [22]:
# Run the grid search on the scaled training features against the
# Interest_Rate target loaded at the top of the notebook.
model.fit(train_final, y)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   59.0s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.7, gamma=0.01,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=6, min_child_weight=None,
                                     missing=nan, monotone_constra...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=96, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=

## Prediction and Output

In [23]:
# Fill the NaNs left in the scaled test features. The result is rebound rather
# than filled in place, because `test_final` originates from a slice of the
# scaled frame and in-place edits of a slice trigger SettingWithCopyWarning.
# NOTE(review): the same constant is used for every column, and only the test
# split is filled (the model is fitted on train_final as-is) - confirm that
# both the value and the asymmetry are intended.
test_final = test_final.fillna(-0.548024)

# Identifiers for the submission file, taken from the untouched test frame.
ID = test.Loan_ID

In [24]:
# Predict the class for every test row and write the submission file:
# one row per loan, with the identifier column followed by the prediction.
final = model.predict(test_final)
dv = pd.DataFrame({ID.name: ID, 'Interest_Rate': final})
dv.to_csv('result.csv', index=False)