In [22]:
# imports
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import GradientBoostingClassifier

In [23]:
# Load the preprocessed training data and display how often each
# `loan_paid` value occurs.
df = pd.read_csv('data/train_preprocessed_data.csv')
# Use the fully qualified option name: on pandas >= 1.4 the abbreviated
# 'max_columns' pattern also matches 'styler.render.max_columns' and
# set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
df['loan_paid'].value_counts()

1    776024
0    191895
Name: loan_paid, dtype: int64

In [24]:
# Split the frame into a feature matrix (every column except `loan_paid`)
# and a label vector (`loan_paid`), both as numpy arrays.
features = df.drop(columns=['loan_paid'])
x_train = features.to_numpy()
y_train = df['loan_paid'].to_numpy()

In [25]:
# Zero out infinite entries in the training features, then print the
# positions of any infinities still present in the features and in the
# labels (both index tuples should come back empty).
train_inf_mask = np.isinf(x_train)
x_train[train_inf_mask] = 0
remaining_feature_infs = np.where(np.isinf(x_train))
print(remaining_feature_infs)

remaining_label_infs = np.where(np.isinf(y_train))
print(remaining_label_infs)

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64),)


In [26]:
# Read the pickled sample weights from disk.
# NOTE(review): pickle.load executes arbitrary code from the file, so this
# path must only ever point at a trusted, locally produced artifact.
with open('models/sample_weights.pickle', 'rb') as weights_file:
    sample_weights = pickle.load(weights_file)

In [27]:
# gradient boosted classifier model
# 250 boosting stages of depth-2 trees, each fitted on the full sample
# (subsample=1.0). random_state is pinned so that a fresh-kernel refit
# reproduces the same model: it seeds every tree and the feature
# permutation tried at each split.
model = GradientBoostingClassifier(
    n_estimators=250,
    max_depth=2,
    subsample=1.0,
    random_state=0,
)
# Pass the weights by keyword so they cannot be confused with any other
# positional argument of fit().
model.fit(x_train, y_train, sample_weight=sample_weights)

GradientBoostingClassifier(max_depth=2, n_estimators=250, subsample=1)

In [28]:
# Unweighted accuracy of the fitted model on the same data it was trained
# on (no sample weights are passed to score()).
train_accuracy = model.score(x_train, y_train)
print("Accuracy on training data:", train_accuracy)

Accuracy on training data: 0.8040528184693141


In [29]:
# Read the preprocessed prediction set, keep its `ID` column aside for
# the submission file, and use the remaining columns as model input.
val_frame = pd.read_csv('data/predict_preprocessed_data.csv')
x_val_id = val_frame['ID']
x_val = val_frame.drop(columns=['ID'])

In [30]:
# Convert the prediction features to a numpy array, print where the
# infinite entries are, zero them out, and display the (now expected to
# be empty) positions of any infinities that remain.
x_val = x_val.to_numpy()
val_inf_mask = np.isinf(x_val)
print(np.where(val_inf_mask))
x_val[val_inf_mask] = 0
np.where(np.isinf(x_val))

(array([140519]), array([12]))


(array([], dtype=int64), array([], dtype=int64))

In [31]:
# Predict a class label for every row of the prediction feature array.
pred = model.predict(X=x_val)

In [32]:
# creating submission
# Build the two-column frame (ID, predicted loan_paid) in a single
# constructor call and write it without the row index. `index` is
# documented as a bool, so pass False explicitly instead of relying on
# None being treated as falsy.
submission = pd.DataFrame({'ID': x_val_id, 'loan_paid': pred})
submission.to_csv('submissions/pred.csv', index=False)