In [1]:
import pandas as pd

In [2]:
train_train_df = pd.read_csv("train_train_rm.csv", sep=";")
train_test_df = pd.read_csv("train_test_rm.csv", sep=";")

test_df = pd.read_csv("test_rm.csv", sep=";")

In [3]:
train_train_df = train_train_df.rename(columns=lambda x: x.replace("[", ""))  
train_train_df = train_train_df.rename(columns=lambda x: x.replace("]", ""))
train_train_df = train_train_df.rename(columns=lambda x: x.replace("<", "lt"))
train_test_df = train_test_df.rename(columns=lambda x: x.replace("[", ""))
train_test_df = train_test_df.rename(columns=lambda x: x.replace("]", ""))
train_test_df = train_test_df.rename(columns=lambda x: x.replace("<", "lt"))
test_df = test_df.rename(columns=lambda x: x.replace("[", ""))
test_df = test_df.rename(columns=lambda x: x.replace("]", ""))
test_df = test_df.rename(columns=lambda x: x.replace("<", "lt"))

## Random Forest Classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn import svm
import xgboost as xgb
from scipy.stats import uniform, randint

unwanted_features = ["status", "loan_id"]
features = [x for x in list(train_train_df) if x not in unwanted_features]
target = "status"

train_train_df = train_train_df.append(train_test_df, ignore_index=True)

X = train_train_df[features]
y = train_train_df[target]

X_test = train_test_df[features]
y_test = train_test_df[target]


rfc = RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [100, 150, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy'],
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3)

CV_rfc.fit(X, y)

print("Best Score:", CV_rfc.best_score_)
print("Best Params:", CV_rfc.best_params_)

Best Score: 0.8841534612176813
Best Params: {'criterion': 'gini', 'max_depth': 6, 'max_features': 'auto', 'n_estimators': 200}


In [5]:
clf_rfc = RandomForestClassifier(random_state=42, criterion='gini', max_depth=5, max_features='auto', n_estimators=200)

clf_rfc.fit(X, y)

RandomForestClassifier(max_depth=5, n_estimators=200, random_state=42)

In [6]:
# Predict the response for test dataset
y_pred = clf_rfc.predict_proba(X_test)[:, -1]

# Area Under the Curve, the higher the better
auc = metrics.roc_auc_score(y_test, y_pred)
print("AUC Score: ", auc)

AUC Score:  1.0


In [7]:
test = test_df[features]

confidences = clf_rfc.predict_proba(test)[:,-1]

confidences = [0 if x < 0.000001 else x for x in confidences]
confidences = ["{:f}".format(x) for x in confidences]

submit = pd.DataFrame()

submit["Id"] = test_df["loan_id"]
submit["Predicted"] = confidences

submit.to_csv("rf_python.csv", sep=",", index=False)

## Neural Network

In [67]:
from keras.models import Sequential
from keras.layers import Dense
import keras

# define the keras model
model = Sequential()
model.add(Dense(18, input_dim=52, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[keras.metrics.AUC()])
# fit the keras model on the dataset
model.fit(X, y, epochs=300, batch_size=32, verbose=0)
# evaluate the keras model
_, auc = model.evaluate(X, y)
print('AUC: %.2f' % auc)


AUC: 0.79


In [69]:
confidences = model.predict_proba(test)[:,-1]

confidences = [0 if x < 0.000001 else x for x in confidences]
confidences = ["{:f}".format(x) for x in confidences]

submit = pd.DataFrame()

submit["Id"] = test_df["loan_id"]
submit["Predicted"] = confidences

submit.to_csv("neuralnet_python.csv", sep=",", index=False)