In [1]:
import pandas as pd

In [2]:
# Load the loan sample, using the SK_ID_CURR column as the index.
# The file location can be overridden with the LOAN_DATA_CSV environment variable so the
# notebook is not tied to one machine; the original path is kept as the default.
import os

DATA_PATH = os.environ.get(
    "LOAN_DATA_CSV",
    r"C:\Users\David gathara marigi\Downloads\loan_data_sample.csv",
)
df = pd.read_csv(DATA_PATH, index_col="SK_ID_CURR")

In [None]:
# Integer codes for each categorical column. Any category missing from a mapping
# becomes NaN under .map() and is therefore removed by the dropna() at the end.
category_codes = {
    'NAME_CONTRACT_TYPE': {'Cash loans': 0, 'Revolving loans': 1},
    'CODE_GENDER': {'M': 0, 'F': 1},
    'FLAG_OWN_CAR': {'N': 0, 'Y': 1},
    'FLAG_OWN_REALTY': {'N': 0, 'Y': 1},
    'NAME_EDUCATION_TYPE': {
        'Lower secondary': 0,
        'Secondary / secondary special': 0,
        'Incomplete higher': 1,
        'Higher education': 2,
        'Academic degree': 2,
    },
}

# Work on a copy so the raw frame `df` stays untouched.
numeric_df = df.copy()
for column_name, codes in category_codes.items():
    numeric_df[column_name] = numeric_df[column_name].map(codes)

# Keep only fully populated rows.
numeric_df = numeric_df.dropna()

In [None]:
# Print the column dtypes and non-null counts of the encoded frame.
numeric_df.info() 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the predictors from the TARGET label.
targets = numeric_df['TARGET']
features = numeric_df.drop(columns='TARGET')

# Stratified split (default test size) with a fixed seed so it is repeatable.
split_parts = train_test_split(
    features,
    targets,
    stratify=targets,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fully grown decision tree as a baseline.
# random_state is fixed so the reported accuracies are reproducible across runs,
# matching the seed used for the train/test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
print(f'Train accuracy: {dt.score(x_train, y_train)}')
print(f'Test accuracy: {dt.score(x_test, y_test)}')

In [None]:
# Shallow (depth-2) tree that is small enough to plot and read.
# random_state is fixed so the tree is the same on every run.
small_dt = DecisionTreeClassifier(max_depth=2,
                                  max_features=None,
                                  random_state=42)
small_dt.fit(x_train, y_train)
print(f'Train accuracy: {small_dt.score(x_train, y_train)}')
print(f'Test accuracy: {small_dt.score(x_test, y_test)}')

In [None]:
import matplotlib.pyplot as plt 
from sklearn.tree import plot_tree

In [None]:
# Draw the depth-2 tree on a large canvas.
f = plt.figure(figsize=(12, 12))
# Pass the feature names as a plain list: some scikit-learn releases validate
# `feature_names` strictly and reject a pandas Index.
_ = plot_tree(small_dt, feature_names=list(features.columns))

In [None]:
# Random forest: an ensemble of decision trees, fit below with scikit-learn.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with depth-limited trees, trained on all cores with a fixed seed.
rfc = RandomForestClassifier(
    max_depth=10,
    n_jobs=-1,
    random_state=42,
)
rfc.fit(x_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
rfc_train_accuracy = rfc.score(x_train, y_train)
print(rfc_train_accuracy)
rfc_test_accuracy = rfc.score(x_test, y_test)
print(rfc_test_accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid-search the random forest on a subsample of the training data to keep it fast.
# The subsample is seeded so the search is reproducible, and capped at the number of
# available rows because DataFrame.sample raises when asked for more rows than exist.
sample_size = min(1000, len(x_train))
x_tr_sample = x_train.sample(sample_size, random_state=42)
y_tr_sample = y_train.loc[x_tr_sample.index]

# random_state and n_jobs are single-valued entries: they are fixed settings, not searched.
params = {'n_estimators': [100, 300, 500],
          'max_depth': [10, 15, 20],
          'max_features': [3, 6, 9],
          'random_state': [42],
          'n_jobs': [-1]}

gs = GridSearchCV(rfc, param_grid=params, n_jobs=-1)
gs.fit(x_tr_sample, y_tr_sample)
print(gs.best_estimator_)
print(gs.best_score_)

In [None]:
# Import H2O and initialise it so the H2O cells below have a backend to talk to.
import h2o 
h2o.init() 

In [None]:
# Build the H2OFrame from the raw pandas frame `df` (not the integer-encoded numeric_df).
hf = h2o.H2OFrame(df) 
# Convert TARGET to a factor (categorical) column.
hf['TARGET'] = hf['TARGET'].asfactor() 
# 80/20 train/validation split with a fixed seed.
train, valid = hf.split_frame(ratios=[.8], seed=42)

In [None]:
from h2o.estimators import H2ORandomForestEstimator

In [None]:
# H2O distributed random forest. The seed is fixed so the model is reproducible,
# consistent with the seeded split_frame call that produced `train` and `valid`.
drf = H2ORandomForestEstimator(ntrees=100, max_depth=10,
                               mtries=3, seed=42)

# Every column except the label is used as a predictor.
feature_columns = hf.columns
feature_columns.remove('TARGET')
target_column = 'TARGET'

drf.train(x=feature_columns,
          y=target_column,
          training_frame=train,
          validation_frame=valid)

# F1 on the validation frame (last expression, so the notebook displays it).
drf.model_performance(valid).F1()

In [None]:
# Score the training frame with the H2O random forest.
predictions = drf.predict(train)

# Flag rows whose 'p1' probability exceeds the chosen cut-off and return them as a
# NumPy array.
# NOTE(review): 0.097 is presumably the max-F1 threshold reported by H2O - confirm.
p1_cutoff = 0.097
above_cutoff = predictions['p1'] > p1_cutoff
above_cutoff.as_data_frame()['p1'].values

In [None]:
# Persist the trained forest under the 'drf' directory (force=True overwrites an existing file)
# and keep the returned path.
save_path = h2o.save_model(model=drf, path='drf', force=True) 
# Reload the model from that path to confirm the round trip works.
drf2 = h2o.load_model(path=save_path)

In [None]:
# Plot variable importances. server=True is kept from the original call; the figure is
# then written to disk with matplotlib.
drf.varimp_plot(server=True)
# savefig requires a target filename; calling it with no argument raises TypeError.
plt.savefig('drf_varimp.png')
# Last expression of the cell so the importance table is displayed.
drf.varimp()

In [None]:
from yellowbrick.model_selection import feature_importances 

In [None]:
# Yellowbrick feature-importance bar chart for the best grid-search forest.
# One colour per feature: features.shape[1] is the number of columns, whereas shape[0]
# (the row count) would build a needlessly long list.
_ = feature_importances(gs.best_estimator_,
                        x_train,
                        y_train,
                        colors=['darkblue'] * features.shape[1])

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# scikit-learn AdaBoost: 100 boosting rounds, learning rate 0.5, fixed seed.
adaboost = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.5,
    random_state=42,
)
adaboost.fit(x_train, y_train)

# Mean accuracy on the training split, then on the held-out split.
ada_train_accuracy = adaboost.score(x_train, y_train)
print(ada_train_accuracy)
ada_test_accuracy = adaboost.score(x_test, y_test)
print(ada_test_accuracy)

In [None]:
from pycaret.classification import setup, create_model, tune_model

In [None]:
# Initialise the PyCaret experiment on the encoded frame with TARGET as the label.
# session_id seeds PyCaret's internal randomness so that the split, folds and tuning
# results are reproducible, in line with the seed 42 used elsewhere in this notebook.
classification = setup(data=numeric_df, target='TARGET', session_id=42)

In [None]:
# Train PyCaret's AdaBoost ('ada') with 3-fold cross-validation.
# NOTE(review): this rebinds `adaboost`, which an earlier cell used for the scikit-learn model.
adaboost = create_model('ada', fold=3) 
# Tune it with 3-fold CV; return_tuner=True also returns the search object.
tuned_adaboost, gridsearch = tune_model(adaboost, fold=3, 
return_tuner=True)

In [None]:
""" XGBoost with PyCaret
 Again, we can use xgboost easily through pycaret, which, by default,
 searches the following hyperparameter space:
 learning_rate : 0.0000001 to 0.5
 n_estimators : 10 to 300 in steps of 10
 subsample : 0.2 to 1
 max_depth : 1 to 11 in steps of 1
 colsample_bytree : 0.5 to 1
 min_child_weight : 1 to 4 in steps of 1
 reg_alpha : 0.0000001 to 10
 reg_lambda : 0.0000001 to 10
 scale_pos_weight : 0 to 50 in steps of 0.1"""

In [None]:
# The distribution classes must be imported before use; without this import the cell
# fails with NameError on UniformDistribution / IntUniformDistribution.
from pycaret.internal.distributions import (
    IntUniformDistribution,
    UniformDistribution,
)

# Explicit XGBoost hyperparameter search space expressed as PyCaret distributions.
# NOTE(review): nothing visible in this notebook passes `tune_distributions` to
# tune_model, so it appears to be illustrative only - confirm.
tune_distributions = {
    "learning_rate": UniformDistribution(0.000001, 0.5, log=True),
    "n_estimators": IntUniformDistribution(10, 300),
    "subsample": UniformDistribution(0.2, 1),
    "max_depth": IntUniformDistribution(1, 11),
    "colsample_bytree": UniformDistribution(0.5, 1),
    "min_child_weight": IntUniformDistribution(1, 4),
    "reg_alpha": UniformDistribution(0.0000000001, 10, log=True),
    "reg_lambda": UniformDistribution(0.0000000001, 10, log=True),
    "scale_pos_weight": UniformDistribution(1, 50),
}

In [None]:
# Train PyCaret's XGBoost with 3-fold CV, then tune it with scikit-optimize.
# The untuned model is named `xgb_base` rather than `xgb` so that it does not collide
# with the `import xgboost as xgb` module alias used later in the notebook; re-running
# cells out of order would otherwise leave `xgb` bound to the wrong kind of object.
xgb_base = create_model('xgboost', fold=3)
best_xgb, tuner = tune_model(xgb_base,
                             fold=3,
                             search_library='scikit-optimize',
                             return_tuner=True)

In [None]:
# Display the hyperparameters of the tuned XGBoost model.
best_xgb.get_params() 

In [None]:
# A cell only displays its last expression, so the tried parameter sets and their
# mean CV scores are combined into one table to show both side by side.
pd.DataFrame({
    'params': tuner.cv_results_['params'],
    'mean_test_score': tuner.cv_results_['mean_test_score'],
})

In [None]:
# Per-feature importance scores from the tuned model's underlying booster.
# Uses `best_xgb` (defined by the tuning cell above): `xgb_model` is not defined until
# later in the notebook, so referencing it here fails on a fresh Restart & Run All.
best_xgb.get_booster().get_score()

In [None]:
# Display the tuned model's feature importances through the scikit-learn style attribute.
best_xgb.feature_importances_ 

In [None]:
import xgboost as xgb 

In [None]:
# Wrap the train and test splits in DMatrix objects, the data container used by xgb.train.
dtrain = xgb.DMatrix(x_train, label=y_train) 
dtest = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Train a booster with the native XGBoost API using a binary logistic objective.
# No num_boost_round is passed, so the library default number of rounds is used.
# NOTE(review): `xgb_model` is rebound to an XGBClassifier in a later cell.
xgb_model = xgb.train(params={'objective': 'binary:logistic'}, 
dtrain=dtrain)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# The native booster's predictions are thresholded at 0.5 to obtain class labels
# before computing accuracy.
train_preds = xgb_model.predict(dtrain)
test_preds = xgb_model.predict(dtest)

train_labels = train_preds > 0.5
test_labels = test_preds > 0.5

print(accuracy_score(y_train, train_labels))
print(accuracy_score(y_test, test_labels))

In [None]:
# Scikit-learn style XGBoost classifier with default settings.
# NOTE(review): this reuses the name `xgb_model`, previously bound to the xgb.train result.
xgb_model = xgb.XGBClassifier()

In [None]:
# Fit the classifier on the training split and keep the value returned by fit().
fit_model = xgb_model.fit(x_train, y_train)

In [None]:
# Score on the training split only; no held-out score is computed in this cell.
fit_model.score(x_train, y_train)

In [None]:
# GPU training: the next cell asks PyCaret to build XGBoost with GPU settings.

In [None]:
# Train PyCaret's XGBoost with 3-fold CV, forwarding GPU options to the estimator.
# NOTE(review): 'gpu_hist' and gpu_id are deprecated in recent XGBoost releases in favour
# of device='cuda' with tree_method='hist' - confirm against the installed version.
xgboost_gpu = create_model('xgboost', 
                           fold=3, 
                           tree_method='gpu_hist', 
                           gpu_id=0)

In [None]:
# Train PyCaret's LightGBM with 3-fold CV, then tune it with scikit-optimize.
light_gbm = create_model('lightgbm', fold=3)
# The library name must be spelled 'scikit-optimize' (with a hyphen), as in the other
# tuning cells; 'scikit optimize' is not a recognised search_library value.
best_lgbm, tuner = tune_model(light_gbm,
                              fold=3,
                              search_library='scikit-optimize',
                              return_tuner=True)

In [None]:
import lightgbm

In [None]:
# Plot the feature importances of the tuned LightGBM model.
lightgbm.plot_importance(best_lgbm)

In [None]:
# LightGBM classifier with default settings, used directly (outside PyCaret).
lgb_model = lightgbm.LGBMClassifier()

In [None]:
# Fit on the training split and keep the value returned by fit().
trained_lgb = lgb_model.fit(x_train, y_train)

In [None]:
# Train PyCaret's CatBoost with 3-fold CV, then tune it with scikit-optimize.
catboost_model = create_model('catboost', fold=3) 
# NOTE(review): `tuner` is reused here, overwriting the search object from earlier tuning cells.
best_cb, tuner = tune_model(catboost_model, 
                            fold=3, 
                            search_library='scikit-optimize', 
                            return_tuner=True)

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# Default CatBoost classifier, plus the training split wrapped in CatBoost's Pool container.
cb_model = CatBoostClassifier()
catboost_train_data = Pool(data=x_train, label=y_train)

In [None]:
# Fit CatBoost on the training Pool (features and labels are both carried by the Pool).
cb_model.fit(catboost_train_data)

In [None]:
# Score on the training Pool only; no held-out score is computed in this cell.
cb_model.score(catboost_train_data)

In [None]:
# Wrap the test features (no labels) in a Pool and predict on them.
catboost_test_data = Pool(x_test) 
cb_model.predict(catboost_test_data)

In [None]:
import catboost

In [None]:
# Rebuild a CatBoost classifier from the tuned model's parameters.
new_cb = catboost.CatBoostClassifier(**best_cb.get_params()) 
# Raise the tree budget to 1000; early stopping below decides how many are actually used.
new_cb.set_params(n_estimators=1000) 
# Fit with early stopping (patience of 10 rounds) monitored on eval_set, plotting progress.
# NOTE(review): eval_set is the test split, so the stopping point is chosen using test
# data - consider a separate validation split if x_test is also used for final reporting.
new_cb.fit(X=x_train, 
           y=y_train, 
           eval_set=(x_test, y_test), 
           early_stopping_rounds=10, 
           plot=True)