In [None]:
from sklearn import preprocessing
import pandas as pd
import seaborn as sns
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier

In [None]:
#Reading the training and test application tables from the working directory
train_csv, test_csv = 'application_train.csv', 'application_test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [None]:
#Label encoding of the categorical columns
label_enc = ["NAME_CONTRACT_TYPE", "CODE_GENDER",
            "FLAG_OWN_CAR", "FLAG_OWN_REALTY",
            "NAME_TYPE_SUITE",
            "NAME_INCOME_TYPE",
            "NAME_EDUCATION_TYPE",
            "NAME_FAMILY_STATUS",
            "NAME_HOUSING_TYPE", "WEEKDAY_APPR_PROCESS_START",
            "ORGANIZATION_TYPE"]

#Dropping the columns which are not label encoded here or which are judged to carry no
#significance for the predicted value
d_columns = ['OCCUPATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
#errors='ignore' keeps the cell re-runnable: once the columns are gone, a second drop
#would otherwise raise KeyError
df_train = df_train.drop(columns = d_columns, errors = 'ignore')
df_test = df_test.drop(columns = d_columns, errors = 'ignore')

for col in label_enc:
    #Casting both frames to str once so the encoder sees a single uniform type
    train_str = df_train[col].values.astype('str')
    test_str = df_test[col].values.astype('str')

    #Fitting on train + test together so every category seen in either frame gets a code
    label = preprocessing.LabelEncoder()
    label.fit(np.concatenate([train_str, test_str]))
    df_train[col] = label.transform(train_str)
    df_test[col] = label.transform(test_str)

## Each model below is meant to be run on its own, starting from the cell that opens its section

In [None]:
#Logistic Regression Model Implementation

#Keeping only the columns that are free of null values in BOTH frames.
#Filtering each frame on its own leaves the test set with columns the training set lost,
#which then have to be removed by hand; selecting one shared list keeps the two frames on
#the same features, in the same order, and makes the cell safe to re-run.
train_complete = df_train.columns[~df_train.isnull().any()]
test_complete = set(df_test.columns[~df_test.isnull().any()])
shared_features = [c for c in train_complete if c in test_complete]
df_train = df_train[shared_features + ['TARGET']]
df_test = df_test[shared_features]

#Separating the features (X) from the 'TARGET' label (y).
#y is taken as a 1-D Series: a one-column DataFrame makes scikit-learn emit a
#DataConversionWarning when fitting.
X = df_train.loc[:, df_train.columns != 'TARGET']
y = df_train['TARGET']

#Performing test-train split with a 20:80 ratio
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.80,
                                                    test_size=0.20,
                                                    random_state=99)
#Applying Logistic Regression
logreg = LogisticRegression()
logreg.fit(train_X, train_y)
#Accuracy measured on the training split itself
logreg_model_score = logreg.score(train_X, train_y)
logreg_model_score

In [None]:
#Predicting Target probabilities and Calculating the ROC AUC score
#Positive-class probabilities for the test set (written to the submission file)
logreg_pred = logreg.predict_proba(df_test)[:, 1]

#Positive-class probabilities for the hold-out split.
#Plain probabilities rank the rows exactly like log-probabilities, so the AUC is the same,
#but they cannot become -inf when a probability underflows to 0 (roc_auc_score rejects
#non-finite scores), and they match how the other models in this notebook are scored.
logreg_valid_proba = logreg.predict_proba(test_X)[:, 1]
logreg_model_ROC = roc_auc_score(test_y, logreg_valid_proba)
logreg_model_ROC
#ROC=0.5888641171603671

In [None]:
#Creating Submission file for submission
#assign() builds a new frame, so no column is written into a slice of df_test
#(which is what raised SettingWithCopyWarning before)
submit = df_test[['SK_ID_CURR']].assign(TARGET = logreg_pred)
submit.to_csv('logreg.csv', index = False)
#Score=0.59837

In [None]:
#Random Forest Model Implementation
randforest = RandomForestClassifier(random_state = 99,n_jobs = -1)

#Separating the features (X) from the 'TARGET' label (y).
#y is a 1-D Series so that fitting does not emit scikit-learn's DataConversionWarning
#about a column-vector target.
X = df_train.loc[:, df_train.columns != 'TARGET']
y = df_train['TARGET']

#Performing test-train split 20:80
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.80,
                                                    test_size=0.20,
                                                    random_state=99)

In [None]:
#Fitting the model on the training split and scoring its accuracy on that same split
#(fit() returns the fitted estimator, so the two calls can be chained)
RandForest_Model = randforest.fit(train_X, train_y).score(train_X, train_y)
RandForest_Model

In [None]:
#Calculating ROC AUC Score
#Positive-class probabilities for the hold-out split
#(despite the "log_proba" in the name these are plain predict_proba values)
holdout_proba_matrix = randforest.predict_proba(test_X)
random_pred_log_proba = holdout_proba_matrix[:, 1]
randforest_model_ROC = roc_auc_score(y_true = test_y, y_score = random_pred_log_proba)

#Positive-class probabilities for the test set
random_pred = randforest.predict_proba(df_test)[:, 1]

randforest_model_ROC
#ROC=0.6345390792685979

In [None]:
#Generating Submission file
#assign() builds a new frame, so no column is written into a slice of df_test
#(which is what raised SettingWithCopyWarning before)
submit = df_test[['SK_ID_CURR']].assign(TARGET = random_pred)
submit.to_csv('randforest.csv', index = False)
#Score=0.63612

In [None]:
#LightGBM implementation

#Separating the features (X) from the 'TARGET' label (y).
#y is a 1-D Series so that fitting does not warn about a column-vector target.
X = df_train.loc[:, df_train.columns != 'TARGET']
y = df_train['TARGET']

#Performing test-train split with a 20:80 ratio
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.80,
                                                    test_size=0.20,
                                                    random_state=99)

#Listing the features the model is trained on. Taking them from the training frame (not
#the test frame) guarantees the test set is later scored on the same columns in the same order.
features = list(train_X.columns)

lgbm = LGBMClassifier(
    n_estimators=4000,
    learning_rate=0.03,
    num_leaves=30,
    colsample_bytree=.8,
    subsample=.9,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01,
    min_child_weight=2,
    silent=-1,
    verbose=-1,
)

#Implementing LightGBM on the Dataset with early stopping set to 100
#NOTE(review): `silent` and the `verbose` / `early_stopping_rounds` fit arguments belong to
#the older LightGBM scikit-learn API; confirm they are accepted by the installed version.
lgbm.fit(train_X, train_y,
        eval_set= [(train_X, train_y), (test_X, test_y)],
        eval_metric='auc', verbose=100, early_stopping_rounds=100
        )

#Predicting the test-set probabilities with the early-stopped model.
#The previous "KFold" loop never trained anything per fold: it added the same prediction
#n_splits times, each divided by n_splits, i.e. it reproduced this single prediction.
#Train one model per fold inside a loop if a genuine K-fold ensemble is wanted.
test_preds = lgbm.predict_proba(df_test[features], num_iteration=lgbm.best_iteration_)[:, 1]

In [None]:
#Accuracy of the LightGBM class predictions on the hold-out split
lgbm_holdout_labels = lgbm.predict(test_X)
accuracy_score(y_true = test_y, y_pred = lgbm_holdout_labels)

In [None]:
#Feature Importance Calculation
#feature_importances_ holds one raw score per training feature. Dividing by 100 does not
#turn those scores into a percentage, so each one is expressed as its share of the total.
lgbm_importances = lgbm.feature_importances_
lgbm_importance_total = lgbm_importances.sum() or 1  #guards against dividing by zero

feature_importance_df = pd.DataFrame({
    #Names come from the frame the model was fitted on, so they line up with the scores
    "feature": list(train_X.columns),
    "importance percent": 100 * lgbm_importances / lgbm_importance_total,
})
feature_importance_df

In [None]:
#Calculating ROC AUC Score
#Positive-class probabilities for the hold-out split
#(despite the "log_proba" in the name these are plain predict_proba values)
lgbm_holdout_matrix = lgbm.predict_proba(test_X)
lgbm_pred_log_proba = lgbm_holdout_matrix[:, 1]
lgbm_model_ROC = roc_auc_score(y_true = test_y, y_score = lgbm_pred_log_proba)
lgbm_model_ROC
#ROC=0.6835767676225897
#ROC=0.7625142284859512 #without dropping null columns

In [None]:
#Generating the file for submission
#(the former submit.sum(...) call was removed: its result was neither stored nor shown)
submit = pd.DataFrame({
    'SK_ID_CURR': df_test['SK_ID_CURR'],
    'TARGET': test_preds,
})
submit.to_csv('lgbm.csv', index = False)
#Score=0.74687

In [None]:
#AdaBoost Model Implementation
adamodel = AdaBoostClassifier(random_state=99)

#Separating the features (X) from the 'TARGET' label (y).
#y is a 1-D Series so that fitting does not emit scikit-learn's DataConversionWarning
#about a column-vector target.
X = df_train.loc[:, df_train.columns != 'TARGET']
y = df_train['TARGET']

#Performing test-train split with a 20:80 ratio
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                    train_size=0.80,
                                                    test_size=0.20,
                                                    random_state=99)
adamodel.fit(train_X, train_y)
#Accuracy measured on the training split itself
ada_score = adamodel.score(train_X, train_y)


In [None]:
#Predicting the TARGET probabilities for the test set
#(the training accuracy is already stored in ada_score by the cell that fits the model,
#so the training split is not scored a second time here)
ada_pred = adamodel.predict_proba(df_test)[:, 1]
ada_pred

In [None]:
#Calculating Feature importance
#Names are taken from the frame the model was fitted on, so they line up with the scores
features = list(train_X.columns)

#Expressing each importance as its share of the total, times 100, so the values really
#are the percentages the column name promises
ada_importances = adamodel.feature_importances_
ada_importance_total = ada_importances.sum() or 1  #guards against dividing by zero

ada_feature_importance_df = pd.DataFrame({
    "feature": features,
    "importance percent": 100 * ada_importances / ada_importance_total,
})
ada_feature_importance_df

In [None]:
#ROC Calculation
#The hold-out scores get their own name. Storing them in ada_pred overwrote the test-set
#probabilities, so the submission was built from hold-out log-probabilities of the wrong length.
#Plain probabilities are used for scoring: they rank rows like log-probabilities (same AUC)
#and match how the other models in this notebook are evaluated.
ada_valid_proba = adamodel.predict_proba(test_X)[:, 1]
ada_model_ROC = roc_auc_score(test_y, ada_valid_proba)
ada_model_ROC
#ROC=0.6695558382404692

In [None]:
#Generating Submission
#The probabilities are computed from df_test right here, so the file always holds one
#test-set probability per SK_ID_CURR, whatever ada_pred currently contains.
#assign() builds a new frame, which also avoids writing into a slice of df_test
#(the cause of SettingWithCopyWarning).
submit = df_test[['SK_ID_CURR']].assign(TARGET = adamodel.predict_proba(df_test)[:, 1])
submit.to_csv('adaBoost.csv', index = False)
#Score= 0.66936