In [26]:
#Import packages
import numpy as np
import pandas as pd

In [41]:
# importing SimpleImputer for handling missing value
from sklearn.impute import SimpleImputer
# importing MissingIndicator for handling missing value

# importing StandardScaler for standardization
from sklearn.preprocessing import StandardScaler
# importing OneHotEncoder for encoding categorical variables
from sklearn.preprocessing import OneHotEncoder
# importing for transformation
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
# importing PCA for dimensionality reduction
from sklearn.decomposition import PCA

# importing pipeline for chaining model building activities
#from sklearn.pipeline import Pipeline
#from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline as mp
# importing FeatureUnion for combining transformers
from sklearn.pipeline import FeatureUnion

# importing samplers for handling data imbalance
from imblearn.combine import SMOTEENN 
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler 

# importing train_test_split for train and validation split
from sklearn.model_selection import train_test_split
# importing SelectFromModel to select features from model 
from sklearn.feature_selection import SelectFromModel               

In [42]:
# importing classifiers to try with
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# importing metrics required for model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# importing RepeatedKFold for cross validation
from sklearn.model_selection import RepeatedKFold
# importing for model evaluation
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
# importing RepeatedStratifiedKFold for model evaluation
from sklearn.model_selection import RepeatedStratifiedKFold
# importing GridSearchCV for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from yellowbrick.model_selection import ValidationCurve

In [43]:
# Load train_data.csv; only the first 100000 rows are read
train_df_sample = pd.read_csv(
    filepath_or_buffer='../input/amex-default-prediction/train_data.csv',
    nrows=100000,
)


In [44]:
# Load train_labels.csv (read in full, unlike the row-limited feature files)
train_label_df = pd.read_csv(
    filepath_or_buffer='../input/amex-default-prediction/train_labels.csv',
)

In [45]:
# Load the first 100000 rows of test_data.csv, indexed by customer_ID
test_df = pd.read_csv(
    filepath_or_buffer='../input/amex-default-prediction/test_data.csv',
    index_col='customer_ID',
    nrows=100000,
)

In [46]:
# Attach the label to every sampled row; the inner join keeps only customers present in both frames
train_df = train_df_sample.merge(train_label_df, on=["customer_ID"], how="inner")

In [47]:
# Number of distinct customers in the merged sample
train_df['customer_ID'].nunique(dropna=False)

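## We have 458913 unique customers. (Note: only the first 100000 rows of train_data.csv are loaded above, so the count shown in this notebook covers just the customers in that sample.)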

In [48]:
# (disabled) drop S_2 from train_df; the column is kept here and removed from `train` further below, after the last row per customer has been selected
# train_df.drop(axis=1, columns=['S_2'], inplace=True)

In [49]:
# Remove the S_2 column from test_df; it is not used for model building
test_df.drop(columns=['S_2'], inplace=True)

In [50]:
# Widen the pandas display limits so long per-column summaries are shown in full
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
# Number of missing values in each column
train_df.isna().sum()

## It can be observed that many columns have a large number of missing values.

In [51]:
# Rows with target == 1 expressed per 100 rows with target == 0
# (a class ratio, not the share of positives in the whole frame)
Percentage = 100 * len(train_df.loc[train_df['target'] == 1]) / len(train_df.loc[train_df['target'] == 0])
Percentage

In [52]:

# Share of missing values per column, in percent
missing_pct = train_df.isnull().sum() / len(train_df) * 100
# Columns with at least 75% missing values are removed from train_df
sparse_columns = [col for col in train_df.columns if missing_pct[col] >= 75]
for col in sparse_columns:
    print("Dropping column", col)
train_df.drop(labels=sparse_columns, axis=1, inplace=True)
i = len(sparse_columns)

print("Total number of columns dropped in train dataframe", i)


In [53]:
# Keep only the columns that are non-null in at least 90% of the rows
train = train_df.dropna(axis='columns', thresh=int(0.90 * len(train_df)))

# Shape of the reduced training frame
train.shape

In [54]:
# Fill gaps forward then backward, then keep the last row of every customer.
# NOTE(review): the fill is applied to the whole frame, not per customer, so a
# value can be carried over from an adjacent customer's rows; confirm this is acceptable.
train = (
    train.set_index(['customer_ID'])
    .ffill()
    .bfill()
    .reset_index()
    .groupby('customer_ID')
    .tail(1)
    .set_index(['customer_ID'])
)

# The date column is no longer needed once a single row per customer remains
train = train.drop(columns=['S_2'])
# Rows and columns after the reduction
train.shape

In [56]:
# Show the columns that are still stored as object dtype
train.select_dtypes(include=['object'])

In [57]:
# One-hot encode D_63 and D_64: append the dummy columns, then remove the source column
train_D63 = pd.get_dummies(train[['D_63']])
train = pd.concat([train, train_D63], axis=1).drop(columns=['D_63'])

train_D64 = pd.get_dummies(train[['D_64']])
train = pd.concat([train, train_D64], axis=1).drop(columns=['D_64'])

In [58]:
# List the columns of `train` after the one-hot encoding step
train.columns

In [59]:
# Correlation-based pruning: whenever two features have |corr| > 0.90, one of them is dropped.
df = train.drop(['target'], axis=1)
corr_matrix = df.corr().abs()
# Keep only the strict upper triangle so every pair is inspected once and the diagonal is ignored.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.90)]

# Only the flagged feature columns are removed, so df1 still contains 'target'
df1 = train.drop(to_drop, axis=1)
df1.shape

In [60]:
from sklearn.feature_selection import VarianceThreshold
from itertools import compress
def fs_variance(df, threshold:float=0.1):
    """
    Select the columns of ``df`` that a VarianceThreshold filter keeps.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are screened; it is passed to
        ``VarianceThreshold.fit`` as-is.
    threshold : float, default 0.1
        Value forwarded to ``VarianceThreshold(threshold=...)``.

    Returns
    -------
    list
        Names of the retained columns, in their original order.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(df)

    # Pair every column name with its keep/drop flag from the fitted selector
    return [name for name, keep in zip(df.columns, selector.get_support()) if keep]
# Apply the variance filter to the correlation-pruned frame.
# NOTE(review): df1 still contains 'target' (only the `to_drop` columns were removed from
# `train`), so 'target' can end up in columns_to_keep; verify before building features from it.
columns_to_keep=fs_variance(df1)
# Reported result: 54 columns (excluding target) passed the threshold.
train_final=train[columns_to_keep]
len(columns_to_keep)

In [139]:
# Report whether the test dataset contains any missing value
print("Yes" if test_df.isna().values.any() else "No")

In [66]:
# Show the names of the columns that passed the variance filter
list(columns_to_keep)

In [62]:
# Cast the B_* and D_* categorical codes to strings in both the train and test frames.
# NOTE(review): depending on the pandas version, astype('str') can turn NaN into the literal
# string 'nan', which an imputer looking for np.nan would no longer treat as missing; confirm.
categorical_as_str = {
    name: 'str'
    for name in ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_68"]
}
train_df = train_df.astype(categorical_as_str)
test_df = test_df.astype(categorical_as_str)

In [None]:
# Placeholder cell, intentionally empty (a lone "." is a syntax error and would stop a full re-run).

In [67]:
# Build the feature matrix from the selected columns that actually exist in train_df.
# columns_to_keep was derived from `train`, which carries one-hot columns that train_df does
# not have and may also include 'target'. Filtering by name replaces the positional `[:-6]`
# slice (which assumed exactly six trailing dummy columns) and keeps the label out of X.
feature_columns = [
    col for col in columns_to_keep
    if col in train_df.columns and col != 'target'
]
X = train_df[feature_columns]
y = train_df['target']

In [68]:
# Preview the first two rows of the feature matrix
X.head(2)

In [69]:
# Rows and columns of the full feature matrix
print("Shape of X", X.shape)

In [70]:
# Hold out 25% of the rows for validation, stratified on y, with a fixed seed.
# NOTE(review): train_df appears to hold several rows per customer, so rows of one customer
# may land in both parts of the split; confirm whether a per-customer split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [71]:
# Rows and columns of the training part of the split
print("Shape of X_train", X_train.shape)

In [72]:

# Columns of X stored as object dtype are treated as categorical
categorical = X.select_dtypes(include='object').columns.tolist()
print(f"Categorical variables (columns) are: {categorical}")

# Columns of X with a numeric dtype are treated as numerical
numerical = X.select_dtypes(include='number').columns.tolist()
print(f"Numerical variables (columns) are: {numerical}")

In [73]:
 # Replace np.nan entries with the most frequent value of each column
 imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [74]:
# One-hot encoder producing a dense array; categories not seen during fit are ignored.
# The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2 and later removed,
# so try the new name first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [75]:
 # Scaler used by the numerical branch of the `preprocess` transformer
 scaler = StandardScaler()

In [76]:
# Column-wise preprocessing: categorical columns are imputed then one-hot encoded,
# numerical columns are imputed then scaled
categorical_steps = Pipeline(steps=[('imputer', imputer), ('encoder', encoder)])
numerical_steps = Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])
preprocess = ColumnTransformer(
    transformers=[
        ('cat', categorical_steps, categorical),
        ('num', numerical_steps, numerical),
    ]
)
print(preprocess)


In [77]:
def model_score(model_name):
    """
    Fit the module-level ``pipe`` on the training split and print its scores.

    Parameters
    ----------
    model_name : str
        Label shown in the printed banner only. The estimator that is trained
        is whatever ``pipe`` currently holds, so set it before calling.

    Notes
    -----
    Reads the module-level ``pipe``, ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Prints the fitted pipeline followed by ``pipe.score`` on the
    training and validation splits. Returns None.
    """
    print("#######################################################################")
    print("Training and Evaluation using", model_name)
    print("#######################################################################")
    # The categorical branch of `preprocess` imputes and one-hot encodes only (no scaling),
    # so the banner states exactly that.
    print("preprocess - Categorical: Missing value Impute and OneHotEncoding")
    print("preprocess - Numerical: Missing value Impute and Scaling")
    print("###########################################################################")
    model = pipe.fit(X_train, y_train)
    print("#######################################################################")
    print (model)
    print("#######################################################################")
    print("model training score: %.3f" % pipe.score(X_train, y_train))
    print("model validation score: %.3f" % pipe.score(X_test, y_test))

In [78]:
# function for display of cross validation score
def model_cross_validation_score(model_name):
    """
    Cross-validate the module-level ``pipe`` on ``X``/``y`` and print mean scores.

    ``model_name`` is used only in the printed banner. The pipeline is evaluated
    with RepeatedStratifiedKFold (3 splits, 2 repeats, seed 42) on accuracy,
    precision, recall, f1 and ROC AUC; the mean of each metric is printed for
    the training folds and for the validation folds. Returns None.
    """
    separator = "#######################################################################"
    print(separator)
    print("Training and Evaluation with Cross Validation using",model_name)
    print(separator)

    scoring = ['accuracy', 'precision', 'recall','f1','roc_auc']
    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=42)
    # Train scores are requested as well so both summaries can be printed
    scores = cross_validate(pipe, X, y, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=True,return_estimator=True)

    template = ' Score: Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f},f1-score: {:.2f}, ROC AUC: {:.2f}'
    for label, prefix in (('Training', 'train'), ('Validation', 'test')):
        # cross_validate stores per-fold results under '<train|test>_<metric>'
        means = [np.mean(scores[prefix + '_' + metric]) for metric in scoring]
        print((label + template).format(*means))
    print(separator)

In [79]:
def model_random_search_score(model_name):
    """
    Fit the module-level ``random_search`` on the training split and report the winner.

    ``model_name`` is used only in the printed text. After fitting on
    ``X_train``/``y_train``, the best estimator and its best score are printed.
    Returns None.
    """
    separator = "#######################################################################"
    print(separator)
    print("Training and Evaluation with RandomizedSearchCV using",model_name)
    print(separator)
    # fit() returns the search object itself, so its results can be read from the return value
    fitted_search = random_search.fit(X_train,y_train)
    print ("Best Estimator for", model_name,"is", fitted_search.best_estimator_,"with best score as",fitted_search.best_score_)

In [80]:
# Pipeline steps for model building, training and evaluation:
# preprocessing -> SMOTE over-sampling -> random under-sampling -> model-based feature
# selection -> final estimator (replaced later via pipe.set_params).
steps = [
        ('preprocess', preprocess),
        ('over_sampler',SMOTE(random_state = 42)),
        # Seeded like the other stochastic steps so repeated runs resample identically
        ('under_sampler',RandomUnderSampler(random_state = 42)),
        ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators = 10, random_state = 42, n_jobs = -1))),
        ('model_estimator', RandomForestClassifier(random_state = 42))
    ]
pipe = Pipeline(steps, verbose=True)

In [81]:
# Fit and score the pipeline as first defined (RandomForestClassifier as the final estimator)
model_score("RandomForestClassifier")

In [82]:
# Swap the final estimator for a decision tree, seeded like the random forest
# so that repeated runs report the same scores
pipe.set_params(model_estimator=DecisionTreeClassifier(random_state=42))
model_score("DecisionTreeClassifier")

In [83]:
# Swap the final estimator of the pipeline for an XGBClassifier with default settings
pipe.set_params(model_estimator=XGBClassifier())
# Fit the pipeline and print its training and validation scores
model_score("XGBClassifier")

In [84]:
# Cross-validate the pipeline and print the mean scores
# (the pipeline still holds the XGBClassifier set in the previous step)
model_cross_validation_score("XGBClassifier")

In [85]:
# The pipeline still holds the XGBClassifier set earlier, so switch the final estimator
# back to a random forest before cross-validating under the "RandomForestClassifier" label.
pipe.set_params(model_estimator=RandomForestClassifier(random_state = 42))
model_cross_validation_score("RandomForestClassifier")

In [175]:
# Search space for RandomizedSearchCV over the XGBClassifier step of the pipeline
param_random = {
    'model_estimator': [XGBClassifier()],
    'model_estimator__learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'model_estimator__max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    'model_estimator__min_child_weight': [1, 3, 5, 7],
    'model_estimator__gamma': [0.0, 0.1, 0.2, 0.3, 0.4],
    'model_estimator__colsample_bytree': [0.3, 0.4, 0.5, 0.7],
}
# n_iter=1 samples a single parameter combination; it is scored by ROC AUC with 3-fold CV
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_random,
    n_iter=1,
    scoring='roc_auc',
    cv=3,
    verbose=3,
    random_state=42,
)

In [176]:
# Run the randomized search and print the best estimator with its score
model_random_search_score("XGBClassifier")

In [None]:
from sklearn.metrics import roc_auc_score

In [178]:
# Fit a standalone XGBoost model with hand-set hyper-parameters.
# NOTE(review): the model is fitted on X_train directly, without the `preprocess` and
# resampling steps used in `pipe`; confirm this is intended.
model=XGBClassifier(n_estimators=150,max_depth=4,learning_rate=0.1, subsample=0.6)
model.fit(X_train,y_train)
#Test the model
y_predict=model.predict(X_test)
# '\n' (newline) replaces the invalid escape '\X', matching the other metric printouts
print('\nXGBoost Classifier Accuracy: {:.3f}'.format(accuracy_score(y_test, y_predict)))
# Achieved 89.4% accuracy

In [None]:
# Class probabilities on the split the model was fitted and tested on.
# X_train / X_test are the names produced by train_test_split; the *_split names
# used here before are not defined anywhere in the notebook.
pred_train = model.predict_proba(X_train)
pred_valid = model.predict_proba(X_test)

In [None]:
# ROC AUC on the training split, using the positive-class probability column
# (y_train is the label vector from train_test_split; y_train_split is never defined)
roc_auc_score(y_train, pred_train[:,1])

In [None]:
# ROC AUC on the validation split, using the positive-class probability column
# (y_test is the label vector from train_test_split; y_test_split is never defined)
roc_auc_score(y_test, pred_valid[:,1])

In [None]:
from sklearn.metrics import classification_report
# classification_report needs predicted class labels, not probabilities, so the model's
# hard predictions on the training split are used; print() renders the table line by line.
print(classification_report(y_train, model.predict(X_train)))

In [None]:
# Precision of the validation predictions; y_predict was computed from X_test,
# so it is compared with y_test (y_test_split is never defined)
print('\nXGBoost Classifier Precision: {:.3f}'.format(precision_score (y_test, y_predict)))
# Achieved Precision Score of 0.795

In [None]:
# Recall of the validation predictions; y_predict was computed from X_test,
# so it is compared with y_test (y_test_split is never defined)
print('\nXGBoost Classifier Recall: {:.3f}'.format(recall_score (y_test, y_predict)))
#Achieved Recall Score of 0.793