In [1]:
import pandas as pd
pd.set_option('display.max_columns',50)
from IPython.display import HTML
import numpy as np

import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

# from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

from xgboost import XGBClassifier,plot_importance

from sklearn.metrics import accuracy_score,classification_report

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
%%HTML
<style type="text/css">
/* Force a solid black 1px border and black text on every cell of rendered pandas DataFrame tables. */
table.dataframe td, table.dataframe th {
    border: 1px  black solid !important;
  color: black !important;
}
</style>

In [None]:
# Read the training and test CSVs (paths are relative to the notebook's working
# directory) and preview the first training rows.
train_df, test_df = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))
train_df.head()

In [None]:
# Summary statistics for the training frame (pandas describes numeric columns by default).
train_df.describe()

In [None]:
# Show each column's dtype as a one-column frame so it renders as a table.
train_df.dtypes.to_frame()

In [None]:
# Names of the numeric feature columns.
# select_dtypes(include='number') matches every integer/float width, whereas
# comparing dtypes against the strings 'float' / 'int' only matches the
# platform-default widths and can silently drop columns (e.g. int64 where the
# default int is 32-bit).
num_col = train_df.select_dtypes(include='number').columns
num_col

In [None]:
# Names of the columns stored with the generic 'object' dtype (the categorical features).
is_object_col = train_df.dtypes.eq('object')
cat_feats = train_df.columns[is_object_col]
cat_feats

#### Checking whether any null values exist in the dataframe

In [None]:
# (rows, columns) of the training frame.
train_df.shape

In [None]:
# Number of missing values in each training column.
train_df.isnull().sum()

In [None]:
# Columns of the training frame that contain at least one missing value.
has_missing = train_df.isna().any()
missing_col = train_df.columns[has_missing]
missing_col

In [None]:
# Missing values imputation

# train_df['Gender'] = train_df['Gender'].astype(str).fillna(train_df['Gender'].mode())
# train_df['Married'] = train_df['Married'].astype(str).fillna(train_df['Married'].mode())
# train_df['Dependents'] = train_df['Dependents'].astype(str).fillna(train_df['Dependents'])
# train_df['Self_Employed'] = train_df['Self_Employed'].astype(str).fillna(train_df['Self_Employed'].mode())
# train_df['LoanAmount'] = train_df['LoanAmount'].fillna(train_df['LoanAmount'].mean())
# train_df['Loan_Amount_Term'] = train_df['Loan_Amount_Term'].fillna(train_df['Loan_Amount_Term'].mean())
# train_df['Credit_History'] = train_df['Credit_History'].fillna(train_df['Credit_History'].mean())




In [None]:
def handle_na(dataframe, columns=None):
    """Impute missing values in ``dataframe`` in place.

    Numeric columns are filled with the column mean; every other column is
    filled with its most frequent value (the first mode).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place.
    columns : iterable of str, optional
        Columns to impute. Defaults to every column of ``dataframe`` that
        currently contains a missing value, so the function works on any
        frame without relying on a list computed from a different one.

    Returns
    -------
    None
    """
    if columns is None:
        columns = dataframe.columns[dataframe.isnull().any()]

    for c in columns:
        if c not in dataframe.columns:
            # e.g. a target column that exists only in the training frame
            continue
        col = dataframe[c]
        if pd.api.types.is_numeric_dtype(col):
            fill_value = col.mean()
        else:
            modes = col.mode()
            if modes.empty:
                # column is entirely missing: there is nothing to impute from
                continue
            # Use the scalar mode. Passing the mode *Series* to fillna aligns on
            # the index and would fill at most row 0; casting to str beforehand
            # would turn NaN into the literal 'nan' and leave nothing to fill.
            fill_value = modes.iloc[0]
        dataframe[c] = col.fillna(fill_value)


In [None]:
# Impute the training frame's missing values in place.
handle_na(train_df)

In [None]:
# Re-check the per-column missing counts of the training frame after imputation.
train_df.isnull().sum().to_frame()

In [None]:
# Per-column missing counts of the test frame (before it is imputed).
test_df.isnull().sum().to_frame()

In [None]:
# Impute the test frame's missing values in place.
handle_na(test_df)

In [None]:
# Now identify the presence of outliers in the datasets.
# The string below is explanatory text only; it is evaluated and discarded.

"""
Z score is an important concept in statistics. Z score is also called standard score. This score helps to understand 
if a data value is greater or smaller than mean and how far away it is from the mean. More specifically, Z score tells 
how many standard deviations away a data point is from the mean.

Z score = (x -mean) / std. deviation



A normal distribution is shown below and it is estimated that
68% of the data points lie between +/- 1 standard deviation.
95% of the data points lie between +/- 2 standard deviation
99.7% of the data points lie between +/- 3 standard deviation




"""

# Display the illustration; 'z_score.png' is resolved relative to the notebook's working directory.
from IPython.display import Image
Image(filename='z_score.png')



In [None]:
# Report, for every numeric column, whether it contains a value whose z-score
# magnitude exceeds Z_THRESHOLD standard deviations.
# The z-score is compared against a z threshold (not against the column mean,
# which is on a different scale), and the absolute value is used so unusually
# low values are flagged as well as unusually high ones.
Z_THRESHOLD = 3  # the "99.7% of points" band of a normal distribution

for i in train_df[num_col]:
    col_mean = train_df[i].mean()
    std_dev = np.std(train_df[i])
    z = (train_df[i] - col_mean) / std_dev
    # A constant column gives std_dev == 0 and NaN z-scores; NaN > threshold is
    # False, so such a column is simply reported as having no outliers.
    outlier = [i] if (z.abs() > Z_THRESHOLD).any() else []
    print("mean of {} is {} and standard Deviation is {} ".format(i, col_mean,std_dev))

    print("outlier in {}".format(i), outlier)
    
    

In [None]:
#dropping loan_id from train_df and test_df
# Loan_ID is an identifier, not a feature: keep the test IDs aside,
# then remove the column from both frames.
loan_ids = test_df['Loan_ID'].values
train_df = train_df.drop(columns=['Loan_ID'])
test_df = test_df.drop(columns=['Loan_ID'])

In [None]:
# Preview the training frame after Loan_ID has been dropped.
train_df.head()

In [None]:
# Share of each Loan_Status class, as a percentage of all training rows.
train_df["Loan_Status"].value_counts().div(len(train_df)).mul(100)

In [None]:
# Pie chart of the Loan_Status class balance.
# Keep `ax` bound to the Axes itself (set_title returns a Text object), and
# save through the figure that owns those Axes instead of pyplot's implicit
# "current figure".
ax = train_df["Loan_Status"].value_counts().plot.pie(autopct='%.2f')
ax.set_title("Samples")
ax.get_figure().savefig("../img/Samples.png", bbox_inches="tight")

In [None]:
# categorical_features_indices = np.where(train_df.dtypes != np.float)[0]

# categorical_features_indices

In [None]:
# train_df.iloc[:,[0,1,2,3,4,10]].head()

### Convert to X and Y

In [None]:
# Split the training frame into a feature matrix and a target vector, and take
# the test features as an array; the last line displays the three shapes.
X_train = train_df.drop(columns=["Loan_Status"]).values
Y = train_df["Loan_Status"].values
X_test = test_df.values

(X_train.shape, Y.shape, X_test.shape)

### Perform validation

In [None]:
# 5-fold cross-validation of a CatBoost classifier, scored with weighted F1.
# NOTE(review): the `from catboost import CatBoostClassifier` line in the
# import cell is commented out, so this cell raises NameError on a fresh kernel.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for train_idx, test_idx in kfold.split(X_train):
    x_train, y_train = X_train[train_idx], Y[train_idx]
    x_test, y_test = X_train[test_idx], Y[test_idx]

    model = CatBoostClassifier(
        random_state=27,
        max_depth=4,
        task_type='CPU',
        devices='0:1',
        n_estimators=1000,
        verbose=True,
    )
    # Positions of the categorical columns in the feature matrix.
    model.fit(x_train, y_train, cat_features=[0, 1, 2, 3, 4, 10])
    preds = model.predict(x_test)
    score = f1_score(y_test, preds, average='weighted')
    scores.append(score)
    print(score)
print("Average: ", sum(scores) / len(scores))
    

### Make final prediction using Catboost

In [None]:
# Fit CatBoost on the full training set and predict the test set.
# NOTE(review): this fit uses task_type="GPU" while the cross-validation cell
# uses 'CPU' — confirm the difference is intended.
model = CatBoostClassifier(
    random_state=27,
    max_depth=4,
    n_estimators=1000,
    task_type="GPU",
    devices="0:1",
    verbose=500,
)
model.fit(X_train, Y, cat_features=[0, 1, 2, 3, 4, 10])
preds1 = model.predict(X_test)

### Check Feature Importance

In [None]:
# Feature importances of the fitted model, labelled with the feature column
# names; plot the (up to) 30 largest as a horizontal bar chart.
feat_imp = pd.Series(
    model.feature_importances_,
    index=train_df.columns.drop('Loan_Status'),
)
feat_imp.nlargest(30).plot.barh(figsize=(8, 10))

## XGBoost

### Pre-Processing specific to XGBoost

In [None]:
# Work on copies so the label encoding below leaves train_df / test_df untouched.
train_df_1, test_df_1 = train_df.copy(), test_df.copy()

In [None]:
# Integer-encode the categorical features.
# Each column gets ONE encoder fitted on the union of its train and test values,
# and that same encoder transforms both frames. Fitting separately on train and
# on test can assign different integers to the same category whenever the two
# frames do not contain exactly the same set of values.
categorical_cols = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Dependents',
]

for col in categorical_cols:
    train_values = train_df_1[col].astype(str)
    test_values = test_df_1[col].astype(str)
    col_encoder = LabelEncoder().fit(pd.concat([train_values, test_values]))
    train_df_1[col] = col_encoder.transform(train_values)
    test_df_1[col] = col_encoder.transform(test_values)

# The target exists only in the training frame; `le` stays bound to its encoder
# so the integer labels can be mapped back with le.inverse_transform.
le = LabelEncoder()
train_df_1['Loan_Status'] = le.fit_transform(train_df_1['Loan_Status'])

### Convert to X and Y

In [None]:
# Rebuild the feature matrix, target vector and test matrix from the
# label-encoded copies; the last line displays the three shapes.
X_train = train_df_1.drop(columns=["Loan_Status"]).values
Y = train_df_1["Loan_Status"].values
X_test = test_df_1.values

(X_train.shape, Y.shape, X_test.shape)

In [None]:
import xgboost as xgb

In [None]:
# Parameter dictionary and round count in the style of xgboost's native training API.
# NOTE(review): neither `param` nor `num_round` is referenced by any later cell
# visible here; the models below are built through XGBClassifier instead.
param = {
    'max_depth': 5,  # the maximum depth of each tree
    'eta': 0.3,  # the training step (learning rate) for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # multiclass objective
    'num_class': 3}  # number of classes declared here; NOTE(review): Loan_Status is mapped to Y/N elsewhere in this notebook — confirm 3 is intended
num_round = 20  # the number of training iterations

In [None]:
# 15-fold cross-validation of a default XGBClassifier, reporting accuracy and
# weighted F1 for every fold and the mean F1 at the end.
# NOTE(review): `preds1` is rebound on every fold, so after this cell it holds
# the last fold's validation predictions rather than the CatBoost test predictions.
kfold = KFold(n_splits=15, shuffle=True, random_state=0)
scores = []
for train_idx, test_idx in kfold.split(X_train):
    x_train, y_train = X_train[train_idx], Y[train_idx]
    x_test, y_test = X_train[test_idx], Y[test_idx]

    model = XGBClassifier()
    model.fit(x_train, y_train)
    preds1 = model.predict(x_test)
    print('Accuracy of Model is : ', accuracy_score(y_test, preds1))
    score = f1_score(y_test, preds1, average='weighted')
    scores.append(score)
    print(score)
print("Average: ", sum(scores) / len(scores))
    
    
    

### Making a final Prediction using Xgboost

In [None]:
# Fit an explicitly configured XGBClassifier on the full training set and
# predict the test set.
# NOTE(review): these hyper-parameters differ from the default XGBClassifier()
# scored in the cross-validation cell.
model = XGBClassifier(
    # booster / objective
    booster='gbtree',
    objective='binary:logistic',
    base_score=0.5,
    # tree shape and learning schedule
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=7,
    gamma=0.4,
    max_delta_step=0,
    # row / column sampling
    subsample=1,
    colsample_bytree=0.5,
    colsample_bylevel=1,
    # regularisation and class weighting
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # runtime
    n_jobs=1,
    nthread=None,
    random_state=0,
    seed=None,
    silent=True,
    missing=None,
)
model.fit(X_train, Y)
prediction = model.predict(X_test)

In [None]:
# Start from the sample submission and attach the final XGBoost model's
# predictions for the test set.
# `prediction` is the output of the model fitted on the full training data;
# `preds1` only holds the validation predictions of the last cross-validation
# fold, which do not correspond to the test rows.
submission_df = pd.read_csv('sample_submission_49d68Cx.csv')
submission_df['Probability'] = prediction

In [None]:
# Remove the Loan_Status column that came with the sample file; it is rebuilt from the predictions below.
del submission_df['Loan_Status']

In [None]:
# Yes = 1 ; No = 0

In [None]:
# Translate the numeric predictions into labels: 1 -> 'Y', anything else -> 'N'.
is_approved = submission_df['Probability'].eq(1)
submission_df['Loan_Status'] = is_approved.map({True: 'Y', False: 'N'})

In [None]:
# Preview the submission frame before it is written out.
submission_df.head()

In [None]:
# Drop the intermediate numeric column, then write the submission to the
# working directory without the index.
del submission_df['Probability']
submission_df.to_csv('Xgb_Prediction.csv',index=False)