In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [2]:
#Importing cleansed train dataset
# The dataset folder can be overridden with the LOAN_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original location remains the default.
data_path = os.environ.get("LOAN_DATA_DIR", "C:/Users/Sharath P Dandamudi/Desktop/Dataset/")
# os.path.join tolerates a folder given with or without a trailing separator.
train_file = os.path.join(data_path, "train_modified.csv")

train_mod1 = pd.read_csv(train_file,header=0)

In [3]:
#Keep a working copy so the import cell does not have to be re-run if the working dataset is overwritten
train_mod=train_mod1.copy()

In [4]:
#Dropping the identifier variable in the training dataset
# The identifier is read from the untouched import (train_mod1) and the drop is
# non-inplace with errors='ignore', so re-running this cell does not raise a
# KeyError once 'ID' has already been removed from train_mod.
train_mod_id=train_mod1['ID'].copy()
train_mod=train_mod.drop(['ID'],axis=1,errors='ignore')

In [5]:
#Checking the variables in the dataset (column names with their dtypes)
train_mod.dtypes

Disbursed                        float64
Existing_EMI                     float64
Loan_Amount_Applied              float64
Loan_Tenure_Applied              float64
Monthly_Income                     int64
Var4                               int64
Var5                               int64
EMI_Loan_Submitted_Missing         int64
Interest_Rate_Missing              int64
Processing_Fee_Missing             int64
Loan_Amount_Submitted_Missing      int64
Loan_Tenure_Submitted_Missing      int64
age                              float64
Device_Type_0                      int64
Device_Type_1                      int64
Filled_Form_0                      int64
Filled_Form_1                      int64
Gender_0                           int64
Gender_1                           int64
Var1_0                             int64
Var1_1                             int64
Var1_2                             int64
Var1_3                             int64
Var1_4                             int64
Var1_5          

In [None]:
#Feature importance plot produced by XGBoost's built-in helper
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot

#Predictors: every column except the target and EMI_Loan_Submitted_Missing
excluded_columns = ['Disbursed', 'EMI_Loan_Submitted_Missing']
X_train = train_mod.drop(excluded_columns, axis=1)
#Target: the Disbursed column
Y_train = train_mod.loc[:, 'Disbursed']

#Train a default-configured classifier on the full training frame
model = XGBClassifier()
model.fit(X_train, Y_train)

#Render the importance chart for the fitted model
plot_importance(model)
pyplot.show()

In [None]:
#Feature selection using Extra classifiers - Extra Classifiers
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

X_train = train_mod.drop(['Disbursed','EMI_Loan_Submitted_Missing'], axis=1)
Y_train = train_mod['Disbursed']

# Extra-trees are randomised: without a fixed random_state the importances, and
# therefore the list of selected features, change from one run to the next.
clf = ExtraTreesClassifier(random_state=7)
clf = clf.fit(X_train, Y_train)

# Keep the features whose importance reaches a quarter of the mean importance.
model = SelectFromModel(clf, threshold="0.25*mean", prefit=True)
selected = model.get_support()
# get_support() returns a boolean mask aligned with X_train.columns.
features = X_train.columns[selected].tolist()
print(features)

In [None]:
#Dimension reduction using PCA
# import numpy
# from sklearn.decomposition import PCA

# #Feature extraction
# pca = PCA(n_components=4)
# fit = pca.fit(X_train)

# #Summarizing components
# print("Explained Variance: %s" % fit.explained_variance_ratio_)
# print(fit.components_)

In [None]:
#Split into train and test datasets 
#Conventional random sampling - Logistic regression
try:
    # Legacy module, removed in scikit-learn 0.20. Imported only so that code still
    # written against the old `cross_validation` name keeps working on old installs.
    from sklearn import cross_validation
except ImportError:
    pass
from sklearn.linear_model import LogisticRegression
test_size = 0.33
seed = 7

# train_test_split comes from sklearn.model_selection (imported in the first cell).
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_train, Y_train, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train_new, Y_train_new)
result = model.score(X_test_new, Y_test_new)
# Format inside the call: `print(...) % x` only worked with the Python 2 print
# statement and raises TypeError on Python 3.
print("Accuracy: %.3f%%" % (result*100.0))

In [None]:
# K fold cross validation - Logistic regression
num_folds = 10
num_instances = len(X_train)
seed = 7
# model_selection.KFold takes n_splits and infers the sample count when splitting.
# shuffle=True is needed for random_state to take effect (current scikit-learn
# rejects a random_state passed together with shuffle=False).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X_train, Y_train, cv=kfold)
# Format inside the call: `print(...) % x` raises TypeError on Python 3.
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

In [None]:
#Computing accuracy - Logistic regression
num_folds = 10
num_instances = len(X_train)
seed = 7
# model_selection.KFold takes n_splits and infers the sample count when splitting.
# shuffle=True is needed for random_state to take effect (current scikit-learn
# rejects a random_state passed together with shuffle=False).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()


scoring = 'accuracy'
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
# Format inside the call: `print(...) % x` raises TypeError on Python 3.
print("Accuracy: %.3f (%.3f)" % (results.mean(), results.std()))

In [None]:
#Computing LogLoss - Logistic regression
num_folds = 10
num_instances = len(X_train)
seed = 7
# model_selection.KFold takes n_splits and infers the sample count when splitting.
# shuffle=True is needed for random_state to take effect (current scikit-learn
# rejects a random_state passed together with shuffle=False).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()

# The scorer formerly called 'log_loss' is named 'neg_log_loss' in current
# scikit-learn. It reports the negated log loss, so values closer to 0 are better.
scoring = 'neg_log_loss'
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
# Format inside the call: `print(...) % x` raises TypeError on Python 3.
print("Logloss: %.3f (%.3f)" % (results.mean(), results.std()))

In [None]:
#Computing ROC - Logistic regression (This will be relevant only for binary classification problems)
num_folds = 10
num_instances = len(X_train)
seed = 7
# model_selection.KFold takes n_splits and infers the sample count when splitting.
# shuffle=True is needed for random_state to take effect (current scikit-learn
# rejects a random_state passed together with shuffle=False).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()

scoring = 'roc_auc'
results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
# Format inside the call: `print(...) % x` raises TypeError on Python 3.
print("AUC: %.3f (%.3f)" % (results.mean(), results.std()))

In [None]:
#Confusion Matrix - Logistic regression (This will be relevant only for binary classification problems)
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

test_size = 0.33
seed = 7

# train_test_split comes from sklearn.model_selection (imported in the first cell);
# the old sklearn.cross_validation module was removed in scikit-learn 0.20.
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_train, Y_train, test_size=test_size, random_state=seed)

model = LogisticRegression()
model.fit(X_train_new, Y_train_new)
predicted = model.predict(X_test_new)
# Rows are the true classes, columns the predicted classes.
matrix = confusion_matrix(Y_test_new, predicted)
matrix

In [None]:
#Function to construct the confusion matrix (This will be relevant only for binary classification problems)
def show_confusion_matrix(C,class_labels=['0','1']):
    """
    Draw a binary confusion matrix annotated with counts and derived rates.

    C: array-like of shape (2,2) laid out as [[tn, fp], [fn, tp]], i.e. the
       layout indexed by this function (rows = true label, columns = predicted).
    class_labels: sequence of two strings used as tick labels, default '0' and '1'.

    Renders a 3x3 grid: the four counts, the false/true positive rates in the
    right-hand column, the negative/positive predictive values in the bottom
    row and the accuracy in the bottom-right corner. Returns None; the figure
    is displayed with plt.show().

    Raises AssertionError if C is not 2x2.
    """
    import numpy as np

    # Accept nested lists as well as ndarrays, and validate before importing the plotting stack.
    C = np.asarray(C)
    assert C.shape == (2,2), "Confusion matrix should be from binary classification only."

    import matplotlib.pyplot as plt

    # true negative, false positive, etc...
    tn, fp = C[0]
    fn, tp = C[1]

    num_pos = fn + tp  # Num positive examples
    num_neg = tn + fp  # Num negative examples
    total = num_pos + num_neg

    def _ratio(numerator, denominator):
        # float() forces true division on Python 2 as well; an empty denominator
        # yields nan (rendered as 'nan') instead of a numpy divide warning.
        if not denominator:
            return float('nan')
        return float(numerator) / float(denominator)

    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111)
    ax.imshow(C, interpolation='nearest', cmap=plt.cm.gray)

    # Draw the grid boxes
    ax.set_xlim(-0.5,2.5)
    ax.set_ylim(2.5,-0.5)
    for edge in (0.5, 1.5):
        ax.plot([-0.5,2.5],[edge,edge], '-k', lw=2)
        ax.plot([edge,edge],[-0.5,2.5], '-k', lw=2)

    # Third tick (the metrics row/column) is left unlabelled.
    tick_labels = list(class_labels) + ['']

    # Set xlabels
    ax.set_xlabel('Predicted Label', fontsize=16)
    ax.set_xticks([0,1,2])
    ax.set_xticklabels(tick_labels)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    # These coordinate might require some tinkering. Ditto for y, below.
    ax.xaxis.set_label_coords(0.34,1.06)

    # Set ylabels. Tick positions are fixed before the labels are assigned,
    # matching the x axis above.
    ax.set_ylabel('True Label', fontsize=16, rotation=90)
    ax.set_yticks([0,1,2])
    ax.set_yticklabels(tick_labels,rotation=90)
    ax.yaxis.set_label_coords(-0.09,0.65)

    # (x, y, text) for every annotated grid position.
    annotations = [
        # Initial metrics: tp, tn, etc...
        (0, 0, 'True Neg: %d\n(Num Neg: %d)'%(tn,num_neg)),
        (0, 1, 'False Neg: %d'%fn),
        (1, 0, 'False Pos: %d'%fp),
        (1, 1, 'True Pos: %d\n(Num Pos: %d)'%(tp,num_pos)),
        # Secondary metrics: accuracy, true pos rate, etc...
        (2, 0, 'False Pos Rate: %.2f'%_ratio(fp, num_neg)),
        (2, 1, 'True Pos Rate: %.2f'%_ratio(tp, num_pos)),
        (2, 2, 'Accuracy: %.2f'%_ratio(tp + tn, total)),
        (0, 2, 'Neg Pred Val: %.2f'%_ratio(tn, tn + fn)),
        (1, 2, 'Pos Pred Val: %.2f'%_ratio(tp, tp + fp)),
    ]
    for x, y, label in annotations:
        ax.text(x, y, label,
                va='center',
                ha='center',
                bbox=dict(fc='w',boxstyle='round,pad=1'))

    plt.tight_layout()
    plt.show()

In [None]:
#Render the annotated confusion matrix computed earlier (binary classification only)
show_confusion_matrix(matrix, class_labels=['Class 0', 'Class 1'])

In [None]:
#Classification report
from sklearn.metrics import classification_report

test_size = 0.33
seed = 7
# train_test_split comes from sklearn.model_selection (imported in the first cell);
# the old sklearn.cross_validation module was removed in scikit-learn 0.20.
X_train_new, X_test_new, Y_train_new, Y_test_new = train_test_split(
    X_train, Y_train, test_size=test_size, random_state=seed)

model = LogisticRegression()
model.fit(X_train_new, Y_train_new)
predicted = model.predict(X_test_new)
report = classification_report(Y_test_new, predicted)
print(report)

In [None]:
# Compare Algorithms - based on accuracy
import pandas
import matplotlib.pyplot as plt
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection replaces it.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# prepare configuration for cross validation test harness

X_train = train_mod.drop(['Disbursed','EMI_Loan_Submitted_Missing'], axis=1)
Y_train = train_mod['Disbursed']

num_folds = 2
num_instances = len(X_train)
seed = 7
# prepare models
models = []
models.append(('LR', LogisticRegression()))
# models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
# One splitter for all models: with a fixed random_state every model is scored on
# the same folds. shuffle=True is needed for random_state to take effect.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    # Printed inside the loop so every model is reported, not only the last one.
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# The cross_validation module is deprecated. It’s replaced by model_selection.
# The KFold parameters have changed too:
# cross_validation.KFold(n, n_folds=3, shuffle=False, random_state=None)
# model_selection.KFold(n_splits=3, shuffle=False, random_state=None)