In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
train= pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train['source'] = 'Train'
test['source'] = 'Test'

### Merge Both DataFrames for faster Cleaning

In [3]:
data = pd.concat([train,test],ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [None]:
data.head()

### Check for null values

In [None]:
data.apply(lambda x: sum(x.isnull()))

### Fill Numerical Null Values using MICE in Azure ML Studio

In [None]:
data.corr()

In [None]:
submission = pd.DataFrame({ x: data[x] for x in data.columns})
submission.to_csv('data.csv', index=False)

In [4]:
data = pd.read_csv('Cleaned.csv')

In [None]:
data.apply(lambda x: sum(x.isnull()))

### Fill Categorial Null Values using Pivot Table and Mode

In [5]:
from scipy.stats import mode
Dependents = data.pivot_table(values='Dependents',
                                   columns='Education',
                                   aggfunc=lambda x: x.mode().iat[0])
Dep_bool = data['Dependents'].isnull() 
data.loc[Dep_bool,'Dependents'] = data.loc[Dep_bool,'Education'].apply(lambda x: Dependents[x])

In [6]:
from scipy.stats import mode
gender = data.pivot_table(values='Gender',
                                   columns='Education',
                                   aggfunc=lambda x: x.mode().iat[0])
gender_bool = data['Gender'].isnull() 
data.loc[gender_bool,'Gender'] = data.loc[gender_bool,'Education'].apply(lambda x: gender[x])

In [7]:
from scipy.stats import mode
married = data.pivot_table(values='Married',
                                   columns='Education',
                                   aggfunc=lambda x: x.mode().iat[0])
married_bool = data['Married'].isnull() 
data.loc[married_bool,'Married'] = data.loc[married_bool,'Education'].apply(lambda x: married[x])

In [8]:
from scipy.stats import mode
self_employed = data.pivot_table(values='Self_Employed',
                                   columns='Education',
                                   aggfunc=lambda x: x.mode().iat[0])
self_bool = data['Self_Employed'].isnull() 
data.loc[self_bool,'Self_Employed'] = data.loc[self_bool,'Education'].apply(lambda x: self_employed[x])

In [9]:
data.apply(lambda x: len(x.unique()))

ApplicantIncome      752
CoapplicantIncome    437
Credit_History        81
Dependents             4
Education              2
Gender                 2
LoanAmount           259
Loan_Amount_Term      32
Loan_ID              981
Loan_Status            3
Married                2
Property_Area          3
Self_Employed          2
source                 2
dtype: int64

In [10]:
data = data.drop(('Loan_ID'),axis=1) #Remove unwanted columns

### Label Categorical Data

In [11]:
to_label = ['Education','Gender','Married','Property_Area','Self_Employed']
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for x in to_label:
    print(x)
    data[x] = le.fit_transform(data[x])

Education
Gender
Married
Property_Area
Self_Employed


In [None]:
data.head()

### Normalize the data and Feature Engineering

In [12]:
data.loc[data['Credit_History'] > 0.5 , 'Credit_History'] = 1
data.loc[data['Credit_History'] < 0.5 , 'Credit_History'] = 0

In [13]:
data['Credit_History'] = data['Credit_History'].astype(int)

In [14]:
data.loc[data['Dependents'] ==  '3+' , 'Dependents'] = 3
data['Dependents'] = data['Dependents'].astype(int)

In [15]:
normalize_variables = ['ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
data[normalize_variables] = sc_x.fit_transform(data[normalize_variables])

In [18]:
encode_onehot = ['Dependents','Property_Area']
data = pd.get_dummies(data, columns=encode_onehot)
for count in encode_onehot:
     data=data.drop(count + '_0',axis=1)

In [20]:
train = data[data['source'] =='Train']
x_test = data[data['source'] =='Test']

In [21]:
y_train = train['Loan_Status'] 
x_train = train.drop('Loan_Status',axis=1)
x_train = train.drop('source',axis=1)

In [22]:
x_test = x_test.drop('Loan_Status',axis=1)
x_test = x_test.drop('source',axis=1)

In [None]:
x_train.shape,x_test.shape

In [23]:
x_train=x_train.drop('Loan_Status',axis=1)

In [24]:
sc_y = LabelEncoder()
y_train = sc_y.fit_transform(y_train)

### Regression

In [88]:
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier(max_depth=3,min_samples_split=3,n_estimators=100)
rfc.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [89]:
y_pred = rfc.predict(x_test)

In [90]:
arr = []
for x in y_pred:
    if x>0:
        arr.append('Y')
    else:
        arr.append('N')

In [91]:
submission = pd.DataFrame(arr)
submission.to_csv('submit.csv', index=False)

# End

# Extra scratch work for better accuracy

## Splitting the training data into two to c/heck accuracy scores

In [107]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  confusion_matrix
lr = LinearRegression()
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier(max_depth=3,min_samples_split=3,n_estimators=100,max_features=10)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
svc = SVC(kernel = 'linear',)
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors=5)

train_x_1 , test_x_1, train_y_1, test_y_1 = train_test_split(x_train,y_train,test_size=0.33)

def cm(model,train_x,train_y,test_x,test_y):
    model.fit(train_x, train_y)
    y_pred = model.predict(test_x)
    cm = confusion_matrix(test_y,y_pred)
    return cm

model_acc = cm(rfc,train_x_1,train_y_1,test_x_1,test_y_1)
accuracy = (model_acc[0][0]+model_acc[1][1])/(model_acc[0][0]+model_acc[0][1]+model_acc[1][0]+model_acc[1][1])

In [108]:
accuracy


0.7881773399014779

## Using GridSearch to identify the best parameters for RFC

In [81]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

import matplotlib.pyplot as plt
plt.style.use("ggplot")

df = pd.read_csv('data.csv')

clf = RandomForestClassifier(n_jobs=-1)

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 25],
    
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

def grid_search_wrapper(refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10)
    grid_search = GridSearchCV(clf, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(train_x_1, train_y_1)

    # make the predictions
    y_pred = grid_search.predict(test_x_1)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(test_y_1, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search
grid_search_clf = grid_search_wrapper(refit_score='precision_score')
results = pd.DataFrame(grid_search_clf.cv_results_)
#results = results.sort_values(by='mean_test_precision_score', ascending=False)
results[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score', 'param_max_depth', 'param_min_samples_split', 'param_n_estimators']].round(3).head()


Best params for precision_score
{'max_depth': 25, 'min_samples_split': 3, 'n_estimators': 100}

Confusion matrix of Random Forest optimized for precision_score on the test data:
     pred_neg  pred_pos
neg        32        38
pos         6       127


Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_max_depth,param_min_samples_split,param_n_estimators
0,0.806,0.983,0.82,3,3,100
1,0.806,0.979,0.818,3,3,300
2,0.804,0.983,0.818,3,5,100
3,0.806,0.983,0.82,3,5,300
4,0.803,0.979,0.815,3,10,100
