In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd
import numpy as np

In [92]:
# Generic function for fitting a classification model and assessing its performance
def classification_model(model, data, predictors, outcome):
    """Fit ``model`` and print its training accuracy and 5-fold cross-validation score.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score`` (scikit-learn style).
    data : pandas.DataFrame
        Frame containing both the predictor columns and the outcome column.
    predictors : list of str
        Column names used as features.
    outcome : str
        Column name of the target.

    Returns
    -------
    None
        Results are printed. On return ``model`` has been refitted on all of
        ``data`` so that it can be used outside the function.
    """
    features = data[predictors]
    target = data[outcome]

    # Training-set accuracy: the model is scored on the same rows it was fitted
    # on, so this figure is optimistic compared with the cross-validation score.
    model.fit(features, target)
    predictions = model.predict(features)

    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    accuracy = metrics.accuracy_score(target, predictions)
    print("Accuracy : {0:.3%}".format(accuracy))

    # K-fold cross-validation with 5 folds (KFold default: contiguous folds, no shuffling)
    kf = KFold(n_splits=5)
    fold_scores = []

    for train, test in kf.split(data):
        # Refit on the training rows of this fold only ...
        model.fit(features.iloc[train, :], target.iloc[train])
        # ... and record model.score on the held-out rows (a score, not an error).
        fold_scores.append(model.score(features.iloc[test, :], target.iloc[test]))

    print("Cross-Validation Score : {0:.3%}".format(np.mean(fold_scores)))

    # Fit the model again on the full data so that it can be referred to outside the function
    model.fit(features, target)

In [93]:
# Load the training data used by every model below; the relative path is
# resolved against the notebook's working directory.
df = pd.read_csv('cleaned_training_data.csv')

# Logistic Regression

In [94]:
# Baseline: logistic regression on Credit_History alone.
outcome_var = 'Loan_Status'
predictor_var = ['Credit_History']
model = LogisticRegression(solver='lbfgs')
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 81.374%
Cross-Validation Score : 81.386%


Intuitively, Credit History should be a big indicator of whether a person will be accepted for a loan or not. Our accuracy of roughly 81% using this single feature supports that intuition.

In [95]:
# Class balance of the target column
df['Loan_Status'].value_counts()

1    378
0    175
Name: Loan_Status, dtype: int64

In [96]:
# Re-evaluate the same model with more categorical features next to Credit_History.
predictor_var = [
    'Credit_History',
    'Education',
    'Married',
    'Self_Employed',
    'Property_Area',
]
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 81.374%
Cross-Validation Score : 81.386%


Despite adding more features, our accuracy hasn't increased at all. This tells us that Credit_History is dominating the model's decision on whether to give out a loan, and the additional features are contributing nothing. Let's try another model and see how it affects our accuracy.

In [97]:
# Swap in a decision tree on categorical features.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so an unseeded tree is not guaranteed to be identical between runs.
model = DecisionTreeClassifier(random_state=42)
predictor_var = ['Credit_History', 'Gender', 'Married', 'Education']
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 81.374%
Cross-Validation Score : 81.386%


Despite changing the models, the other categorical variables fail to outweigh the effects of Credit_History. Let's try running a model using the numerical variables along with Credit_History

In [98]:
# Same tree model, now with numerical features next to Credit_History.
predictor_var = [
    'Credit_History',
    'Loan_Amount_Term',
    'LoanAmount_log',
]
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 89.512%
Cross-Validation Score : 69.980%


Our Cross Validation Score is significantly lower than our accuracy, showing that we're probably overfitting

In [99]:
# Extend the feature set with TotalIncome_log and Dependents and re-evaluate.
predictor_var = [
    'Credit_History',
    'Loan_Amount_Term',
    'LoanAmount_log',
    'TotalIncome_log',
    'Dependents',
]
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 70.357%


We're definitely overfitting here, let's try to reduce our features to only include the top 3 features

In [100]:
# Feature importances of the most recently fitted model, largest first.
featimp = pd.Series(
    data=model.feature_importances_,
    index=predictor_var,
).sort_values(ascending=False)
featimp

TotalIncome_log     0.389381
Credit_History      0.308681
LoanAmount_log      0.209864
Dependents          0.046261
Loan_Amount_Term    0.045813
dtype: float64

In [101]:
# Fresh decision tree restricted to the three most important features.
# random_state is fixed so that tie-breaking between equally good splits (and
# therefore the printed scores) is reproducible across runs.
model = DecisionTreeClassifier(random_state=42)
predictor_var = ['Credit_History', 'TotalIncome_log', 'LoanAmount_log']
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 100.000%
Cross-Validation Score : 72.531%


Still overfitting, let's try to tune the model parameters

In [102]:
# Limit tree depth to reduce overfitting; max_features equals the number of
# predictors passed below. random_state is fixed so the result is reproducible.
model = DecisionTreeClassifier(max_depth=7, max_features=3, random_state=42)
predictor_var = ['Credit_History', 'TotalIncome_log', 'LoanAmount_log']
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 85.895%
Cross-Validation Score : 77.761%


Tuning the parameters a little bit gets us a slightly better Cross Validation Score, let's try using a Random Forest

In [103]:
# Random forest with shallow, regularised trees.
# The forest is stochastic (bootstrap samples, one randomly chosen feature per
# split), so random_state is fixed to make the printed scores reproducible.
model = RandomForestClassifier(
    n_estimators=25,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
predictor_var = ['TotalIncome_log', 'LoanAmount_log', 'Credit_History', 'Dependents', 'Property_Area']
classification_model(model, df, predictor_var, outcome_var)

Accuracy : 83.544%
Cross-Validation Score : 81.201%


Funnily enough, the Logistic Regression model using a single feature is still giving us the best results. This tells me that I'd probably need to do some more in-depth feature engineering to get a better score.

In [104]:
# Fresh logistic-regression instance for the final model
model = LogisticRegression(solver='lbfgs')

In [105]:
# Train the final model on every training row, with Credit_History as the only feature
model.fit(df[['Credit_History']], df['Loan_Status'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [106]:
# Load the test set that the final model will score
df_test = pd.read_csv('test_lAUu6dG.csv')

In [107]:
# Drop, in place, every test row that has a missing value in any column.
# NOTE(review): rows removed here receive no prediction further down — confirm
# that is acceptable, or impute the missing values instead.
df_test.dropna(inplace=True)

In [112]:
# Integer-encode the categorical columns of the test set.
# NOTE(review): the encoder is re-fitted on the test values of each column, so
# the resulting codes are not guaranteed to match the encoding used to produce
# cleaned_training_data.csv (that step is not visible here) — confirm before
# feeding these columns to a model. Only Credit_History is used for prediction.
catergorical_vars = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
le = LabelEncoder()
for i in catergorical_vars:
    # fit_transform re-fits `le` on each column, so after the loop it only
    # remembers the classes of the last column
    df_test[i] = le.fit_transform(df_test[i])

In [117]:
# Store the model's predicted class for each remaining test row in the Loan_Status column
df_test['Loan_Status'] = model.predict(df_test[['Credit_History']])

In [118]:
# Preview the first five scored rows
df_test.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,1,1,0,0,0,5720,0,110.0,360.0,1.0,2,1
1,LP001022,1,1,1,0,0,3076,1500,126.0,360.0,1.0,2,1
2,LP001031,1,1,2,0,0,5000,1800,208.0,360.0,1.0,2,1
4,LP001051,1,0,0,1,0,3276,0,78.0,360.0,1.0,2,1
5,LP001054,1,1,0,1,1,2165,3422,152.0,360.0,1.0,2,1


# Key Takeaways

- Using a more sophisticated model does not guarantee better results
- It's not always wise to use blackbox models like the Random Forest without knowing how they work
- Feature Engineering is a very important step and should not be overlooked
- Since the dataset was a bit skewed (we had many more positive cases than negative cases), the cross-validation accuracy might have been misleading. For future reference, I can look at the confusion matrix or the ROC curve to validate my results.
- Grid Search might help me in the future to decide how to handle null values and how to pick hyperparameters for my model.
- Setting up a Pipeline of data cleaning steps will not only give me access to a suite of data cleaning steps for future reference, but will also help me make sure that the data cleaning steps for my test set are the same as the ones for my training set.
- Handling categorical values with LabelEncoder may mislead some models, because the integer codes imply an ordering that the categories don't actually have. If the number of categories isn't huge, it may be better to use One Hot Encoding.