In [13]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

### Define the following functions as per the description, so that they can be called later to perform the tasks.

In [22]:
# Remove correlated features
to_drop = []
def remove_corelated_features(X_train,val):
    """ Function to remove the correlated features
    
    This function accepts the dataframe X_train,val which creates a correlation matix and removes 
    the correlated features based on certain threshold.
    
    Keyword arguments:
    X_train - Pandas dataframe which contains the independent features.
    val - Certain threshold value by which correlated features to be dropped.
    to_drop - Columns to be dropped
    
    """
    # Create correlation matrix
    corr_matrix = X_train.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.75
    to_drop = [column for column in upper.columns if any(upper[column] > val)]
    print("Columns to be dropped: ",to_drop)
    
    return X_train.drop(to_drop,axis=1)



In [40]:
# check the distribution of the passed features
    
def cal_eval_metric(y_test, y_pred, metric):
    """  Check the distribution of the passed features
    
    This function will check for the metric passed(accuracy/precision/recall/f1) 
    and return the required value.
    
    Keyword Arguments:   
    y_test: actual target values
    y_pred: predicted target values
    metric: the metric to be calculated
    
    Return:
    score - Calculated Evaluation Score
    
    """
    if metric == 'accuracy':
        score = accuracy_score(y_test, y_pred)

    elif metric == 'precision':
        score = precision_score(y_test, y_pred)
    
    elif metric == 'recall':
        score = recall_score(y_test, y_pred)
    
     
    elif metric == 'f1':
        score = f1_score(y_test, y_pred)
        
    elif metric == 'roc_auc':
        score = roc_auc_score(y_test, y_pred)
        
    else:
        print("Please enter proper score metric.")

    return score


### Read the dataset. Take a look at the dataset. 

* Check the data types present in the dataframe.
* Call the num_and_cat_columns() with train as the parameter and store the results.
* Are there any missing values? Are there any Outliers? How do you want to treat them?


In [24]:
# Code starts here
train = pd.read_csv('../data/train.csv')
train.head()
# Code ends here

Unnamed: 0,Id,loan_amnt,funded_amnt,term,int_rate,installment,grade,emp_title,home_ownership,annual_inc,...,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
0,0,27000.0,27000.0,1,141,805.68,6,100315,4,120000.0,...,0.0,0.0,0.0,24502.2,0.0,0.0,0.0,48614.0,27900.0,0
1,1,15000.0,15000.0,0,77,551.36,3,94083,0,100000.0,...,0.0,0.0,0.0,838.69,0.0,0.0,0.0,47703.0,17000.0,0
2,2,7000.0,7000.0,0,8,222.28,0,64843,0,48000.0,...,0.0,0.0,0.0,5097.27,0.0,0.0,0.0,138287.0,19500.0,0
3,3,18950.0,18950.0,0,38,648.5,1,54947,4,54000.0,...,0.0,0.0,0.0,4407.62,0.0,0.0,0.0,75286.0,42800.0,0
4,4,35000.0,35000.0,1,108,976.04,4,56326,0,87000.0,...,0.0,0.0,0.0,17284.95,0.0,0.0,0.0,213561.0,52000.0,0


In [25]:
test = pd.read_csv('../data/test.csv')
test.head()

Unnamed: 0,Id,loan_amnt,funded_amnt,term,int_rate,installment,grade,emp_title,home_ownership,annual_inc,...,total_acc,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,0,20000.0,20000.0,0,6,625.81,0,76811,4,100000.0,...,10.0,0.0,0.0,0.0,14478.4,0.0,0.0,0.0,21984.0,24800.0
1,1,12000.0,12000.0,0,42,412.06,2,56215,0,75000.0,...,45.0,0.0,0.0,0.0,7429.4,0.0,0.0,0.0,265284.0,14200.0
2,2,19750.0,19750.0,0,41,677.89,2,84972,4,45000.0,...,42.0,0.0,939.59,169.0398,677.89,0.0,0.0,0.0,21120.0,78400.0
3,3,11000.0,11000.0,0,1,334.8,0,93121,0,82000.0,...,35.0,0.0,3.08,0.0,334.8,0.0,0.0,0.0,215435.0,14200.0
4,4,11200.0,11200.0,0,20,367.42,1,82777,0,64000.0,...,20.0,0.0,0.0,0.0,367.54,0.0,0.0,960.0,150933.0,27600.0


### Model building

- Separate the features and target and then split the train data into train and validation set.
- Now let's come to the actual task, using linear regression, predict the `Total Compensation`. 
- Try improving upon the `r2_score` (R-Square) using different parameters that give the best score.



In [43]:
# Split the data into train and test
X = train.drop(columns = ['loan_status'])
y = train[['loan_status']]

print(y['loan_status'].unique())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Now let's come to the actual task, using logistic regression to predict the loan_status. 
# fit the model and predit the target values

#Instantiate logistic regression model
L_regressor=LogisticRegression()

# fit the model on train data
L_regressor.fit(X_train, y_train)

# predict the result
y_pred =L_regressor.predict(X_test)


# We will check the model accuracy using `accuracy score`, `precision score`, `recall score`, and `f1 score`. 
# To see your model's performance, call the cal_eval_metric() with respective parameters.

accuracy= cal_eval_metric(y_test,y_pred,  'accuracy')
precision = cal_eval_metric(y_test,y_pred,  'precision')
recall = cal_eval_metric(y_test,y_pred,  'recall')
f1 = cal_eval_metric(y_test,y_pred,  'f1')
roc_auc= cal_eval_metric(y_test,y_pred,  'roc_auc')

print("Accuracy Score: ", accuracy)
print("Precision Score: ", precision)
print("Recall Score: ", recall)
print("F1 Score: ", f1)
print("Roc Auc Curve: ", roc_auc)    


[0 1]
Accuracy Score:  0.9544522887724892
Precision Score:  0.9891186071817193
Recall Score:  0.7220015885623511
F1 Score:  0.8347107438016528
Roc Auc Curve:  0.8602483261858227


### Prediction on the test data and creating the sample submission file.

- Load the test data and store the `Id` column in a separate variable.
- Perform the same operations on the test data that you have performed on the train data.
- Create the submission file as a `csv` file consisting of the `Id` column from the test data and your prediction as the second column.

In [46]:
# Code Starts here
# Prediction on test data

# Read the test data
test = pd.read_csv('../data/test.csv')

# Storing the id from the test file
id_ = test['Id']

# Dropping the same columns from the test data and applying same transformation

test = test.drop(['funded_amnt', 'installment', 'grade', 'collection_recovery_fee', 'total_rev_hi_lim'],axis=1)

# Predict on the test data
y_pred_test = L_regressor.predict(test)
y_pred_test = y_pred_test.flatten()
y_pred_test
# Create a sample submission file
# sample_submission = pd.DataFrame({'Id':id_,'loan_status':y_pred_test})

# Convert the sample submission file into a csv file
# sample_submission.to_csv('sample_submission.csv',index=False)

# Code ends here

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)