
[source](https://www.kaggle.com/ajaymanwani/loan-approval-prediction/notebook)

[pickle example](https://www.kaggle.com/vikasukani/loan-eligibility-prediction-machine-learning)


In [None]:
################### Imports and raw data load ######################
import pandas as pd

# Read the raw training CSV; the path is relative to the working directory.
train_df1 = pd.read_csv(filepath_or_buffer='../../data/raw/train.csv')

# Column overview first, then a preview of the leading rows
# (as the last expression, the preview is what the cell displays).
train_df1.info()
train_df1.head()

In [None]:
############ Declare the Categorical and Numerical Columns ######################
# 'Loan_ID' is dropped before any plotting or modelling.
train_df2 = train_df1.drop(columns='Loan_ID')

# Columns plotted as categorical features.
# NOTE(review): an earlier variant of this list, without 'Credit_History',
# was kept here as commented-out code.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Credit_History',
    'Loan_Amount_Term',
]
print(categorical_columns)

# Columns plotted as numerical features.
numerical_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
]
print(numerical_columns)

In [None]:
### Data Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# One count plot per categorical column, with bars split by 'Loan_Status'.
# The number of grid rows is derived from the column list instead of being
# fixed at 4, so adding or removing a categorical column neither raises an
# IndexError nor leaves an empty panel on screen. For the current eight
# columns this is still a 4x2 grid filled row by row.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(categorical_columns) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(12, 15),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)

panels = list(axes.flat)  # row-major order: (0,0), (0,1), (1,0), ...
for panel, cat_col in zip(panels, categorical_columns):
    sns.countplot(x=cat_col, data=train_df2, hue='Loan_Status', ax=panel)

# Hide any panel left over when the column count is odd.
for spare_panel in panels[len(categorical_columns):]:
    spare_panel.set_visible(False)

# Extra vertical spacing between the rows of panels.
plt.subplots_adjust(hspace=1)

In [None]:
# One box plot per numerical column, grouped by 'Loan_Status', in a single row.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(17, 5))
for panel, num_col in zip(axes, numerical_columns):
    sns.boxplot(x='Loan_Status', y=num_col, data=train_df2, ax=panel)

# Summary statistics of the same columns, printed alongside the plots.
numeric_summary = train_df2[numerical_columns].describe()
print(numeric_summary)
plt.subplots_adjust(hspace=1)

In [None]:
#### Encoding categorical features ####
# Indicator-encode train_df2; drop_first=True omits the first level of every
# encoded column. The target is encoded too, which is where the
# 'Loan_Status_Y' column used by the next cell comes from.
# NOTE(review): get_dummies is called with its default dummy_na=False, so a
# missing value in an encoded column would come out as all-zero indicators,
# the same as the dropped first level. Confirm whether the raw data has such
# gaps and whether that treatment is intended.
train_df2_encoded = pd.get_dummies(data=train_df2, drop_first=True)
train_df2_encoded.head()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

########## Separate the target from the features ############
y = train_df2_encoded['Loan_Status_Y']
X = train_df2_encoded.drop(columns='Loan_Status_Y')

################# 80/20 train-test split #######
# Stratified on the target; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

############### Mean-impute missing values #############
# The imputer is fitted on the training split only and then applied to both
# splits, so no statistic is computed from the test rows.
imp = SimpleImputer(strategy='mean')
imp_train = imp.fit(X_train)
# X_train is replaced by its imputed version; the imputed test features are
# stored separately in X_test_imp, and X_test itself is left as it was.
X_train = imp_train.transform(X_train)
X_test_imp = imp_train.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score,f1_score


# Baseline decision tree with default hyper-parameters. The seed is fixed
# (same value as the train/test split) so the fitted tree and every score
# printed below are reproducible from run to run.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

# Scores on the same rows the tree was fitted on.
y_pred = tree_clf.predict(X_train)
print("Training Data Set Accuracy: ", accuracy_score(y_train,y_pred))
print("Training Data F1 Score ", f1_score(y_train,y_pred))

# 5-fold cross-validation on the training split. Both metrics are collected
# in a single pass so the folds are fitted once instead of once per metric.
# NOTE(review): the validation F1 is macro-averaged ('f1_macro') while the
# training F1 above uses f1_score's default averaging, so the two F1 figures
# are not directly comparable. Confirm which one is intended.
# X_test_imp / y_test are not touched in this cell.
cv_scores = cross_validate(
    tree_clf,
    X_train,
    y_train,
    cv=5,
    scoring=('f1_macro', 'accuracy'),
)
print("Validation Mean F1 Score: ", cv_scores['test_f1_macro'].mean())
print("Validation Mean Accuracy: ", cv_scores['test_accuracy'].mean())