In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [None]:
# Read the training data into a DataFrame and report its dimensions
loan = pd.read_csv('../train.csv')
print(f'There are {loan.shape[0]} rows and {loan.shape[1]} columns')
loan.head(5)  # preview the first 5 rows

In [None]:
# Overview of the dataset: column names, non-null counts, and dtypes
loan.info()

In [None]:
# Number of missing values in each column
loan.isna().sum()

In [None]:
# Summary statistics for the numerical features of the dataset
# (describe() reports numeric columns by default)
loan.describe()

## EXPLORATORY DATA ANALYSIS

1. The relationship between the applicant’s gender and the loan application
2. The link between the education background and the applicant’s loan
3. How are the career, credit and property location associated with the loan application respectively?
4. Some overview analysis in terms of your observations and analysis


### 1. The relationship between the applicant’s gender and the loan application
For this exercise, the two relevant columns are `Loan_Status` and `Gender`, both of which are categorical features.

In [None]:
# Loan status counts within each gender group
(
    loan
    .groupby('Gender')['Loan_Status']
    .value_counts()
)

In [None]:
# Count of applications per loan status, split by applicant gender
plt.figure(figsize=(8, 8))
# Pass column names rather than Series so seaborn resolves them from `data`,
# consistent with the other count plots in this notebook.
sns.countplot(x='Loan_Status', hue='Gender', data=loan)
plt.title('Gender vs Loan Status')
plt.xlabel('Loan Status')
plt.ylabel('Count')
plt.show()

### 2. The link between the education background and the applicant’s loan

In [None]:
# Education level counts within each loan status group
(
    loan
    .groupby('Loan_Status')['Education']
    .value_counts()
)

In [None]:
# Count of applications per loan status, split by education level
plt.figure(figsize=(8, 8))
sns.countplot(data=loan, x='Loan_Status', hue='Education').set(
    title='Education vs Loan Status',
    xlabel='Loan Status',
    ylabel='Count',
)
plt.show()

### 3. How are the career, credit and property location associated with the loan application respectively?

In [None]:
# The columns relevant to this question are Self_Employed (career; object, boolean-like),
# Credit_History (credit; float64) and Property_Area (property location; object).
# info() is shown again to check their dtypes and non-null counts.
loan.info()

In [None]:
# One count plot per feature of interest, each showing loan status split by that feature
important_columns = ['Self_Employed', 'Credit_History', 'Property_Area']
for col in important_columns:
    sns.countplot(data=loan, x='Loan_Status', hue=col).set(
        title=f'Loan Status vs {col}',
        xlabel='Loan Status',
        ylabel='Count',
    )
    plt.show()

In [None]:
# Categorical plot of loan status counts: one panel per Self_Employed value,
# with bars split by Credit_History
sns.catplot(x='Loan_Status', col='Self_Employed', hue='Credit_History',
            data=loan, kind='count')
# Render the grid explicitly, like the other plot cells, instead of
# leaving the FacetGrid repr as the cell output.
plt.show()

### 4. Some overview analysis in terms of your observations and analysis

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw `loan` frame is left untouched.
loan2 = loan.copy()

# Encode the categorical Loan_Status and Education columns as integers so
# that they are included in the numeric correlation matrix below.
le = LabelEncoder()
loan2['Loan_Status'] = le.fit_transform(loan2['Loan_Status'])
loan2['Education'] = le.fit_transform(loan2['Education'])
# NOTE: `le` is refit for each column, so afterwards it only holds the
# Education classes.

# Pairwise correlation of every numeric column (including the two encoded ones).
loan_correlation = loan2.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(8, 8))
plt.title('Correlation of Loan Status')
sns.heatmap(loan_correlation, annot=True, cmap='YlGnBu')
plt.show()

> 1. The distribution of loan status varies across applicant groups, such as employment type and gender.
> 2. Loan status is more strongly correlated with credit history than with the other features.
