In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import variance, stdev 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore
from scipy.stats import boxcox

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/matplotlib calls further down.
warnings.filterwarnings("ignore")

In [None]:
# Read the loan dataset from its project-relative CSV path.
df = pd.read_csv(filepath_or_buffer="data/loan_data.csv")
# Trailing bare name so the notebook renders the frame as the cell output.
df

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last two rows.
df.tail(n=2)

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Print each column's name, non-null count and dtype
df.info()

In [None]:
# Summary statistics of the numerical columns, transposed to one row per column
df.describe().transpose()

In [None]:
# Descriptive statistics of the object-dtype columns, transposed to one row per column
df.describe(include='object').transpose()

In [None]:
# Column labels of the frame
df.columns

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Number of fully duplicated rows (the first occurrence of each is not counted)
df.duplicated(keep='first').sum()

In [None]:
# Labels of every numeric-dtype column
numerical_columns = df.select_dtypes(include='number').columns
numerical_columns

In [None]:
# Labels of every object-dtype (categorical) column
categorical_columns = df.select_dtypes(include='object').columns
categorical_columns

In [None]:
# Count the missing entries in each numerical column
missing_values = df.loc[:, numerical_columns].isna().sum()
print("Missing values in numerical columns:\n", missing_values)

In [None]:
# Count the missing entries in each categorical (object-dtype) column.
# The printed label now names the categorical columns; it previously said "numerical".
missing_values = df[categorical_columns].isnull().sum()
print("Missing values in categorical columns:\n", missing_values)

In [None]:
# Fill missing values in the numerical columns with each column's median.
# The filled Series is assigned back to df[column]: calling
# fillna(..., inplace=True) on the df[column] selection is chained in-place
# mutation, which pandas deprecates and which leaves df unchanged under
# Copy-on-Write.
for column in numerical_columns:
    df[column] = df[column].fillna(df[column].median())

# Check if there are any remaining missing values
print("Remaining missing values:\n", df.isnull().sum())

In [None]:
# Fill missing values in the categorical columns with each column's most
# frequent value; mode()[0] picks the first mode when several values tie.
# The result is assigned back to df[col] instead of using
# fillna(..., inplace=True) on the selection, which is deprecated chained
# in-place mutation and leaves df unchanged under Copy-on-Write.
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Check if there are any remaining missing values
print("Remaining missing values:\n", df.isnull().sum())

### Univariate Analysis

In [None]:
# Frequency of each Property_Area category
df['Property_Area'].value_counts()

In [None]:
# Frequency of each Education category
df['Education'].value_counts()

In [None]:
# Frequency of each Dependents category
df['Dependents'].value_counts()

The `3+` category is why the `Dependents` column is stored with the object (string) dtype rather than as a number.

### Dependent Variable--Loan Status (For prediction)

In [None]:
# Frequency of each Loan_Status class (the prediction target)
df['Loan_Status'].value_counts()

In [None]:
# Outlier detection: convert every numerical column to z-scores so all columns
# share one scale, then draw one horizontal box per column.
z_scores = zscore(df[numerical_columns])

z_scores_df = pd.DataFrame(z_scores, columns=numerical_columns)

plt.figure(figsize=(12, 8))
plt.boxplot(z_scores_df.values, vert=False)
# Name the boxes through the y-axis ticks (boxes sit at positions 1..n) rather
# than boxplot's `labels=` keyword, which Matplotlib 3.9 deprecated in favour
# of `tick_labels=`; plt.yticks works before and after that rename.
plt.yticks(range(1, len(numerical_columns) + 1), numerical_columns)
plt.title('Boxplot of Z-Scores for Numerical Columns')
plt.xlabel('Z-Score')
plt.show()

In [None]:
# Box-Cox helper used to compress extreme values in a numeric column
def remove_outliers_boxcox(data):
    """Shift ``data`` to be strictly positive and return its Box-Cox transform.

    Box-Cox requires strictly positive input, so the values are first moved so
    that the minimum becomes 1e-6. No observations are dropped: the result
    holds one transformed value per input value. The fitted lambda is discarded.
    """
    strictly_positive = data - data.min() + 1e-6
    return boxcox(strictly_positive)[0]

# Run the Box-Cox helper on every numerical column (column-wise apply)
transformed_numerical_columns = df[numerical_columns].apply(remove_outliers_boxcox, axis=0)
transformed_numerical_columns

In [None]:
# Plot one horizontal box per Box-Cox-transformed column
plt.figure(figsize=(12, 8))
plt.boxplot(transformed_numerical_columns.values, vert=False)
# Name the boxes through the y-axis ticks (boxes sit at positions 1..n) rather
# than boxplot's `labels=` keyword, which Matplotlib 3.9 deprecated in favour
# of `tick_labels=`; plt.yticks works before and after that rename.
plt.yticks(range(1, len(numerical_columns) + 1), numerical_columns)
plt.title('Boxplot after Box-Cox Transformation')
plt.xlabel('Transformed Values')
plt.show()

### Plots

In [None]:
# Histogram of the loan term with a KDE overlay; the title is set on the axes histplot returns
sns.histplot(data=df, x='Loan_Amount_Term', kde=True).set_title(
    'A plot showing the distribution of the loan amount term'
)

In [None]:
# Education plotted against marital status, then against gender
sns.catplot(data=df, x="Married", y="Education")
sns.catplot(data=df, x="Gender", y="Education")

In [None]:
# Replace the Gender categories with integer labels
df['Gender'] = LabelEncoder().fit(df['Gender']).transform(df['Gender'])
df['Gender']

In [None]:
# First five encoded Gender values
df['Gender'].head(n=5)

In [None]:
# Replace the Property_Area categories with integer labels
df['Property_Area'] = LabelEncoder().fit(df['Property_Area']).transform(df['Property_Area'])
df['Property_Area'].head(n=15)

In [None]:
# Replace the Education categories with integer labels
df['Education'] = LabelEncoder().fit(df['Education']).transform(df['Education'])
df['Education'].head(n=15)

In [None]:
# Replace the Married categories with integer labels
df['Married'] = LabelEncoder().fit(df['Married']).transform(df['Married'])
df['Married'].head(n=15)

In [None]:
# Replace the Self_Employed categories with integer labels
df['Self_Employed'] = LabelEncoder().fit(df['Self_Employed']).transform(df['Self_Employed'])
df['Self_Employed'].head(n=15)

In [None]:
# Replace the Loan_Status classes (the prediction target) with integer labels
df['Loan_Status'] = LabelEncoder().fit(df['Loan_Status']).transform(df['Loan_Status'])
df['Loan_Status'].head(n=5)

In [None]:
# Preview the frame after label encoding
df.head(n=5)

In [None]:
# One-hot encode Dependents into 0/1 indicator columns prefixed 'Dependents';
# drop_first=True omits the first category's column.
dummys = pd.get_dummies(df['Dependents'], prefix='Dependents', drop_first=True).astype(int)
dummys

In [None]:
# NOTE(review): the drop below is disabled, so Loan_ID and Dependents are still present in df at this point.
# df.drop(['Loan_ID', 'Dependents'],axis = 1, inplace = True)
df.head(10)


In [None]:
# Build the modelling frame: the encoded data plus the Dependents indicator columns.
# Loan_ID (presumably a row identifier) and the raw Dependents strings (e.g. '3+')
# are left out because non-numeric columns make the later StandardScaler step fail.
# drop() returns a new frame, so df is untouched; errors='ignore' keeps the cell
# runnable if those columns are no longer in df.
df_full = pd.concat(
    [df.drop(columns=['Loan_ID', 'Dependents'], errors='ignore'), dummys.astype(int)],
    axis=1,
)
df_full.head(10)

In [None]:
# Separate the independent variables (X) from the dependent variable (y).
# y is selected and copied rather than popped, so df_full is left intact and
# the cell can be re-run without a KeyError on 'Loan_Status'.
X = df_full.drop(columns=['Loan_Status'])
y = df_full['Loan_Status'].copy()

In [None]:
# X holds the independent variables (model inputs); preview the first five rows
X.head(n=5)


In [None]:
# y holds the prediction target; preview the first ten values
y.head(n=10)

In [None]:
# Standardise every feature column with StandardScaler.
# fit_transform returns a bare array, so the column labels and X's index are
# re-attached; carrying the index over keeps the scaled rows aligned with y.
scaler = StandardScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
scaled_data.head()