In [18]:
# Data Processing
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.colors as colors
from plotly.subplots import make_subplots
pio.templates.default = "plotly_white"
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split  # Split dataset for training and testing
from sklearn.impute import SimpleImputer  # Handle missing values
from sklearn.preprocessing import OneHotEncoder  # One-hot encoding categorical variables
from sklearn.ensemble import RandomForestRegressor  # Random Forest algorithm for regression tasks
from sklearn.metrics import mean_absolute_error  # Evaluate regression model performance


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Load both splits from CSV files located in the working directory
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

# Preview the test split: its first five rows, then the labels of its columns
test_preview = test_data.head(5)
test_columns = test_data.columns
print(test_preview)
print('\n\nTest Data Columns')
print(test_columns)

    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001015   Male     Yes          0      Graduate            No   
1  LP001022   Male     Yes          1      Graduate            No   
2  LP001031   Male     Yes          2      Graduate            No   
3  LP001035   Male     Yes          2      Graduate            No   
4  LP001051   Male      No          0  Not Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5720                  0       110.0             360.0   
1             3076               1500       126.0             360.0   
2             5000               1800       208.0             360.0   
3             2340               2546       100.0             360.0   
4             3276                  0        78.0             360.0   

   Credit_History Property_Area  
0             1.0         Urban  
1             1.0         Urban  
2             1.0         Urban  
3             NaN     

In [None]:
# Count the missing entries per column in each split before reporting them
test_null_counts = test_data.isnull().sum()
train_null_counts = train_data.isnull().sum()

# Report the test split first, then the train split, separated by blank lines
print('Test Data Null Values')
print(test_null_counts)
print('\n')
print('Train Data Null Values')
print(train_null_counts)

Test Data Null Values
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


Train Data Null Values
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [None]:
# Remove every row that contains a null value and drop the Loan_ID column
# (it serves no purpose for the analysis) from both splits.
# errors='ignore' keeps this cell re-runnable: on a second execution Loan_ID
# is already gone, and without the flag .drop() would raise a KeyError.
test_data = test_data.dropna().drop(columns=['Loan_ID'], errors='ignore')
train_data = train_data.dropna().drop(columns=['Loan_ID'], errors='ignore')

# Ensure that all rows with null values have been deleted
print('Test Data Null Values')
print(test_data.isnull().sum(), '\n')

print('Train Data Null Values')
print(train_data.isnull().sum())

Test Data Null Values
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64 

Train Data Null Values
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64


In [None]:
def plot_category_pie(data, column, title, display_names=None):
    """Build a donut chart of the number of rows in each category of `column`.

    Slice labels are taken from the category values found in the data, so they
    cannot drift out of step with the counts (the hard-coded label lists used
    previously depended on the groupby sort order).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are counted.
    column : str
        Name of the categorical column to group by.
    title : str
        Title shown above the chart.
    display_names : dict, optional
        Maps raw category values to the label shown on the chart; values that
        are not in the mapping are shown unchanged.

    Returns
    -------
    The plotly figure, not yet shown.
    """
    counts = data.groupby(column).size().reset_index(name='Applicants')
    if display_names:
        counts[column] = counts[column].map(lambda raw: display_names.get(raw, raw))
    pie = px.pie(counts,
                 values='Applicants',
                 names=column,
                 hole=0.5,
                 color_discrete_sequence=px.colors.qualitative.Pastel)
    pie.update_traces(textposition='inside',
                      textinfo='percent+label+value')
    pie.update_layout(title_text=title,
                      title_font=dict(size=24))
    return pie


def plot_histogram(values, xlabel, title, bins=10):
    """Show a matplotlib histogram of `values` with black bar edges.

    Parameters
    ----------
    values : pandas.Series
        Numeric values to bin.
    xlabel : str
        Label for the x axis (the y axis is always labelled 'Frequency').
    title : str
        Title shown above the plot.
    bins : int, optional
        Number of bins; defaults to 10.
    """
    _, ax = plt.subplots()
    ax.hist(values, bins=bins, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    plt.show()


def plot_share_pie(values, title, figsize):
    """Show a matplotlib pie chart of the share of each distinct value.

    Parameters
    ----------
    values : pandas.Series
        Column whose distinct values are counted with value_counts().
    title : str
        Title shown above the plot.
    figsize : tuple
        Figure size passed to matplotlib, as (width, height).
    """
    counts = values.value_counts()
    _, ax = plt.subplots(figsize=figsize)
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90)
    ax.set_title(title)
    plt.show()


# Display statistics regarding the gender of all loan applicants using pie chart
genderFig = plot_category_pie(test_data, 'Gender', 'Gender of Loan Applicants')
genderFig.show()


# Display married status of applicants with pie chart
marriedFig = plot_category_pie(test_data, 'Married', 'Married Status of Loan Applicants')
marriedFig.show()


# Display graduated status or not by pie chart
gradFig = plot_category_pie(test_data,
                            'Education',
                            'Graduate Status of Loan Applicants',
                            display_names={'Not Graduate': 'Non-Graduate'})
gradFig.show()


# Display number of dependants using a bar graph (built once; the earlier
# duplicate of this chart was never shown)
numDependants = test_data.groupby('Dependents').size().reset_index(name='Applicants')
fig = px.bar(numDependants,
             x='Dependents',
             y='Applicants',
             title='Number of Dependants of Applicants',
             # labels must be a dict mapping column name -> displayed text
             labels={'Dependents': 'Number of Dependents',
                     'Applicants': 'Number of Applicants'})
fig.update_layout(showlegend=False)
# Treat the dependents buckets as categories, not numbers.
# NOTE(review): presumably the column can hold a non-numeric bucket such as
# '3+', which a numeric axis would not place -- confirm against the data.
fig.update_xaxes(type='category')
fig.show()


# Create pie chart for self employed status
selfEmployedFig = plot_category_pie(test_data,
                                    'Self_Employed',
                                    'Self-Employed Status of Loan Applicants',
                                    display_names={'No': 'Non Self-Employed',
                                                   'Yes': 'Self-Employed'})
selfEmployedFig.show()


# Histograms of the distributions of the three monetary columns
plot_histogram(test_data['ApplicantIncome'],
               'Applicant Income',
               'Distribution of Applicant Incomes')
plot_histogram(test_data['CoapplicantIncome'],
               'Coapplicant Income',
               'Distribution of Coapplicant Income')
plot_histogram(test_data['LoanAmount'],
               'Loan Amount',
               'Distribution of Loan Amounts')


# Creating a Countplot of Distribution of Loan Amount Terms
_, term_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=test_data['Loan_Amount_Term'], ax=term_ax)
term_ax.set_xlabel('Loan Amount Term')
term_ax.set_ylabel('Count')
term_ax.set_title('Loan Amount Term Distribution (Countplot)')
term_ax.tick_params(axis='x', labelrotation=45)
plt.show()


# Creating a Piechart of Distribution of Credit History
plot_share_pie(test_data['Credit_History'],
               'Credit History Distribution (Pie Chart)',
               figsize=(6, 4))


# Creating a Piechart of Distribution of Property Areas
plot_share_pie(test_data['Property_Area'],
               'Property Area Distribution (Pie Chart)',
               figsize=(6, 6))

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

Train a machine learning model to predict loan status from our data

In [None]:
# Report the dimensions of the cleaned train data, followed by its column labels
train_shape = train_data.shape
train_columns = train_data.columns
print('Shape of Train Data:', train_shape, '\n')
print('Columns in Train Data:', train_columns)

Shape of Train Data: (480, 12) 

Columns in Train Data: Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [None]:
# Column groups used to train the model: the first list is treated as
# categorical, the second as numerical
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Credit_History',
    'Loan_Amount_Term',
]
numerical_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
]

# Keep the target variable (the value to predict) aside, then remove it from
# the train data so only feature columns remain
loanStatus = train_data['Loan_Status']
train_data = train_data.drop(columns=['Loan_Status'])

In [19]:
# Create the one-hot encoder. handle_unknown='ignore' makes categories that were
# not seen during fitting encode as all zeros instead of raising an error.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the current
# spelling first and fall back to the old one on older installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encode the categorical columns of train_data. The result is built directly
# with meaningful column names and with the same row labels as train_data, so
# the concatenation below lines rows up correctly (rows were removed by
# dropna(), so the index is not a plain 0..n-1 range).
ohe_X = pd.DataFrame(
    encoder.fit_transform(train_data[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=train_data.index,
)

# Replace the original categorical columns with their one-hot encoded
# counterparts (a new frame is built rather than mutating in place)
train_data = pd.concat(
    [train_data.drop(columns=categorical_columns), ohe_X],
    axis=1,
)

# Display the first few rows of the modified train_data DataFrame
train_data.head()


NameError: name 'OneHotEncoder' is not defined

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Features are the prepared train_data columns; the label is the loan status set aside earlier
X = train_data
y = loanStatus

# Step 1: Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: Instantiate the random forest classifier that serves as our machine learning model
model = RandomForestClassifier(random_state=42)

# Step 3: Train the model on the training portion of the data
model.fit(X_train, y_train)

# Step 4: Predict the label of every row in the held-out portion
y_pred = model.predict(X_test)

# Step 5: Score the predictions against the true held-out labels
accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:", classification_report_output, sep="\n")

Accuracy: 0.8229166666666666
Classification Report:
              precision    recall  f1-score   support

           N       0.87      0.46      0.60        28
           Y       0.81      0.97      0.89        68

    accuracy                           0.82        96
   macro avg       0.84      0.72      0.75        96
weighted avg       0.83      0.82      0.80        96

