# In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tabula

# Load the loan records from the Excel workbook and key them by loan ID.
excel_data = pd.read_excel('loan_data.xlsx')
excel_data.set_index('Loan_ID', inplace=True)

# Extract every table tabula can detect in the PDF. With pages='all' the
# result is a list of DataFrames (one per detected table) and may be empty.
pdf_file = 'pda_loan.pdf'
pdf_data = tabula.read_pdf(pdf_file, pages='all')

# Stack the extracted tables into a single DataFrame. pd.concat raises
# ValueError on an empty list, so fall back to an empty frame when tabula
# found no table instead of aborting the whole analysis.
pdf_df = pd.concat(pdf_data) if pdf_data else pd.DataFrame()

# Key the PDF rows by loan ID as well, so the merged frame has one consistent
# index rather than a mix of loan IDs and positional integers.
# NOTE(review): assumes the PDF tables carry a 'Loan_ID' column like the
# workbook does -- the guard makes this a no-op if they do not; confirm.
if 'Loan_ID' in pdf_df.columns:
    pdf_df = pdf_df.set_index('Loan_ID')

# Drop duplicate rows from the Excel data, keeping the first occurrence.
# The comparison covers column values only; the Loan_ID index is not part
# of it.
excel_data = excel_data.drop_duplicates()

# Fill missing applicant incomes with the column median. Assign the result
# back to the column: calling fillna(..., inplace=True) on the selected
# column is a chained assignment, which is deprecated and does not update
# the parent frame under pandas Copy-on-Write.
income_median = excel_data['ApplicantIncome'].median()
excel_data['ApplicantIncome'] = excel_data['ApplicantIncome'].fillna(income_median)

# Stack the Excel and PDF records row-wise; columns present in only one
# source are filled with NaN for the other, and column order is not sorted.
merged_data = pd.concat([excel_data, pdf_df], axis=0, sort=False)

# EDA for merged data
#
# NOTE(review): the filters below presume 'Gender' == 1 marks female
# applicants, 'Self_Employed' / 'Graduate' are 0/1 flags, and 'Loan_Status'
# uses 'Y' for approved loans -- inferred from the variable names only;
# confirm against the actual data encoding.

# Row selectors reused by the statistics below.
is_female = merged_data['Gender'] == 1
is_self_employed = merged_data['Self_Employed'] == 1
is_not_self_employed = merged_data['Self_Employed'] == 0
is_graduate = merged_data['Graduate'] == 1

# Share of loans with status 'Y' among the selected applicants.
# Series.get returns 0.0 instead of raising KeyError when the subgroup has no
# 'Y' rows (or is empty), so the cell does not abort on such data.
female_status_share = merged_data.loc[is_female, 'Loan_Status'].value_counts(normalize=True)
female_applicants_approved = female_status_share.get('Y', 0.0)

# Mean applicant income overall and per subgroup (NaN for an empty subgroup).
average_income = merged_data['ApplicantIncome'].mean()
average_income_self_employed = merged_data.loc[is_self_employed, 'ApplicantIncome'].mean()
average_income_not_self_employed = merged_data.loc[is_not_self_employed, 'ApplicantIncome'].mean()
average_income_graduate = merged_data.loc[is_graduate, 'ApplicantIncome'].mean()

graduate_status_share = merged_data.loc[is_graduate, 'Loan_Status'].value_counts(normalize=True)
graduate_applicants_approved = graduate_status_share.get('Y', 0.0)

# Visualizations

# One figure with two side-by-side pie charts: the approved / not-approved
# split for female applicants (left) and for graduate applicants (right).
# Each approval share is a fraction in [0, 1]; its complement is the
# "Not Approved" slice.
plt.figure(figsize=(12, 6))

labels_female = ['Approved', 'Not Approved']
sizes_female = [female_applicants_approved, 1 - female_applicants_approved]
colors_female = ['#ff9999', '#66b3ff']

labels_graduate = ['Approved', 'Not Approved']
sizes_graduate = [graduate_applicants_approved, 1 - graduate_applicants_approved]
colors_graduate = ['#99ff99', '#ffcc99']

pie_panels = [
    ("Female Applicants with Approved Loans", sizes_female, labels_female, colors_female),
    ("Graduate Applicants with Approved Loans", sizes_graduate, labels_graduate, colors_graduate),
]

for panel_slot, (panel_title, panel_sizes, panel_labels, panel_colors) in enumerate(pie_panels, start=1):
    plt.subplot(1, 2, panel_slot)
    plt.pie(
        panel_sizes,
        labels=panel_labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=panel_colors,
    )
    plt.title(panel_title)
    plt.axis('equal')  # Equal aspect ratio keeps each pie circular.

# No plt.show() here: this figure stays open and is rendered together with
# the next figure when plt.show() is called further down.
plt.tight_layout()

# Applicant income distribution over the merged data: a 20-bin histogram
# with a kernel density estimate drawn on top.
plt.figure(figsize=(10, 6))
income_axes = sns.histplot(merged_data['ApplicantIncome'], bins=20, kde=True)
income_axes.set_title("Applicant Income Distribution")
income_axes.set_xlabel("Applicant Income")
income_axes.set_ylabel("Frequency")
plt.show()

# Applicant income by self-employment status: one violin per category, with
# quartile lines drawn inside each violin.
plt.figure(figsize=(10, 6))
violin_axes = sns.violinplot(
    data=merged_data,
    x='Self_Employed',
    y='ApplicantIncome',
    inner="quart",
)
violin_axes.set_title("Applicant Income Distribution by Self-Employment Status")
violin_axes.set_xlabel("Self-Employed")

# Relabel the first two tick positions by hand.
# NOTE(review): this presumes exactly two categories, plotted in the order
# not-self-employed then self-employed -- confirm against the data.
tick_positions = [0, 1]
tick_names = ['Not Self-Employed', 'Self-Employed']
plt.xticks(tick_positions, tick_names)

violin_axes.set_ylabel("Applicant Income")
plt.show()


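# Recorded cell output (appears stale: it names 'sample_data.pdf', whereas the
# cell above reads 'pda_loan.pdf'):
# FileNotFoundError: [Errno 2] No such file or directory: 'sample_data.pdf'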