In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

In [None]:
# Location of the applicant-level records on disk.
path_to_application = '/home/student/Machine Learning/application_record.csv'

# Read the CSV into a DataFrame, then preview its first rows.
applicant_data = pd.read_csv(filepath_or_buffer=path_to_application)
display(applicant_data.head())

# Dataset Documentation

### CODE_GENDER

Description: The gender of the applicant, represented as 'M' for male and 'F' for female.<br>
Importance: Understanding gender distribution can help analyze potential biases or trends in loan approvals.

### FLAG_OWN_CAR
Description: Indicates whether the applicant owns a car (Y for yes, N for no).<br>
Importance: Car ownership may be correlated with financial stability, impacting the likelihood of loan approval.

### FLAG_OWN_REALTY
Description: Indicates whether the applicant owns real estate (Y for yes, N for no).<br>
Importance: Homeownership can be a sign of financial security and can influence creditworthiness assessments.

### CNT_CHILDREN
Description: The number of children the applicant has.<br>
Importance: Family size may affect financial obligations and the applicant's ability to repay loans.

### AMT_INCOME_TOTAL
Description: The total annual income of the applicant.<br>
Importance: This is a critical factor in determining loan eligibility and the amount that can be approved.

### NAME_INCOME_TYPE
Description: The type of income source, e.g., Working, Commercial associate, Pensioner.<br>
Importance: Different income types can indicate varying levels of financial stability and risk.

### NAME_EDUCATION_TYPE
Description: The highest level of education attained by the applicant (e.g., Higher education, Secondary).<br>
Importance: Education level can correlate with earning potential and job stability, affecting loan approval chances.

### NAME_FAMILY_STATUS
Description: The applicant’s family situation (e.g., Single, Married, Civil marriage).<br>
Importance: Family status can influence financial responsibilities and stability.

### NAME_HOUSING_TYPE
Description: The type of housing where the applicant lives (e.g., House/apartment, Rented apartment).<br>
Importance: Housing stability may impact the applicant’s ability to repay a loan.

### DAYS_BIRTH
Description: The applicant's age in days, stored as a negative number because days are counted backwards into the past (e.g., -365 means one year old).<br>
Importance: Age can be a factor in assessing risk, with certain age groups potentially being more reliable borrowers.

### DAYS_EMPLOYED
Description: The number of days the applicant has been employed (negative values indicate time since employment started).<br>
Importance: Employment duration is an indicator of job stability and income reliability, crucial for assessing loan risk.

### FLAG_MOBIL
Description: Indicates whether the applicant has a mobile phone (1 for yes, 0 for no).<br>
Importance: Having a mobile phone can facilitate communication and may correlate with financial stability.

### FLAG_WORK_PHONE
Description: Indicates whether the applicant has a work phone (1 for yes, 0 for no).<br>
Importance: Similar to mobile phone ownership, a work phone can suggest job stability.

### FLAG_PHONE
Description: Indicates whether the applicant has a home phone (1 for yes, 0 for no).<br>
Importance: Access to a phone may indicate stability and the ability to communicate effectively.

### FLAG_EMAIL
Description: Indicates whether the applicant has an email address (1 for yes, 0 for no).<br>
Importance: Having an email address can reflect modern communication norms and accessibility.

### OCCUPATION_TYPE
Description: The applicant’s occupation (e.g., Security staff, Sales staff).<br>
Importance: Occupation type can provide insight into income stability and potential risks associated with loan approval.

### CNT_FAM_MEMBERS
Description: The number of family members living with the applicant.<br>
Importance: The size of the household may affect financial obligations and stability, influencing loan approval outcomes.

In [None]:
# Location of the monthly credit-status records. Stored under its own name so
# that `path_to_application` (the applicant CSV path) is not overwritten with
# an unrelated file path.
path_to_credit = '/home/student/Machine Learning/credit_record.csv'
credit_data = pd.read_csv(path_to_credit)

# Preview the first rows of the credit history table.
display(credit_data.head())

### ID
Description: A unique identifier for each loan account.<br>
Importance: This column allows for the tracking of individual loan records, facilitating analysis over time for each account.

### MONTHS_BALANCE
Description: The time frame in months relative to the current date. A value of 0 indicates the most recent month, while negative values indicate past months (e.g., -1 for one month ago, -2 for two months ago).<br>
Importance: This column is crucial for understanding the historical payment status of each loan account over time, enabling analysis of trends in repayment behavior.

### STATUS
Description: Indicates the payment status of the loan for the corresponding month balance. The possible values are:<br>
0: 1-29 days past due<br>
1: 30-59 days past due<br>
2: 60-89 days overdue<br>
3: 90-119 days overdue<br>
4: 120-149 days overdue<br>
5: Overdue for more than 150 days, including bad debts and write-offs<br>
C: Paid off that month<br>
X: No loan for the month<br>
Importance: The status is critical for assessing the repayment behavior and creditworthiness of borrowers. Analyzing this data helps identify patterns in payment delays and defaults, which are vital for risk assessment and loan management.

### Bar graph representing the 'STATUS' column

In [None]:
# Frequency of every STATUS code across all monthly credit records.
status_counts = credit_data['STATUS'].value_counts()

# One colour per bar; the palette is trimmed to the number of distinct codes.
colors = ['skyblue', 'salmon', 'lightgreen', 'orange', 'purple', 'gold', 'lightcoral', 'lightpink']
bar_colors = colors[:len(status_counts)]

# Draw the bar chart on an explicit Axes, with horizontal tick labels.
fig, ax = plt.subplots(figsize=(10, 6))
status_counts.plot(kind='bar', color=bar_colors, rot=0, ax=ax)
ax.set_title('Distribution of Loan Status')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
ax.grid(axis='y')
plt.show()

In [None]:
# Convert the applicant's age from a negative day count to positive years.
applicant_data['DAYS_BIRTH'] = (-applicant_data['DAYS_BIRTH']) / 365

# Convert employment duration from a negative day count to positive years.
# Rows whose raw DAYS_EMPLOYED is positive come out negative after the sign
# flip; a negative duration is meaningless as a feature, so floor it at 0 years.
# NOTE(review): positive raw values presumably mark applicants who are not
# currently employed — confirm against the dataset description.
applicant_data['DAYS_EMPLOYED'] = ((-applicant_data['DAYS_EMPLOYED']) / 365).clip(lower=0)

# NOTE: this cell overwrites its own input columns, so run it exactly once per
# load of `applicant_data`; running it again would flip the signs back.
applicant_data.head()

### Merging the two datasets based on the IDs

In [None]:
# Inner join on ID: keep only applicants that also have credit-history rows.
merged_df = applicant_data.merge(credit_data, on='ID', how='inner')

merged_df.head()

In [None]:
# STATUS codes that denote some number of days past due.
_OVERDUE_STATUSES = ('0', '1', '2', '3', '4', '5')


def _status_to_label(status):
    """Map a STATUS code to a label: 0 for past-due codes, 1 for 'C', 2 otherwise."""
    if status in _OVERDUE_STATUSES:
        return 0
    if status == 'C':
        return 1
    return 2


# Work on an independent copy that excludes the months flagged 'X'.
merged_df_filtered = merged_df.loc[merged_df['STATUS'].ne('X')].copy()

# Derive the numeric label from each row's STATUS code.
merged_df_filtered['label'] = merged_df_filtered['STATUS'].apply(_status_to_label)

# Every column from both source tables is kept; result_df is the same object.
result_df = merged_df_filtered

result_df.head()

In [None]:
## Checking for Null Values

In [None]:
# Lift pandas' display truncation for both columns and rows.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Null count per column, then keep only the columns that have any nulls.
missing_values = result_df.isna().sum()
missing_columns = missing_values.loc[missing_values.gt(0)]
print(missing_columns)

In [None]:
# Most frequent occupation (the first one if several tie).
occupation = result_df['OCCUPATION_TYPE']
occupation_mode = occupation.mode()[0]

# Replace missing occupations with that most frequent value.
result_df['OCCUPATION_TYPE'] = occupation.fillna(occupation_mode)

# Header line only; the frame itself is not printed.
print("\nUpdated DataFrame:")

In [None]:
# Show every column and row when pandas prints.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Re-count nulls per column after imputation and list the columns still affected.
missing_values = result_df.isna().sum()
has_missing = missing_values > 0
missing_columns = missing_values[has_missing]
print(missing_columns)

In [None]:
# Human-readable name for each numeric label code.
label_name_by_code = {0: 'Rejected Loan', 1: 'Approved Loan', 2: 'No Loan'}

# value_counts() orders by frequency, not by label code. Sort by code so the
# bar order is stable, then derive each tick label from the bar's own code;
# a fixed name list could attach the wrong name to a bar.
label_counts = result_df['label'].value_counts().sort_index()
label_names = [label_name_by_code.get(code, str(code)) for code in label_counts.index]

# Create a bar graph with one tick (and one name) per bar actually drawn.
plt.figure(figsize=(10, 6))
label_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Loan Labels')
plt.xlabel('Loan Status')
plt.ylabel('Count')
plt.xticks(ticks=range(len(label_counts)), labels=label_names, rotation=0)
plt.grid(axis='y')
plt.show()

In [None]:
# Column groups (using the post-rename column names) for the encoding steps.
categorical_ordinal_columns = ['education_type']
categorical_binary_columns = ['owns_car', 'owns_property']
categorical_label_encode_columns = ['family_status', 'housing_type', 'occupation_type', 'status']
# 'days_employed' was listed twice; each column must appear only once.
# NOTE(review): 'total_income' and 'number_of_family_members' also look numeric —
# confirm whether they belong in this list.
numerical_columns = ['age_in_days', 'days_employed', 'num_children']

# Renaming columns

In [None]:
# Mapping from the raw dataset headers to snake_case names.
# Every target name must be unique: the original mapping sent FLAG_MOBIL,
# FLAG_PHONE and FLAG_EMAIL all to 'owns_mobile_phone', which produced three
# columns sharing one name.
# NOTE: DAYS_BIRTH / DAYS_EMPLOYED are converted to years earlier in the
# notebook, so 'age_in_days' / 'days_employed' hold years; the names are kept
# because later cells select columns by these names.
new_column_names_applicant = {
    'ID': 'id',
    'CODE_GENDER': 'gender',
    'FLAG_OWN_CAR': 'owns_car',
    'FLAG_OWN_REALTY': 'owns_property',
    'CNT_CHILDREN': 'num_children',
    'AMT_INCOME_TOTAL': 'total_income',
    'NAME_INCOME_TYPE': 'income_type',
    'NAME_EDUCATION_TYPE': 'education_type',
    'NAME_FAMILY_STATUS': 'family_status',
    'NAME_HOUSING_TYPE': 'housing_type',
    'DAYS_BIRTH': 'age_in_days',
    'DAYS_EMPLOYED': 'days_employed',
    'FLAG_MOBIL': 'owns_mobile_phone',
    'FLAG_WORK_PHONE': 'owns_work_phone',
    'FLAG_PHONE': 'owns_phone',
    'FLAG_EMAIL': 'owns_email',
    'OCCUPATION_TYPE': 'occupation_type',
    'CNT_FAM_MEMBERS': 'number_of_family_members',
    'MONTHS_BALANCE': 'months_balance',
    'STATUS': 'status'
}

# Guard against reintroducing duplicate target names.
assert len(set(new_column_names_applicant.values())) == len(new_column_names_applicant), \
    "rename mapping produces duplicate column names"

# Apply the renaming. A new frame is bound to result_df (no inplace mutation),
# so the frame produced by the labelling cell keeps its original headers.
result_df = result_df.rename(columns=new_column_names_applicant)

In [None]:
# Preview the first rows to check the column headers after the rename step.
result_df.head()

In [None]:
# Column groups (using the post-rename column names) for the encoding steps.
categorical_ordinal_columns = ['education_type']
categorical_binary_columns = ['owns_car', 'owns_property']
categorical_label_encode_columns = ['family_status', 'housing_type', 'occupation_type', 'status']
# 'days_employed' was listed twice; each column must appear only once.
# NOTE(review): 'total_income' and 'number_of_family_members' also look numeric —
# confirm whether they belong in this list.
numerical_columns = ['age_in_days', 'days_employed', 'num_children']

In [None]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# 1. Ordinal-encode the columns listed in categorical_ordinal_columns.
# NOTE(review): with no explicit `categories`, the encoder numbers the values in
# sorted order, which may not match the intended education ranking — confirm.
ordinal_encoder = OrdinalEncoder()
ordinal_codes = ordinal_encoder.fit_transform(result_df[categorical_ordinal_columns])
result_df[categorical_ordinal_columns] = ordinal_codes

result_df.head()

In [None]:
# Encode 'gender' as 1 for 'M' and 0 for 'F'; any other value becomes NaN.
gender_codes = {'M': 1, 'F': 0}
result_df['gender'] = result_df['gender'].map(gender_codes)

result_df.head()

In [None]:
# Label-encode each remaining categorical column in turn. The single encoder
# is refitted per column, so afterwards it holds only the last column's classes.
label_encoder = LabelEncoder()
for column_name in categorical_label_encode_columns:
    encoded_values = label_encoder.fit_transform(result_df[column_name])
    result_df[column_name] = encoded_values

result_df.head()

In [None]:
# Map the Y/N ownership flags to 1/0; any other value becomes NaN.
yes_no_codes = {'Y': 1, 'N': 0}
for flag_column in categorical_binary_columns:
    result_df[flag_column] = result_df[flag_column].map(yes_no_codes)

In [None]:
# Preview the first rows after the preceding encoding cells.
result_df.head()

# Selecting relevant columns

In [None]:
# Drop the 'income_type' column. errors='ignore' makes the cell safe to re-run:
# without it, a second execution raises KeyError because the column is already gone.
result_df = result_df.drop(columns=['income_type'], errors='ignore')

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Re-count nulls after the encoding steps; .map() leaves NaN for any value
# outside its mapping, so this check surfaces unexpected categories.
missing_values = result_df.isnull().sum()

# Fixed typo: the result was assigned to `fmissing_columns`, so the print below
# showed the stale `missing_columns` computed by an earlier cell.
missing_columns = missing_values[missing_values > 0]
print(missing_columns)

In [None]:
# Column groups (using the post-rename column names) for the encoding steps.
categorical_ordinal_columns = ['education_type']
categorical_binary_columns = ['owns_car', 'owns_property']
categorical_label_encode_columns = ['family_status', 'housing_type', 'occupation_type', 'status']
# 'days_employed' was listed twice; each column must appear only once.
# NOTE(review): 'total_income' and 'number_of_family_members' also look numeric —
# confirm whether they belong in this list.
numerical_columns = ['age_in_days', 'days_employed', 'num_children']

In [None]:
# Step 1: Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Step 2: Prepare the features and target variable
feature_columns = ['gender', 'owns_car', 'owns_property', 'num_children',
                   'total_income', 'education_type', 'family_status',
                   'housing_type', 'age_in_days', 'days_employed',
                   'occupation_type', 'number_of_family_members']
X = result_df[feature_columns]
y = result_df['label']  # Target variable derived from the credit status

# Step 3: Split into training and testing sets, grouped by applicant id.
# result_df holds one row per applicant per month, and all of these features
# are constant within an applicant; a plain row-level split would place the
# same applicant in both sets. test_size is the share of applicants (groups),
# not of rows.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_splitter.split(X, y, groups=result_df['id']))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 4: Standardise the features. The scaler is fitted on the training rows
# only, so no test-set statistics leak into training. Features on very
# different scales (e.g. total_income vs. binary flags) otherwise slow or
# prevent solver convergence.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Initialize and train the logistic regression model
# (iteration cap raised above the default so the solver can converge).
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Step 6: Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Step 1: Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Prepare the features and target variable
feature_columns = ['gender', 'owns_car', 'owns_property', 'num_children',
                   'total_income', 'education_type', 'family_status',
                   'housing_type', 'age_in_days', 'days_employed',
                   'occupation_type', 'number_of_family_members']
X = result_df[feature_columns]
y = result_df['label']  # Target variable derived from the credit status

# Step 3: Split into training and testing sets, grouped by applicant id.
# result_df holds one row per applicant per month with identical applicant
# features; a row-level split would let the forest memorise applicants it then
# sees again in the test set. test_size is the share of applicants (groups),
# not of rows.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_splitter.split(X, y, groups=result_df['id']))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 4: Initialize and train the random forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
# Step 1: Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Prepare the features and target variable.
# Use every remaining column except:
#   - 'label'  : the target itself
#   - 'status' : 'label' is computed directly from it, so keeping it leaks the answer
#   - 'id'     : a row identifier, not a predictive feature
X = result_df.drop(columns=['label', 'status', 'id'])
y = result_df['label']  # Target variable derived from the credit status

# Step 3: Split into training and testing sets, grouped by applicant id.
# result_df holds one row per applicant per month; grouping keeps all rows of
# an applicant on the same side of the split. test_size is the share of
# applicants (groups), not of rows.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_splitter.split(X, y, groups=result_df['id']))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Step 4: Initialize and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred = model.predict(X_test)

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))