In [1]:
import pandas as pd

# Load the engineered loan-level feature table produced by the earlier
# processing step and preview its first rows.
# NOTE(review): the recorded preview lists an `Unnamed: 0` column, which looks
# like a row index saved by the upstream export -- confirm whether it should be
# dropped before it is written back out with the risk segments.
df = pd.read_csv(
    "../data/processed/loan_data_features.csv",
)
df.head(5)


Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1,loan_to_income,loan_to_income_capped,credit_score_band
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,25-34,to_inst,98.728814,south,direct,1,45.0,66.954023,5.0,Excellent
1,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,35-44,to_inst,80.019685,south,direct,0,46.0,42.879747,5.0,Excellent
2,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,45-54,not_inst,69.3769,North,direct,0,42.0,38.425926,5.0,Fair
3,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,25-34,not_inst,91.886544,North,direct,0,39.0,66.714559,5.0,Fair
4,24895,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,35-44,not_inst,70.089286,North,direct,0,40.0,70.089286,5.0,Excellent


In [2]:
def risk_score(row):
    """Return the additive rule-based risk score for one loan record.

    Three fields are read from ``row`` by key and each contributes 1, 2 or 3
    points, so the total lies between 3 and 9:

    * ``loan_to_income_capped``: > 4 -> 3 points, > 2 -> 2 points, else 1.
    * ``Credit_Score``: < 600 -> 3 points, < 700 -> 2 points, else 1.
    * ``dtir1``: > 60 -> 3 points, > 40 -> 2 points, else 1.

    Note that a float NaN fails every comparison above and therefore earns
    the minimum of 1 point for that rule.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row[key]`` for the three keys listed above
        (for example a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    int
        Sum of the three rule contributions.
    """
    lti = row['loan_to_income_capped']
    credit = row['Credit_Score']
    dti = row['dtir1']

    # Higher loan-to-income means more points.
    lti_points = 3 if lti > 4 else (2 if lti > 2 else 1)

    # Lower credit score means more points.
    credit_points = 3 if credit < 600 else (2 if credit < 700 else 1)

    # Higher debt-to-income means more points.
    dti_points = 3 if dti > 60 else (2 if dti > 40 else 1)

    return lti_points + credit_points + dti_points


In [3]:
# Score every loan: risk_score is called once per row (axis="columns" hands
# each row to the function) and the results are stored in a new column.
df['risk_score'] = df.apply(risk_score, axis="columns")


In [4]:
def risk_bucket(score):
    """Map a numeric risk score to its segment label.

    Scores up to and including 5 are 'Low Risk', scores up to and including
    7 are 'Medium Risk', and anything else is 'High Risk'.

    Parameters
    ----------
    score : number
        Value compared against the inclusive ceilings 5 and 7.

    Returns
    -------
    str
        One of 'Low Risk', 'Medium Risk' or 'High Risk'.
    """
    # Inclusive upper bounds, checked from the lowest band upwards.
    for ceiling, label in ((5, 'Low Risk'), (7, 'Medium Risk')):
        if score <= ceiling:
            return label
    return 'High Risk'


In [5]:
# Translate each numeric score into its segment label, element by element.
df['risk_segment'] = df['risk_score'].map(risk_bucket)


In [9]:
# Persist the scored and segmented loans; index=False keeps the DataFrame
# index out of the written file.
df.to_csv(
    "../data/processed/loan_data_with_risk_segments.csv",
    index=False,
)


In [6]:
# Share of loans in each risk segment, expressed as a percentage.
df['risk_segment'].value_counts(normalize=True).mul(100)


risk_segment
Medium Risk    60.957861
Low Risk       27.941236
High Risk      11.100903
Name: proportion, dtype: float64

In [7]:
# Mean of the Status column within each risk segment, as a percentage.
df.groupby('risk_segment')['Status'].mean().mul(100)


risk_segment
High Risk      20.029038
Low Risk       13.163359
Medium Risk    16.731445
Name: Status, dtype: float64

In [8]:
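### Explainable Risk Segmentation ###
# This rule-based segmentation approach is designed to ensure transparency, auditability, and alignment with
# regulatory expectations. Each loan's segment follows directly from three additive rule scores
# (loan-to-income, credit score, and debt-to-income), so every risk segment is interpretable and
# directly linked to policy decision-making.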


In [3]:
### Data Quality Check 1: Completeness ###

#The following SQL query was executed in PostgreSQL to validate completeness of all critical risk attributes.


#SELECT
#    COUNT(*) AS total_rows,
#    COUNT(income) AS income_present,
#    COUNT(loan_amount) AS loan_amount_present,
#    COUNT(credit_score) AS credit_score_present,
#    COUNT(ltv) AS ltv_present,
#    COUNT(dtir1) AS dtir1_present,
#    COUNT(risk_segment) AS risk_segment_present
#FROM credit.loan_data_with_risk_segments;


#Result Summary#

# Total records: 124,089  
# All critical risk fields showed 100% completeness  
# No null leakage into the final risk segmentation layer

In [4]:
### Data Quality Check 2: Valid Ranges & Business Rules ###

#This check validates that key numerical risk variables fall within industry-accepted and business-reasonable ranges to ensure
#data integrity and prevent downstream analytical errors.


#SELECT
#    SUM(CASE WHEN credit_score < 300 OR credit_score > 900 THEN 1 ELSE 0 END) 
#        AS invalid_credit_score,
#    SUM(CASE WHEN ltv <= 0 OR ltv > 200 THEN 1 ELSE 0 END) 
#        AS invalid_ltv,
#    SUM(CASE WHEN dtir1 < 0 OR dtir1 > 100 THEN 1 ELSE 0 END) 
#        AS invalid_dtir
#FROM credit.loan_data_with_risk_segments;



#Result Summary#

# No invalid credit score values detected
# No out-of-range LTV values detected
# No invalid debt-to-income ratios detected

#All numerical fields conform to expected financial and regulatory ranges.


In [5]:
### Data Quality Check 3: Risk Segment Consistency ###

#This check validates that the risk segmentation logic meaningfully separates borrowers by financial strength, ensuring interpretability
#and business alignment of the risk model.


#SELECT
#    risk_segment,
#    MIN(credit_score) AS min_credit_score,
#    MAX(credit_score) AS max_credit_score,
#    MIN(ltv) AS min_ltv,
#    MAX(ltv) AS max_ltv,
#    MIN(dtir1) AS min_dtir,
#    MAX(dtir1) AS max_dtir
#FROM credit.loan_data_with_risk_segments
#GROUP BY risk_segment;

#Result Summary#

# Low Risk segment exhibits higher credit scores and lower leverage ratios
# Medium Risk segment shows moderate financial characteristics
# High Risk segment reflects weaker credit profiles and higher leverage

#These results confirm that the segmentation logic aligns with expected credit risk behavior.

