In [1]:
import json
import re
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from typing import List, Dict, Union

import pandas as pd

In [2]:
# create extractor class
class CreditBureauFeatureExtractor:
    def __init__(self, file_paths: Union[Path, str, List[Union[Path, str]]]):
        """
        Initializes the extractor with one or more credit bureau JSON reports.

        Args:
            file_paths (Path | str | List[Path | str]): One or more paths to JSON report files.
        """
        # Ensure list
        if isinstance(file_paths, (str, Path)):
            file_paths = [file_paths]

        self.json_data = []

        for path in file_paths:
            path = Path(path)
            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")

            with open(path, 'r') as file:
                content = json.load(file)
                
                # If the JSON file contains a list of reports, extend the list
                if isinstance(content, list):
                    self.json_data.extend(content)
                else:
                    self.json_data.append(content)

    def extract_features(self) -> pd.DataFrame:
        features = []
        for report in self.json_data:
            app_id = report.get("application_id")
            data = report.get("data", {}).get("consumerfullcredit", {})
            feature_dict = {"application_id": app_id}

            # ---- Account Rating Features ----
            account_rating = data.get("accountrating", {})
            good_total = 0
            bad_total = 0

            for key, value in account_rating.items():
                count = self._safe_int(value)
                if key.endswith('good'):
                    good_total += count
                elif key.endswith('bad'):
                    bad_total += count

            total_accounts = good_total + bad_total
            feature_dict["good_account_ratio"] = good_total / total_accounts if total_accounts > 0 else 1
            
            # ---- Enquiry History Features ----
            enquiry_history_top = data.get("enquiryhistorytop", [])
            feature_dict.update(self.extract_enquiry_features(enquiry_history_top))
            
            # ---- Credit Account Summary Features ----
            credit_summary = data.get("creditaccountsummary", {})
            feature_dict.update(self.extract_credit_account_features(credit_summary))
            
            # ---- Delinquency Information ----
            delinq_info = data.get("deliquencyinformation", {})
            feature_dict["months_in_arrears"] = self._safe_int(delinq_info.get("monthsinarrears", 0))
            
            # ---- Credit Agreement Features ----
            agreements = data.get("creditagreementsummary", [])
            feature_dict.update(self.extract_credit_agreement_features(agreements))
            
            # ---- Payment History Features ----
            monthly_history = data.get("accountmonthlypaymenthistory", [])
            monthly_history_header = data.get("accountmonthlypaymenthistoryheader", {})
            feature_dict.update(self.extract_customer_delinquency_summary(monthly_history, monthly_history_header))
            
            # ---- Demographics Features ----
            personal_details = data.get("personaldetailssummary", {})
            feature_dict.update(self.extract_basic_demographics(personal_details))
            
            # ---- Employment Information ----
            employment_history = data.get("employmenthistory", [])
            feature_dict["occupation"], feature_dict["employer"] = self.extract_most_recent_employment(employment_history)
            
            features.append(feature_dict)
        
        return pd.DataFrame(features)

    
    def extract_enquiry_features(self, enquiryhistorytop: List) -> Dict:
        """Extract key features from inquiry history."""
        current_date = datetime.now()
        total_inquiries = len(enquiryhistorytop)
        recent_inquiries_count = 0
        unique_subscribers = set()

        for enquiry in enquiryhistorytop:
            try:
                enquiry_date = datetime.strptime(enquiry.get("daterequested", ""), "%d/%m/%Y %H:%M:%S")
                if (current_date - enquiry_date).days <= 180:  # Last 6 months
                    recent_inquiries_count += 1
                unique_subscribers.add(enquiry.get("subscribername", ""))
            except (ValueError, TypeError):
                continue

        return {
            "total_inquiries": total_inquiries,
            "recent_inquiries_count": recent_inquiries_count,
            "unique_lenders_inquiries": len(unique_subscribers)
        }
    
    def extract_credit_account_features(self, credit_account_summary: Dict) -> Dict:
        """Extract key features from credit account summary."""
        return {
            "credit_rating": self._safe_int(credit_account_summary.get("rating", 0)),
            "total_accounts": self._safe_int(credit_account_summary.get("totalaccounts", 0)),
            "amount_in_arrears": self._safe_float(credit_account_summary.get("amountarrear", 0)),
            "total_account_arrears": self._safe_int(credit_account_summary.get("totalaccountarrear", 0)),
            "total_outstanding_debt": self._safe_float(credit_account_summary.get("totaloutstandingdebt", 0)),
            "total_monthly_installment": self._safe_float(credit_account_summary.get("totalmonthlyinstalment", 0)),
            "total_number_of_judgements": self._safe_int(credit_account_summary.get("totalnumberofjudgement", 0))
        }
    
    def extract_credit_agreement_features(self, agreements):
        """Extract key features from credit agreements."""
        features = defaultdict(float)
        performing_count = 0
        lost_count = 0
        open_count = 0
        closed_count = 0
        overdraft_count = 0
        unique_lenders = set()
        
        for acc in agreements:
            status = (acc.get("accountstatus") or "").lower()
            perf = (acc.get("performancestatus") or "").lower()
            desc = (acc.get("indicatordescription") or "").lower()
            lender = acc.get("subscribername", "")

            # Extract monetary values
            overdue = self._safe_float(acc.get("amountoverdue", 0))
            inst_amt = self._safe_float(acc.get("instalmentamount", 0))
            open_bal = self._safe_float(acc.get("openingbalanceamt", 0))
            current_bal = self._safe_float(acc.get("currentbalanceamt", 0))

            # Count account statuses
            if status == "open":
                open_count += 1
            elif status == "closed":
                closed_count += 1

            # Count performance statuses
            if perf == "performing":
                performing_count += 1
            elif perf == "lost":
                lost_count += 1
                features["total_amount_lost"] += current_bal

            # Accumulate monetary totals
            features["total_amount_overdue"] += overdue
            features["total_opening_balance"] += open_bal
            features["total_current_balance"] += current_bal
            features["total_instalment_amount"] += inst_amt

            # Count account types
            if "overdraft" in desc:
                overdraft_count += 1
            
            # Track unique lenders
            if lender:
                unique_lenders.add(lender)

        # Calculate key ratios
        features.update({
            "num_open_accounts": open_count,
            "num_closed_accounts": closed_count,
            "num_performing_accounts": performing_count,
            "num_lost_accounts": lost_count,
            "num_overdraft_accounts": overdraft_count,
            "unique_lenders": len(unique_lenders),
            "debt_payment_ratio": features["total_instalment_amount"] / features["total_current_balance"] 
                if features["total_current_balance"] > 0 else 0
        })

        return dict(features)
    
    def extract_customer_delinquency_summary(self, account_history: List, header: Dict) -> Dict:
        """Extract key delinquency features from payment history."""
        month_keys = [f"m{str(i).zfill(2)}" for i in range(1, 25)]
        
        total_months_reported = 0
        total_months_delinquent = 0
        all_delinquencies = []
        recent_delinquencies = 0
        
        for account in account_history:
            for idx, m in enumerate(month_keys):
                val = account.get(m)
                if val is None or val == "#":
                    continue

                try:
                    delinquency = int(val)
                except ValueError:
                    continue

                total_months_reported += 1
                if delinquency > 0:
                    total_months_delinquent += 1
                    all_delinquencies.append(delinquency)
                
                if idx < 6 and delinquency > 0:  # Last 6 months
                    recent_delinquencies += 1

        return {
            "total_months_reported": total_months_reported,
            "total_months_delinquent": total_months_delinquent,
            "max_delinquency": max(all_delinquencies) if all_delinquencies else 0,
            "recent_delinquencies": recent_delinquencies,
            "delinquency_ratio": total_months_delinquent / max(total_months_reported, 1)
        }
    
    def extract_basic_demographics(self, personal_details: Dict) -> Dict:
        """Extract basic demographic information."""
        birthdate_str = personal_details.get("birthdate")
        
        # Calculate age
        try:
            birthdate = datetime.strptime(birthdate_str, "%d/%m/%Y")
            today = datetime.today()
            age = today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))
        except (ValueError, TypeError):
            age = None
        
        return {
            "age": age,
            "gender": personal_details.get("gender")
        }
    
    def extract_most_recent_employment(self, employment_history: List) -> tuple:
        """Extract the most recent employment information."""
        if not employment_history:
            return None, None
            
        # Try to find entry with update date
        entries_with_date = [entry for entry in employment_history if "updatedate" in entry]
        
        if entries_with_date:
            # Sort by date (most recent first)
            try:
                for entry in entries_with_date:
                    entry["parsed_date"] = datetime.strptime(entry["updatedate"], "%d/%m/%Y")
                most_recent = sorted(entries_with_date, key=lambda x: x["parsed_date"], reverse=True)[0]
            except (ValueError, TypeError, IndexError):
                most_recent = entries_with_date[0]
        else:
            # Fall back to first entry
            most_recent = employment_history[0]
        
        return most_recent.get("occupation"), most_recent.get("employerdetail")
    
    def _safe_float(self, value) -> float:
        try:
            return float(str(value).replace(",", ""))
        except:
            return 0.0

    def _safe_int(self, value) -> int:
        try:
            return int(str(value).replace(",", ""))
        except:
            return 0

In [3]:
# test extractor class
extractor = CreditBureauFeatureExtractor(file_paths='data/Credit_bureau_sample_data.json')
dataset = extractor.extract_features()

In [4]:
# Examine dataset
dataset.head()

Unnamed: 0,application_id,good_account_ratio,total_inquiries,recent_inquiries_count,unique_lenders_inquiries,credit_rating,total_accounts,amount_in_arrears,total_account_arrears,total_outstanding_debt,...,total_months_reported,total_months_delinquent,max_delinquency,recent_delinquencies,delinquency_ratio,age,gender,occupation,employer,total_amount_lost
0,97,1.0,15,0,2,13,7,24041.0,2,105435.0,...,50,13,36,1,0.26,34,Male,PUBLIC SERVANTS,ALL MILITARY STAFFS,
1,9714953,1.0,10,0,4,2,17,0.0,1,294770.0,...,58,13,40,1,0.224138,39,Female,CIVIL SERVANT,,
2,9714978,0.666667,10,0,3,109,3,12000.0,1,110919.0,...,25,3,109,1,0.12,41,Female,STUDENT,,12000.0


In [5]:
# Further look at created columns
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 34 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   application_id              3 non-null      int64  
 1   good_account_ratio          3 non-null      float64
 2   total_inquiries             3 non-null      int64  
 3   recent_inquiries_count      3 non-null      int64  
 4   unique_lenders_inquiries    3 non-null      int64  
 5   credit_rating               3 non-null      int64  
 6   total_accounts              3 non-null      int64  
 7   amount_in_arrears           3 non-null      float64
 8   total_account_arrears       3 non-null      int64  
 9   total_outstanding_debt      3 non-null      float64
 10  total_monthly_installment   3 non-null      float64
 11  total_number_of_judgements  3 non-null      int64  
 12  months_in_arrears           3 non-null      int64  
 13  total_amount_overdue        3 non-null 