In [24]:
import pandas as pd
import numpy as np
import os
import re

In [3]:
# Directory containing the data folders and Excel layout file
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs as-is on a machine with this exact directory layout.
# data_dir is expected to hold one 'historical_data_2024Q<n>' folder per quarter.
data_dir = '/Users/dr/Documents/GitHub/MBS_RiskManagement/data/'
# Workbook whose sheets describe the column layout of the raw data files.
layout_file = '/Users/dr/Documents/GitHub/MBS_RiskManagement/READ_ME/SF LLD File Layout Release 44.xlsx'

In [4]:
# Function to parse sheet content from Excel
def parse_sheet_from_excel(sheet_name):
    """Derive column names and pandas dtypes from one sheet of the layout workbook.

    The sheet is read without a header from the module-level ``layout_file``.
    Rows are consumed starting right after the first row whose first cell
    contains 'FIELD POSITION' (or from the top when no such row exists) and
    stop at the first row whose first cell is empty or whose second cell is
    not a string.

    Parameters
    ----------
    sheet_name : str
        Name of the worksheet to read.

    Returns
    -------
    tuple[list[str], dict[str, str]]
        The attribute names in sheet order, and a mapping from attribute name
        to a pandas dtype string ('object', 'Int64', 'float64' or
        'datetime64[ns]').
    """
    df = pd.read_excel(layout_file, sheet_name=sheet_name, header=None)
    columns = []
    dtypes = {}

    # Cast to str before using the .str accessor so the header search also
    # works when the first column was parsed as purely numeric.
    header_rows = df.index[df[0].astype(str).str.contains('FIELD POSITION', na=False)].tolist()
    start_row = header_rows[0] + 1 if header_rows else 0

    for _, row in df.iloc[start_row:].iterrows():
        if pd.isna(row[0]) or not isinstance(row[1], str):
            break
        attribute_name = row[1].strip()
        # A missing or non-text type cell falls back to 'object' instead of
        # failing on .strip().
        data_type = row[2].strip() if isinstance(row[2], str) else 'object'
        columns.append(attribute_name)
        if 'Alpha' in data_type or '- PYYQnXXXXXXX' in data_type:
            # 'Alpha' also covers 'Alpha Numeric'.
            dtypes[attribute_name] = 'object'
        elif 'Numeric' in data_type and ' - ' not in data_type:
            # Plain 'Numeric' -> nullable integer.
            dtypes[attribute_name] = 'Int64'
        elif 'Numeric - ' in data_type:
            # 'Numeric - <format>' -> float.
            dtypes[attribute_name] = 'float64'
        elif 'Date' in data_type:
            # NOTE(review): casting raw numeric YYYYMM values to this dtype
            # treats them as an epoch offset rather than a calendar date.
            dtypes[attribute_name] = 'datetime64[ns]'
        else:
            dtypes[attribute_name] = 'object'
    print(f"{sheet_name} columns: {columns}")
    print(f"{sheet_name} dtypes: {dtypes}")
    return columns, dtypes

In [5]:
# Parse Origination and Performance sheets from Excel
# Each call returns (column names, dtype mapping) for one raw file type.
# NOTE: origination_dtypes and performance_dtypes are rebound to different
# mappings further down in this notebook, so the conversion cell that follows
# must run before those later cells.
origination_columns, origination_dtypes = parse_sheet_from_excel('Origination Data File')
performance_columns, performance_dtypes = parse_sheet_from_excel('Monthly Performance Data File')

Origination Data File columns: ['Credit Score', 'First Payment Date', 'First Time Homebuyer Flag', 'Maturity Date', 'Metropolitan Statistical Area (MSA) Or Metropolitan Division', 'Mortgage Insurance Percentage (MI %)', 'Number of Units', 'Occupancy Status', 'Original Combined Loan-to-Value (CLTV)', 'Original Debt-to-Income (DTI) Ratio', 'Original UPB', 'Original Loan-to-Value (LTV)', 'Original Interest Rate', 'Channel', 'Prepayment Penalty Mortgage (PPM) Flag', 'Amortization Type (Formerly Product Type)', 'Property State', 'Property Type', 'Postal Code', 'Loan Sequence Number', 'Loan Purpose', 'Original Loan Term', 'Number of Borrowers', 'Seller Name', 'Servicer Name', 'Super Conforming Flag', 'Pre-HARP Loan Sequence Number', 'Special Eligibility Program', 'HARP Indicator', 'Property Valuation Method', 'Interest Only (I/O) Indicator', 'Mortgage Insurance Cancellation Indicator']
Origination Data File dtypes: {'Credit Score': 'Int64', 'First Payment Date': 'datetime64[ns]', 'First Time

In [6]:
# Quarter labels (Q1..Q4) used to build the 2024 folder and file names
quarters = [f'Q{number}' for number in range(1, 5)]

In [7]:
# Process each quarter
def _convert_raw_file(folder_path, raw_name, output_name, columns, dtypes, kind):
    """Name the columns of one raw file, cast them, and save the result as CSV.

    Parameters
    ----------
    folder_path : str
        Quarter folder that contains the raw file and receives the output.
    raw_name : str
        File name of the headerless, pipe-delimited raw file.
    output_name : str
        File name of the CSV written next to the raw file.
    columns : list[str]
        Column names to assign, in file order.
    dtypes : dict[str, str]
        Target dtype per column.
    kind : str
        'origination' or 'performance'; only used in the log messages.

    Returns
    -------
    pandas.DataFrame or None
        The typed frame, or None when the raw file does not exist.
    """
    raw_path = os.path.join(folder_path, raw_name)
    if not os.path.exists(raw_path):
        print(f"{kind.capitalize()} file not found: {raw_name}")
        return None

    frame = pd.read_csv(raw_path, sep='|', header=None, names=columns, encoding='utf-8', low_memory=False)
    try:
        frame = frame.astype(dtypes, errors='raise')
    except (ValueError, TypeError) as e:
        # Strict casting failed (e.g. decimals in a column declared 'Int64').
        # Retry leniently so that columns which cannot be cast keep the dtype
        # read_csv inferred, instead of aborting the whole file.
        print(f"Type casting error for {kind}_df: {e}")
        frame = frame.astype(dtypes, errors='ignore')

    # NOTE(review): columns declared 'datetime64[ns]' are cast from the raw
    # numeric YYYYMM values, so the CSV holds epoch-style strings such as
    # '1970-01-01 00:00:00.000202403'. The later date-parsing cell recovers
    # YYYYMM from the trailing digits, so this must not be changed in isolation.
    output_path = os.path.join(folder_path, output_name)
    frame.to_csv(output_path, index=False)
    print(f"Assigned columns and data types to {kind} data and saved as {output_path}")
    return frame


for quarter in quarters:
    folder_path = os.path.join(data_dir, f'historical_data_2024{quarter}')

    if not (os.path.exists(folder_path) and os.path.isdir(folder_path)):
        print(f"Folder not found: {folder_path}")
        continue

    print(f"Processing folder: {folder_path}")

    # Assign columns and data types to origination data file
    origination_df = _convert_raw_file(
        folder_path,
        f'historical_data_2024{quarter}.txt',
        f'origination_2024{quarter}_with_columns.csv',
        origination_columns,
        origination_dtypes,
        'origination',
    )

    # Assign columns and data types to performance data file
    performance_df = _convert_raw_file(
        folder_path,
        f'historical_data_time_2024{quarter}.txt',
        f'performance_2024{quarter}_with_columns.csv',
        performance_columns,
        performance_dtypes,
        'performance',
    )

print("Column and data type assignment complete for all 2024 quarters.")

Processing folder: /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q1
Assigned columns and data types to origination data and saved as /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q1/origination_2024Q1_with_columns.csv
Type casting error for performance_df: cannot safely cast non-equivalent float64 to int64
Assigned columns and data types to performance data and saved as /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q1/performance_2024Q1_with_columns.csv
Processing folder: /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q2
Assigned columns and data types to origination data and saved as /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q2/origination_2024Q2_with_columns.csv
Type casting error for performance_df: cannot safely cast non-equivalent float64 to int64
Assigned columns and data types to performance data and saved as /Users/dr/Documents/GitHub/MBS_RiskManagemen

Based on the Origination Data File and Monthly Performance Data File schemas:

Target Variable (to be derived):

- Current Loan Delinquency Status: The number of days the borrower is delinquent in making loan payments as of the end of the monthly reporting period. Used to derive the target (e.g., delinquent if >0). Guide notes: 0 = Current, 1 = 30-59 days, 2 = 60-89 days, ..., RA = Repayment Plan, RF = REO, 999 = Unknown.


Predictor Variables (Features):

From Origination Data:

- Credit Score: The standardized credit score used to evaluate the borrower during the loan origination process. Lower scores indicate higher risk. Guide notes: FICO score, masked as 300 for <300, 850 for >850, or 9999 for missing.
- Original Combined Loan-to-Value (CLTV): The ratio of the original loan amount and any subordinate lien amount to the property value at origination. Higher ratios increase default risk. Guide notes: Rounded to nearest integer, 999 for missing.
- Original Debt-to-Income (DTI) Ratio: The ratio of the borrower's total monthly debt payments to gross monthly income at origination. Higher DTI suggests financial strain. Guide notes: Rounded to nearest integer, 999 for missing or not considered.
- Original Interest Rate: The interest rate on the loan as stated on the note at the time the loan was originated. Higher rates may lead to higher payments and defaults. Guide notes: Reported to the nearest eighth of a percent.
- Original Loan Term: The number of months in which the loan is scheduled to be repaid. Longer terms may reduce monthly payments but increase long-term risk. Guide notes: In months, e.g., 360 for 30-year loans.
- Number of Borrowers: The number of borrowers who are obligated to repay the mortgage note. Multiple borrowers may reduce risk. Guide notes: 99 for missing.
- Property State: The two-letter postal abbreviation for the state in which the property is located. Captures regional economic factors. Guide notes: U.S. states only.
- Occupancy Status: The classification for the property occupancy status at the time the loan was originated. Investment properties have higher risk. Guide notes: P = Primary Residence (owner occupied), S = Second Home, I = Investment Property, 9 = Unknown.


From Performance Data:

- Loan Age: The number of scheduled monthly payments that have elapsed since the loan was originated. Helps capture loan seasoning. Guide notes: In months, 999 for missing.
- Remaining Months to Legal Maturity: The number of months remaining until the loan is scheduled to mature. Shorter terms may indicate higher risk near maturity. Guide notes: In months, 999 for missing.
- Current Actual UPB: The unpaid principal balance of the loan as of the end of the monthly reporting period. Higher UPB may correlate with defaults. Guide notes: Rounded to nearest $1,000, 000000 for zero balance.
- Current Interest Rate: The interest rate on the loan as of the end of the monthly reporting period. Adjustments can affect affordability. Guide notes: Reported to the nearest eighth of a percent, 99.999 for missing.


Rationale for Selection: 

These variables cover borrower creditworthiness, loan affordability, property details, and ongoing performance, which are key drivers of default risk. The target is derived from 'Current Loan Delinquency Status' as a binary flag (1 for delinquent, 0 for current).


Key Identifiers and Supporting Fields:

- Loan Sequence Number: A unique identifier for each loan, critical for merging and tracking across origination and performance data. Guide notes: 12-character alphanumeric, masked for privacy.
- Original Loan-to-Value (LTV): The ratio of the original loan amount to the property value at origination, providing additional context to Original Combined Loan-to-Value (CLTV). Guide notes: Rounded to nearest integer, 999 for missing.
- First Payment Date: The date of the first scheduled payment, offering a temporal anchor for loan age and performance. Guide notes: Format YYYYMM, parsed to a datetime64[ns] value on the first day of that month.

In [46]:
# Columns kept for the logistic regression, in output order
selected_columns = [
    # Merge key
    'Loan Sequence Number',
    # Origination features
    'Credit Score',
    'Original Combined Loan-to-Value (CLTV)',
    'Original Loan-to-Value (LTV)',
    'Original Debt-to-Income (DTI) Ratio',
    'Original Interest Rate',
    'Original Loan Term',
    'Number of Borrowers',
    'Property State',
    'Occupancy Status',
    # Monthly performance features
    'Loan Age',
    'Remaining Months to Legal Maturity',
    'Current Actual UPB',
    'Current Interest Rate',
    # Source column for the target
    'Current Loan Delinquency Status',
    # Origination dates
    'First Payment Date',
    'Maturity Date',
]

In [47]:
# Define dtypes for performance data columns (excluding datetime)
# NOTE: this rebinds the name performance_dtypes, which earlier held the
# mapping parsed from the layout workbook. Re-running the raw-file conversion
# cell after this one would therefore use this reduced mapping instead.
# Only the columns listed here get an explicit dtype when the performance CSV
# is read; all other columns are left to read_csv's inference.
performance_dtypes = {
    'Current Loan Delinquency Status': 'object',  # Alpha Numeric, handle as string
    'Remaining Months to Legal Maturity': 'Int64',  # Numeric, allow NaN
    'Modification Cost': 'float64',               # Numeric
    'Step Modification Flag': 'object',           # Alpha, explicitly set to object
    'Delinquency Due to Disaster': 'object',      # Alpha
    'Borrower Assistance Status Code': 'object'   # Alpha
}

In [48]:
# Define dtypes for origination data columns (excluding datetime)
# NOTE: this rebinds origination_dtypes (previously the mapping parsed from the
# layout workbook) to an empty dict, so the origination CSV is read with
# inferred dtypes only.
origination_dtypes = {}  # Empty for now

In [49]:
# Custom date parser function for YYYYMM format from last 6 digits
def parse_ymd_from_last6(date_str):
    """Parse a YYYYMM value taken from the last six characters of the input.

    Despite the name, no day component is read: the result is always the first
    day of the parsed month.

    Parameters
    ----------
    date_str : str, int or float
        Either a string whose last six characters are YYYYMM (for example
        '1970-01-01 00:00:00.000202403'), or a whole number such as 202403,
        which is what a numeric CSV column yields.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        NaT for missing input, unsupported types, non-digit tails or a month
        outside 1..12.
    """
    if pd.isna(date_str):
        return pd.NaT

    # Accept whole numbers as well as strings, so a column that was read back
    # as int/float is not silently turned into NaT.
    if isinstance(date_str, bool):
        return pd.NaT
    if isinstance(date_str, (int, np.integer)):
        date_str = str(date_str)
    elif isinstance(date_str, (float, np.floating)) and float(date_str).is_integer():
        date_str = str(int(date_str))

    if not isinstance(date_str, str):
        return pd.NaT
    try:
        # Extract last 6 digits (YYYYMM)
        last6 = date_str[-6:]
        if len(last6) == 6 and last6.isdigit():
            year = int(last6[:4])
            month = int(last6[4:])
            if 1 <= month <= 12:
                return pd.to_datetime(f"{year}-{month:02d}-01", format='%Y-%m-%d')
        print(f"Failed to parse '{date_str}' as YYYYMM from last 6 digits")
        return pd.NaT
    except ValueError:
        # Raised for values pandas cannot represent as a timestamp.
        print(f"Error parsing '{date_str}'")
        return pd.NaT

In [50]:
# Columns to process as dates. They are not passed to read_csv; the values are
# converted after loading with parse_ymd_from_last6.
# origination_date_cols drives the post-processing loop in the merge cell.
origination_date_cols = ['First Payment Date', 'Maturity Date']
# NOTE(review): performance_date_cols is not referenced by the merge cell,
# which names 'Monthly Reporting Period' directly - kept for reference.
performance_date_cols = ['Monthly Reporting Period']

In [51]:
# Initialize an empty dataframe to store combined data
# NOTE: the merge cell appends to this frame, so this cell has to be re-run
# before re-running the merge cell; otherwise rows accumulate twice.
combined_df = pd.DataFrame()

In [52]:
# Process each quarter
# Per-quarter selections are collected here and concatenated once after the
# loop. Building combined_df from scratch in this cell makes the cell safe to
# re-run: it no longer appends to whatever combined_df already contains.
quarter_frames = []

for quarter in quarters:
    folder_path = os.path.join(data_dir, f'historical_data_2024{quarter}')

    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        print(f"Processing folder: {folder_path}")

        # Load origination data without initial date parsing
        origination_file = os.path.join(folder_path, f'origination_2024{quarter}_with_columns.csv')
        if os.path.exists(origination_file):
            origination_df = pd.read_csv(origination_file, dtype=origination_dtypes, encoding='utf-8')
            # Debug and post-process date columns
            for col in origination_date_cols:
                if col in origination_df.columns:
                    print(f"Raw '{col}' unique values: {origination_df[col].dropna().unique()}")
                    origination_df[col] = origination_df[col].apply(parse_ymd_from_last6)
                    print(f"{col} parsed for {quarter}. Unique values: {origination_df[col].dropna().unique()}")
                else:
                    print(f"Warning: '{col}' not found in {origination_file}")
        else:
            print(f"Origination file not found: {origination_file}")
            # Empty frame so the merge below is skipped for this quarter.
            origination_df = pd.DataFrame()

        # Load performance data with columns, specified dtypes, and parse dates, disabling low_memory
        performance_file = os.path.join(folder_path, f'performance_2024{quarter}_with_columns.csv')
        if os.path.exists(performance_file):
            performance_df = pd.read_csv(performance_file, dtype=performance_dtypes, low_memory=False)
            # Post-process 'Monthly Reporting Period' with custom parser
            if 'Monthly Reporting Period' in performance_df.columns:
                print(f"Raw 'Monthly Reporting Period' unique values: {performance_df['Monthly Reporting Period'].dropna().unique()}")
                performance_df['Monthly Reporting Period'] = performance_df['Monthly Reporting Period'].apply(parse_ymd_from_last6)
                print(f"Monthly Reporting Period parsed for {quarter}. Unique values: {performance_df['Monthly Reporting Period'].dropna().unique()}")
            else:
                print(f"Warning: 'Monthly Reporting Period' not found in {performance_file}")
        else:
            print(f"Performance file not found: {performance_file}")
            performance_df = pd.DataFrame()

        # Merge origination and performance data on 'Loan Sequence Number' if both exist
        if not origination_df.empty and not performance_df.empty:
            # Inner join: each performance row is paired with its loan's
            # origination attributes; loans present in only one file are dropped.
            merged_df = pd.merge(origination_df, performance_df, on='Loan Sequence Number', how='inner', suffixes=('_orig', '_perf'))
            # Select only the relevant columns
            selected_df = merged_df[selected_columns]
            quarter_frames.append(selected_df)
        else:
            print(f"No data to merge for {quarter}")
    else:
        print(f"Folder not found: {folder_path}")

# Single concatenation; an empty frame is kept as the result when no quarter
# produced any data, matching what later cells expect.
combined_df = pd.concat(quarter_frames, ignore_index=True) if quarter_frames else pd.DataFrame()

Processing folder: /Users/dr/Documents/GitHub/MBS_RiskManagement/data/historical_data_2024Q1
Raw 'First Payment Date' unique values: ['1970-01-01 00:00:00.000202403' '1970-01-01 00:00:00.000202405'
 '1970-01-01 00:00:00.000202402' '1970-01-01 00:00:00.000202404'
 '1970-01-01 00:00:00.000202406' '1970-01-01 00:00:00.000202408'
 '1970-01-01 00:00:00.000202407' '1970-01-01 00:00:00.000202501'
 '1970-01-01 00:00:00.000202409' '1970-01-01 00:00:00.000202410'
 '1970-01-01 00:00:00.000202411' '1970-01-01 00:00:00.000202412'
 '1970-01-01 00:00:00.000202502' '1970-01-01 00:00:00.000202503'
 '1970-01-01 00:00:00.000202504']
First Payment Date parsed for Q1. Unique values: <DatetimeArray>
['2024-03-01 00:00:00', '2024-05-01 00:00:00', '2024-02-01 00:00:00',
 '2024-04-01 00:00:00', '2024-06-01 00:00:00', '2024-08-01 00:00:00',
 '2024-07-01 00:00:00', '2025-01-01 00:00:00', '2024-09-01 00:00:00',
 '2024-10-01 00:00:00', '2024-11-01 00:00:00', '2024-12-01 00:00:00',
 '2025-02-01 00:00:00', '2025-03-

In [54]:
# Derive the binary target from 'Current Loan Delinquency Status' if the column exists
if 'Current Loan Delinquency Status' in combined_df.columns:
    raw_status = combined_df['Current Loan Delinquency Status']

    # Letter status codes cannot be converted to a number. Previously they were
    # coerced to NaN and then filled with 0, which labelled those loans as
    # current. Flag them explicitly before the numeric conversion.
    # NOTE(review): the code set follows this notebook's own notes (RA, RF);
    # confirm it against the dataset's user guide.
    distressed_codes = {'RA', 'RF'}
    coded_default = raw_status.astype(str).str.strip().str.upper().isin(distressed_codes)

    # On a re-run the status column is already numeric and the letter codes are
    # gone, so carry over the flags set by the previous run.
    if pd.api.types.is_numeric_dtype(raw_status) and 'Default' in combined_df.columns:
        coded_default = coded_default | combined_df['Default'].eq(1)

    # Convert to numeric where possible, keeping non-numeric as NaN
    numeric_status = pd.to_numeric(raw_status, errors='coerce')
    combined_df['Current Loan Delinquency Status'] = numeric_status

    # Missing or otherwise unparseable statuses are still treated as current.
    # NOTE(review): a numeric "unknown" sentinel (999 in the notes) is > 0 and
    # is therefore counted as delinquent, as before - confirm this is intended.
    combined_df['Default'] = np.where((numeric_status.fillna(0) > 0) | coded_default, 1, 0)
    print("Binary target 'Default' added.")
else:
    print("Target column 'Current Loan Delinquency Status' not found in the data.")

Binary target 'Default' added.


In [55]:
# Preview the first rows of the merged data, including the derived 'Default' column.
combined_df.head()

Unnamed: 0,Loan Sequence Number,Credit Score,Original Combined Loan-to-Value (CLTV),Original Loan-to-Value (LTV),Original Debt-to-Income (DTI) Ratio,Original Interest Rate,Original Loan Term,Number of Borrowers,Property State,Occupancy Status,Loan Age,Remaining Months to Legal Maturity,Current Actual UPB,Current Interest Rate,Current Loan Delinquency Status,First Payment Date,Maturity Date,Default
0,F24Q10000001,747,67,67,26,8.0,360,1,KS,P,0,360,82000.0,8.0,0.0,2024-03-01,2054-02-01,0
1,F24Q10000001,747,67,67,26,8.0,360,1,KS,P,1,359,82000.0,8.0,0.0,2024-03-01,2054-02-01,0
2,F24Q10000001,747,67,67,26,8.0,360,1,KS,P,2,358,82000.0,8.0,0.0,2024-03-01,2054-02-01,0
3,F24Q10000001,747,67,67,26,8.0,360,1,KS,P,3,357,82000.0,8.0,0.0,2024-03-01,2054-02-01,0
4,F24Q10000001,747,67,67,26,8.0,360,1,KS,P,4,356,82000.0,8.0,0.0,2024-03-01,2054-02-01,0


In [57]:
# Write the modelling dataset to the repository root (row index omitted)
output_path = os.path.join('/Users/dr/Documents/GitHub/MBS_RiskManagement/', 'logistic_regression_data.csv')
combined_df.to_csv(output_path, index=False)
print("Combined dataframe saved as 'logistic_regression_data.csv'")

Combined dataframe saved as 'logistic_regression_data.csv'
