In [None]:
import pyarrow
import pandas as pd
import polars as pl
import numpy as np
import sweetviz as sv
from collections import Counter
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)

In [4]:
# Assuming the columns in the dataset are in the order as follows, based on standard Freddie Mac file format
column_names = [
    "LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD", "CURRENT_ACTUAL_UPB", "CURRENT_LOAN_DELINQUENCY_STATUS",
    "LOAN_AGE", "REMAINING_MONTHS", "REPURCHASE_FLAG", "MODIFICATION_FLAG", "ZERO_BALANCE_CODE",
    "ZERO_BALANCE_EFFECTIVE_DATE", "CURRENT_INTEREST_RATE", "CURRENT_DEFERRED_UPB", "DUE_DATE_OF_LAST_PAID_INSTALLMENT",
    "MI_RECOVERIES", "NET_SALES_PROCEEDS", "NON_MI_RECOVERIES", "EXPENSES", "LEGAL_COSTS", "MAINTENANCE_AND_PRESERVATION_COSTS",
    "TAXES_AND_INSURANCE", "MISCELLANEOUS_EXPENSES", "ACTUAL_LOSS_CALCULATION", "MODIFICATION_COST", "STEP_MODIFICATION_FLAG",
    "DEFERRED_PAYMENT_MODIFICATION", "ESTIMATED_LOAN_TO_VALUE", "ZERO_BALANCE_REMOVAL_UPB", "DELINQUENT_ACCRUED_INTEREST",
    "DELINQUENCY_DUE_TO_DISASTER", "BORROWER_ASSISTANCE_STATUS_CODE", "CURRENT_MONTH_LIQUIDATION_FLAG", "CURRENT_MONTH_REPURCHASE_FLAG"
]

In [16]:
datapath='/Users/juliusgruber/Downloads/historical_data_2006/historical_data_time_2006Q1.txt'
outputpath="data/hist_data_time_2006_Q1.parquet"

df = pd.read_csv(datapath, sep='|', header=None, low_memory=False)
df.columns = column_names

In [8]:
compressed_df = df[df['ZERO_BALANCE_CODE'] == 1]
prepayed_list = compressed_df["LOAN_SEQUENCE_NUMBER"].values

In [17]:
string_counts = Counter(df["LOAN_SEQUENCE_NUMBER"].values)
counts_df = pd.DataFrame(list(string_counts.items()), columns=['LOAN_SEQUENCE_NUMBER', 'COUNT'])

In [18]:
counts_df

Unnamed: 0,LOAN_SEQUENCE_NUMBER,COUNT
0,F06Q10000001,39
1,F06Q10000002,36
2,F06Q10000003,75
3,F06Q10000004,15
4,F06Q10000005,52
...,...,...
298574,F06Q10374571,16
298575,F06Q10374572,58
298576,F06Q10374574,55
298577,F06Q10374575,51


In [19]:
df_fixed_30yr = df[(df['CURRENT_INTEREST_RATE'].notna()) & (df['REMAINING_MONTHS'] == 360)]
    
relevant_features = ["LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD","CURRENT_ACTUAL_UPB","ZERO_BALANCE_CODE","CURRENT_INTEREST_RATE","CURRENT_MONTH_REPURCHASE_FLAG" ]
final_df = df_fixed_30yr[relevant_features].reset_index(drop=True)

list_df = pd.DataFrame(prepayed_list, columns=['LOAN_SEQUENCE_NUMBER'])
list_df['indicator'] = 1

merged_df = pd.merge(final_df, list_df, on='LOAN_SEQUENCE_NUMBER', how='left').fillna({'indicator': 0})

#find out how long until prepayment
string_counts = Counter(df["LOAN_SEQUENCE_NUMBER"].values)
counts_df = pd.DataFrame(list(string_counts.items()), columns=['LOAN_SEQUENCE_NUMBER', 'COUNT'])

merged_df = pd.merge(final_df, counts_df, on='LOAN_SEQUENCE_NUMBER', how='left')

merged_df = merged_df.dropna(subset=['CURRENT_INTEREST_RATE'])

merged_df.to_parquet(outputpath)  

In [20]:
merged_df

Unnamed: 0,LOAN_SEQUENCE_NUMBER,MONTHLY_REPORTING_PERIOD,CURRENT_ACTUAL_UPB,ZERO_BALANCE_CODE,CURRENT_INTEREST_RATE,CURRENT_MONTH_REPURCHASE_FLAG,COUNT
0,F06Q10000001,200603,130000.00,,6.500,130000.00,39
1,F06Q10000002,200603,214000.00,,6.250,214000.00,36
2,F06Q10000003,200603,81000.00,,6.375,81000.00,75
3,F06Q10000005,200604,296000.00,,6.250,296000.00,52
4,F06Q10000006,200602,255000.00,,6.250,255000.00,87
...,...,...,...,...,...,...,...
207178,F06Q10373684,201602,165694.96,,5.875,165694.96,133
207179,F06Q10373697,200905,150000.00,,5.000,150000.00,53
207180,F06Q10373781,200908,146000.00,,5.200,146000.00,135
207181,F06Q10373947,202210,101710.78,,3.375,96510.78,160


In [13]:

len(merged_df)

207183

In [11]:
df

Unnamed: 0,LOAN_SEQUENCE_NUMBER,MONTHLY_REPORTING_PERIOD,CURRENT_ACTUAL_UPB,CURRENT_LOAN_DELINQUENCY_STATUS,LOAN_AGE,REMAINING_MONTHS,REPURCHASE_FLAG,MODIFICATION_FLAG,ZERO_BALANCE_CODE,ZERO_BALANCE_EFFECTIVE_DATE,CURRENT_INTEREST_RATE,CURRENT_DEFERRED_UPB,DUE_DATE_OF_LAST_PAID_INSTALLMENT,MI_RECOVERIES,NET_SALES_PROCEEDS,NON_MI_RECOVERIES,EXPENSES,LEGAL_COSTS,MAINTENANCE_AND_PRESERVATION_COSTS,TAXES_AND_INSURANCE,MISCELLANEOUS_EXPENSES,ACTUAL_LOSS_CALCULATION,MODIFICATION_COST,STEP_MODIFICATION_FLAG,DEFERRED_PAYMENT_MODIFICATION,ESTIMATED_LOAN_TO_VALUE,ZERO_BALANCE_REMOVAL_UPB,DELINQUENT_ACCRUED_INTEREST,DELINQUENCY_DUE_TO_DISASTER,BORROWER_ASSISTANCE_STATUS_CODE,CURRENT_MONTH_LIQUIDATION_FLAG,CURRENT_MONTH_REPURCHASE_FLAG
0,F06Q10000001,200603,130000.00,0,0,360,,,,,6.5,0.0,,,,,,,,,,,,,,,,,,,,130000.00
1,F06Q10000001,200604,130000.00,0,1,359,,,,,6.5,0.0,,,,,,,,,,,,,,,,,,,,130000.00
2,F06Q10000001,200605,130000.00,0,2,358,,,,,6.5,0.0,,,,,,,,,,,,,,,,,,,,130000.00
3,F06Q10000001,200606,130000.00,0,3,357,,,,,6.5,0.0,,,,,,,,,,,,,,,,,,,,130000.00
4,F06Q10000001,200607,129000.00,0,4,356,,,,,6.5,0.0,,,,,,,,,,,,,,,,,,,,129000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20379139,F06Q10374577,202004,44394.76,1,53,191,,,,,4.0,0.0,,,,,,,,,,,,,,17.0,,,,,,44394.76
20379140,F06Q10374577,202005,44394.76,2,54,190,,,,,4.0,0.0,,,,,,,,,,,,,,16.0,,,Y,,,44394.76
20379141,F06Q10374577,202006,44394.76,3,55,189,,,,,4.0,0.0,,,,,,,,,,,,,,16.0,,,Y,,,44394.76
20379142,F06Q10374577,202007,44394.76,4,56,188,,,,,4.0,0.0,,,,,,,,,,,,,,16.0,,,Y,,,44394.76
