## Efficient conversion of Freddie Mac historical data files to Parquet

In [1]:
import pyarrow
import pandas as pd
import polars as pl
import numpy as np
import sweetviz as sv
from collections import Counter
import numpy as np

In [2]:
# Labels applied positionally to the header-less, pipe-delimited input files.
# The order is assumed to follow the standard Freddie Mac file format.
# NOTE(review): in this notebook's saved output the last column
# ("CURRENT_MONTH_REPURCHASE_FLAG") holds values equal to CURRENT_ACTUAL_UPB,
# so the final label(s) may not match the real layout -- verify against the
# official file layout before relying on them.
column_names = [
    "LOAN_SEQUENCE_NUMBER",
    "MONTHLY_REPORTING_PERIOD",
    "CURRENT_ACTUAL_UPB",
    "CURRENT_LOAN_DELINQUENCY_STATUS",
    "LOAN_AGE",
    "REMAINING_MONTHS",
    "REPURCHASE_FLAG",
    "MODIFICATION_FLAG",
    "ZERO_BALANCE_CODE",
    "ZERO_BALANCE_EFFECTIVE_DATE",
    "CURRENT_INTEREST_RATE",
    "CURRENT_DEFERRED_UPB",
    "DUE_DATE_OF_LAST_PAID_INSTALLMENT",
    "MI_RECOVERIES",
    "NET_SALES_PROCEEDS",
    "NON_MI_RECOVERIES",
    "EXPENSES",
    "LEGAL_COSTS",
    "MAINTENANCE_AND_PRESERVATION_COSTS",
    "TAXES_AND_INSURANCE",
    "MISCELLANEOUS_EXPENSES",
    "ACTUAL_LOSS_CALCULATION",
    "MODIFICATION_COST",
    "STEP_MODIFICATION_FLAG",
    "DEFERRED_PAYMENT_MODIFICATION",
    "ESTIMATED_LOAN_TO_VALUE",
    "ZERO_BALANCE_REMOVAL_UPB",
    "DELINQUENT_ACCRUED_INTEREST",
    "DELINQUENCY_DUE_TO_DISASTER",
    "BORROWER_ASSISTANCE_STATUS_CODE",
    "CURRENT_MONTH_LIQUIDATION_FLAG",
    "CURRENT_MONTH_REPURCHASE_FLAG",
]

In [22]:
# Debug switch read by the converter functions below: when True they load only
# the first 10000 rows of each input file instead of the whole file.
debug = False

In [4]:
from datetime import datetime
def get_now():
    """Print the current local time formatted as HH:MM:SS.

    Used as a lightweight progress timestamp; nothing is returned.
    """
    print(datetime.now().strftime("%H:%M:%S"))

In [23]:
def convert_parquet_final(datapath='/Users/juliusgruber/Downloads/historical_data_2006/historical_data_time_2006Q1.txt',
    outputpath="data/hist_data_time_2006_Q1.parquet"):
    """Reduce one pipe-delimited historical data file to a per-loan parquet table.

    The file is read without a header and labelled with the module-level
    ``column_names``. Only rows with a non-null ``CURRENT_INTEREST_RATE`` and
    ``REMAINING_MONTHS == 360`` are kept, restricted to a handful of columns,
    and two derived columns are appended:

    * ``TARGET`` -- 1.0 if the loan has at least one row anywhere in the file
      with ``ZERO_BALANCE_CODE == 1`` (treated by this notebook as prepaid),
      otherwise 0.0. Always stored as float64 so every output file shares
      the same schema.
    * ``COUNT``  -- number of rows the loan has in the whole input file.

    Timestamps are printed via ``get_now()`` before and after the CSV load.

    Parameters
    ----------
    datapath : str
        Path of the ``|``-separated input text file.
    outputpath : str
        Destination parquet file; its directory must already exist.

    Returns
    -------
    None
        The result is only written to ``outputpath``.

    Raises
    ------
    ValueError
        If the file does not have exactly ``len(column_names)`` columns.
    """
    get_now()
    # nrows=None reads the whole file; debug mode only samples the first rows.
    df = pd.read_csv(datapath, sep='|', header=None,
                     nrows=10000 if debug else None, low_memory=False)
    get_now()
    df.columns = column_names

    loan_ids = df['LOAN_SEQUENCE_NUMBER']
    # Distinct loans that ever report zero-balance code 1. Using a membership
    # test instead of a merge guarantees a loan with several such rows cannot
    # multiply the output rows.
    prepaid_ids = loan_ids[df['ZERO_BALANCE_CODE'] == 1].unique()
    # Rows per loan across the *unfiltered* file (vectorised replacement for a
    # Python-level Counter over every row).
    rows_per_loan = loan_ids.value_counts()

    relevant_features = ["LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD", "CURRENT_ACTUAL_UPB",
                         "ZERO_BALANCE_CODE", "CURRENT_INTEREST_RATE", "CURRENT_MONTH_REPURCHASE_FLAG"]
    keep = df['CURRENT_INTEREST_RATE'].notna() & (df['REMAINING_MONTHS'] == 360)

    merged_df = (
        df.loc[keep, relevant_features]
        .reset_index(drop=True)
        .assign(
            TARGET=lambda t: t['LOAN_SEQUENCE_NUMBER'].isin(prepaid_ids).astype('float64'),
            COUNT=lambda t: t['LOAN_SEQUENCE_NUMBER'].map(rows_per_loan),
        )
    )

    merged_df.to_parquet(outputpath)

In [24]:
%%time

# Convert every quarterly file from 2006 Q1 through 2022 Q4, writing one parquet file per quarter.
# Iterate over each year and quarter
for year in range(2006, 2023):  # range end is exclusive, so the last year processed is 2022
    for quarter in range(1, 5):  # From Q1 to Q4
        print(year,quarter)
        
        # Construct the file path
        file_path = f'/Users/juliusgruber/Downloads/historical_data_{year}/historical_data_time_{year}Q{quarter}.txt'
        output_path = f'data/hist_data_time_{year}Q{quarter}.parquet'
        try:
            convert_parquet_final(file_path,output_path)
        except Exception as e:
            # Best-effort: any failure inside the conversion (reading, processing or
            # writing -- not only reading) lands here; the message is printed and the
            # loop continues with the next quarter.
            print(f"Could not read {file_path}: {e}")

2006 1
21:07:51
21:09:06
2006 2
21:09:07
21:10:13
2006 3
21:10:14
21:10:35
2006 4
21:10:36
21:11:33
2007 1
21:11:34
21:12:08
2007 2
21:12:09
21:13:27
2007 3
21:13:28
21:13:46
2007 4
21:13:48
21:14:07
2008 1
21:14:08
21:15:31
2008 2
21:15:33
21:16:01
2008 3
21:16:03
21:16:12
2008 4
21:16:13
21:16:22
2009 1
21:16:23
21:18:33
2009 2
21:18:36
21:22:02
2009 3
21:22:08
21:23:50
2009 4
21:23:52
21:25:35
2010 1
21:25:38
21:26:50
2010 2
21:26:52
21:27:49
2010 3
21:27:51
21:29:57
2010 4
21:30:00
21:33:12
2011 1
21:33:16
21:34:30
2011 2
21:34:32
21:34:50
2011 3
21:34:52
21:36:15
2011 4
21:36:17
21:38:18
2012 1
21:38:21
21:40:24
2012 2
21:40:26
21:42:47
2012 3
21:42:50
21:46:47
2012 4
21:46:53
21:51:41
2013 1
21:51:47
21:57:01
2013 2
21:57:08
22:00:51
2013 3
22:00:57
22:02:37
2013 4
22:02:39
22:03:02
2014 1
22:03:03
22:03:18
2014 2
22:03:19
22:03:39
2014 3
22:03:40
22:04:04
2014 4
22:04:05
22:04:27
2015 1
22:04:28
22:05:49
2015 2
22:05:50
22:07:27
2015 3
22:07:29
22:08:04
2015 4
22:08:06
22:08:29


In [8]:
def convert_parquet_final_printable(datapath='/Users/juliusgruber/Downloads/historical_data_2006/historical_data_time_2006Q1.txt',
    outputpath="data/hist_data_time_2006_Q1.parquet"):
    """Inspection variant of the converter: prints an intermediate frame and returns the result.

    The file is read without a header and labelled with the module-level
    ``column_names``. Only rows with a non-null ``CURRENT_INTEREST_RATE`` and
    ``REMAINING_MONTHS == 360`` are kept, restricted to a handful of columns,
    and two derived columns are appended:

    * ``TARGET`` -- 1.0 if the loan has at least one row anywhere in the file
      with ``ZERO_BALANCE_CODE == 1`` (treated by this notebook as prepaid),
      otherwise 0.0. Always stored as float64.
    * ``COUNT``  -- number of rows the loan has in the whole input file.

    The frame is printed once ``TARGET`` has been added (before ``COUNT``),
    then the finished table is written to ``outputpath`` and returned.
    Timestamps are printed via ``get_now()`` before and after the CSV load.

    Parameters
    ----------
    datapath : str
        Path of the ``|``-separated input text file.
    outputpath : str
        Destination parquet file; its directory must already exist.

    Returns
    -------
    pandas.DataFrame
        The table that was written to ``outputpath``.

    Raises
    ------
    ValueError
        If the file does not have exactly ``len(column_names)`` columns.
    """
    get_now()
    # nrows=None reads the whole file; debug mode only samples the first rows.
    df = pd.read_csv(datapath, sep='|', header=None,
                     nrows=10000 if debug else None, low_memory=False)
    get_now()
    df.columns = column_names

    loan_ids = df['LOAN_SEQUENCE_NUMBER']
    # Distinct loans that ever report zero-balance code 1. A membership test
    # (rather than a merge) cannot duplicate rows when a loan has several
    # such records.
    prepaid_ids = loan_ids[df['ZERO_BALANCE_CODE'] == 1].unique()
    # Rows per loan across the *unfiltered* file.
    rows_per_loan = loan_ids.value_counts()

    relevant_features = ["LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD", "CURRENT_ACTUAL_UPB",
                         "ZERO_BALANCE_CODE", "CURRENT_INTEREST_RATE", "CURRENT_MONTH_REPURCHASE_FLAG"]
    keep = df['CURRENT_INTEREST_RATE'].notna() & (df['REMAINING_MONTHS'] == 360)

    merged_df = df.loc[keep, relevant_features].reset_index(drop=True)
    merged_df['TARGET'] = merged_df['LOAN_SEQUENCE_NUMBER'].isin(prepaid_ids).astype('float64')
    # Intermediate view for inspection: features plus TARGET, before COUNT is attached.
    print(merged_df)

    merged_df['COUNT'] = merged_df['LOAN_SEQUENCE_NUMBER'].map(rows_per_loan)

    merged_df.to_parquet(outputpath)
    return merged_df

In [9]:
# Run the inspection variant once with its default paths and keep the returned frame.
# NOTE(review): the default output path is data/hist_data_time_2006_Q1.parquet, which
# differs from the data/hist_data_time_{year}Q{quarter}.parquet names written by the
# batch loop -- confirm the extra file is intended.
df =convert_parquet_final_printable()

21:05:05
21:05:05
    LOAN_SEQUENCE_NUMBER  MONTHLY_REPORTING_PERIOD  CURRENT_ACTUAL_UPB  \
0           F06Q10000001                    200603            130000.0   
1           F06Q10000002                    200603            214000.0   
2           F06Q10000003                    200603             81000.0   
3           F06Q10000005                    200604            296000.0   
4           F06Q10000006                    200602            255000.0   
..                   ...                       ...                 ...   
105         F06Q10000155                    200602            158000.0   
106         F06Q10000156                    200602            176000.0   
107         F06Q10000157                    200603             93000.0   
108         F06Q10000158                    200602            100000.0   
109         F06Q10000160                    200602             72000.0   

     ZERO_BALANCE_CODE  CURRENT_INTEREST_RATE  CURRENT_MONTH_REPURCHASE_FLAG  \
0            

In [20]:
# Load a parquet file named the way the batch loop names its outputs (2022 Q1).
# This rebinds `df`, replacing the frame returned by the earlier converter call.
df = pd.read_parquet('data/hist_data_time_2022Q1.parquet')

In [21]:
# Display the loaded frame as the cell output.
# NOTE(review): in the saved output CURRENT_MONTH_REPURCHASE_FLAG equals
# CURRENT_ACTUAL_UPB on every visible row, which suggests the column labels
# are misaligned with the file layout -- verify.
df

Unnamed: 0,LOAN_SEQUENCE_NUMBER,MONTHLY_REPORTING_PERIOD,CURRENT_ACTUAL_UPB,ZERO_BALANCE_CODE,CURRENT_INTEREST_RATE,CURRENT_MONTH_REPURCHASE_FLAG,TARGET,COUNT
0,F22Q10000001,202202,625000.0,,3.375,625000.0,0.0,20
1,F22Q10000004,202202,216000.0,,3.250,216000.0,0.0,20
2,F22Q10000005,202202,150000.0,,2.875,150000.0,0.0,20
3,F22Q10000006,202202,400000.0,,3.125,400000.0,0.0,20
4,F22Q10000008,202202,59000.0,,2.875,59000.0,0.0,20
...,...,...,...,...,...,...,...,...
333,F22Q10000525,202202,300000.0,,3.250,300000.0,0.0,20
334,F22Q10000526,202202,288000.0,,4.500,288000.0,0.0,20
335,F22Q10000527,202202,153000.0,,3.250,153000.0,0.0,20
336,F22Q10000528,202202,60000.0,,3.625,60000.0,0.0,20
