## Efficient conversion of quarterly loan-performance text files to Parquet

In [2]:
import pyarrow
import pandas as pd
import polars as pl
import numpy as np
import sweetviz as sv
from collections import Counter
import numpy as np

In [3]:
# Positional header for the pipe-delimited performance files, which are read with
# header=None. The names are assigned to the columns by position, so the ORDER matters.
# The order is assumed to follow the standard Freddie Mac file format.
# NOTE(review): the saved sample output in this notebook shows CURRENT_MONTH_REPURCHASE_FLAG
# holding the same numbers as CURRENT_ACTUAL_UPB, so the last two labels are probably wrong
# (Freddie Mac's published layout appears to end with "Current Month Modification Cost" and
# "Interest Bearing UPB" -- TODO confirm against the user guide before renaming, because
# the conversion functions select CURRENT_MONTH_REPURCHASE_FLAG by name).
column_names = [
    "LOAN_SEQUENCE_NUMBER",                # 1
    "MONTHLY_REPORTING_PERIOD",            # 2
    "CURRENT_ACTUAL_UPB",                  # 3
    "CURRENT_LOAN_DELINQUENCY_STATUS",     # 4
    "LOAN_AGE",                            # 5
    "REMAINING_MONTHS",                    # 6
    "REPURCHASE_FLAG",                     # 7
    "MODIFICATION_FLAG",                   # 8
    "ZERO_BALANCE_CODE",                   # 9
    "ZERO_BALANCE_EFFECTIVE_DATE",         # 10
    "CURRENT_INTEREST_RATE",               # 11
    "CURRENT_DEFERRED_UPB",                # 12
    "DUE_DATE_OF_LAST_PAID_INSTALLMENT",   # 13
    "MI_RECOVERIES",                       # 14
    "NET_SALES_PROCEEDS",                  # 15
    "NON_MI_RECOVERIES",                   # 16
    "EXPENSES",                            # 17
    "LEGAL_COSTS",                         # 18
    "MAINTENANCE_AND_PRESERVATION_COSTS",  # 19
    "TAXES_AND_INSURANCE",                 # 20
    "MISCELLANEOUS_EXPENSES",              # 21
    "ACTUAL_LOSS_CALCULATION",             # 22
    "MODIFICATION_COST",                   # 23
    "STEP_MODIFICATION_FLAG",              # 24
    "DEFERRED_PAYMENT_MODIFICATION",       # 25
    "ESTIMATED_LOAN_TO_VALUE",             # 26
    "ZERO_BALANCE_REMOVAL_UPB",            # 27
    "DELINQUENT_ACCRUED_INTEREST",         # 28
    "DELINQUENCY_DUE_TO_DISASTER",         # 29
    "BORROWER_ASSISTANCE_STATUS_CODE",     # 30
    "CURRENT_MONTH_LIQUIDATION_FLAG",      # 31
    "CURRENT_MONTH_REPURCHASE_FLAG",       # 32
]

In [4]:
# Development switch read by the conversion functions below: when True they load only the
# first 10,000 rows of each input file (nrows=10000); set to False to convert whole files.
debug = True

In [5]:
from datetime import datetime
def get_now():
    """Print the current local time as HH:MM:SS.

    Used as a lightweight progress timestamp around slow steps. Returns None.
    """
    print(datetime.now().strftime("%H:%M:%S"))

In [19]:
def convert_parquet_final(
    datapath='/Users/juliusgruber/Downloads/historical_data_2006/historical_data_time_2006Q1.txt',
    outputpath="data/hist_data_time_2006_Q1.parquet",
):
    """Convert one pipe-delimited performance file into a labelled Parquet file.

    Steps:
      1. Read ``datapath`` (no header row) and name the columns from ``column_names``.
         When the module-level ``debug`` flag is truthy only the first 10,000 rows are read.
      2. Keep the rows whose CURRENT_INTEREST_RATE is present and whose REMAINING_MONTHS
         equals 360, restricted to a small set of feature columns.
      3. Add TARGET: 1.0 if the loan has any record in the file with ZERO_BALANCE_CODE == 1
         (treated as "prepaid" by this notebook), otherwise 0.0.
      4. Add COUNT: the number of records the loan has in the file as read.
      5. Write the result to ``outputpath`` as Parquet.

    Parameters
    ----------
    datapath : str
        Path of the pipe-delimited input text file.
    outputpath : str
        Path of the Parquet file to write. The parent directory must already exist.

    Returns
    -------
    None
        The result is only written to disk. Timestamps are printed before and after the
        read via ``get_now()``.
    """
    get_now()
    # nrows=None reads the whole file; debug mode caps the read for quick iteration.
    df = pd.read_csv(datapath, sep='|', header=None, nrows=10000 if debug else None, low_memory=False)
    get_now()
    df.columns = column_names

    loan_ids = df["LOAN_SEQUENCE_NUMBER"]

    # Both lookups are computed on the FULL frame, before any row filtering, because the
    # zero-balance record and most monthly records do not survive the filter below.
    # unique() keeps the label a pure membership test: a loan with several code-1 records
    # must not multiply its output rows (which a left merge on a duplicated key would do).
    prepaid_ids = loan_ids[df["ZERO_BALANCE_CODE"] == 1].unique()
    records_per_loan = loan_ids.value_counts()

    relevant_features = ["LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD", "CURRENT_ACTUAL_UPB",
                         "ZERO_BALANCE_CODE", "CURRENT_INTEREST_RATE", "CURRENT_MONTH_REPURCHASE_FLAG"]
    keep_row = df["CURRENT_INTEREST_RATE"].notna() & (df["REMAINING_MONTHS"] == 360)
    final_df = df.loc[keep_row, relevant_features].reset_index(drop=True)

    kept_ids = final_df["LOAN_SEQUENCE_NUMBER"]
    # TARGET and COUNT are both attached to the same frame. The previous version built the
    # COUNT merge from final_df and thereby silently dropped the TARGET column.
    # No extra dropna on CURRENT_INTEREST_RATE is needed: keep_row already requires it.
    merged_df = final_df.assign(
        TARGET=kept_ids.isin(prepaid_ids).astype(float),
        COUNT=kept_ids.map(records_per_loan),
    )

    merged_df.to_parquet(outputpath)

In [20]:
%%time

# Convert every quarterly file from 2006Q1 through 2022Q4.
# range() excludes its end value, so an end of 2023 makes 2022 the last year processed.
# NOTE(review): if the 2023 files should be converted too, raise the end to 2024.
for year in range(2006, 2023):
    for quarter in range(1, 5):  # Q1 to Q4
        print(year, quarter)

        # Construct the input and output paths for this quarter
        file_path = f'/Users/juliusgruber/Downloads/historical_data_{year}/historical_data_time_{year}Q{quarter}.txt'
        output_path = f'data/hist_data_time_{year}Q{quarter}.parquet'
        try:
            convert_parquet_final(file_path, output_path)
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next quarter.
            # The try block covers reading, transforming AND writing, so the message says
            # "convert" and includes the exception type to show which step failed.
            print(f"Could not convert {file_path}: {type(e).__name__}: {e}")

2006 1
15:50:32
15:50:32
2006 2
15:50:32
15:50:32
2006 3
15:50:32
15:50:32
2006 4
15:50:32
15:50:32
2007 1
15:50:32
15:50:33
2007 2
15:50:33
15:50:33
2007 3
15:50:33
15:50:33
2007 4
15:50:33
15:50:33
2008 1
15:50:33
15:50:33
2008 2
15:50:33
15:50:33
2008 3
15:50:33
15:50:33
2008 4
15:50:33
15:50:33
2009 1
15:50:33
15:50:33
2009 2
15:50:33
15:50:33
2009 3
15:50:33
15:50:33
2009 4
15:50:33
15:50:33
2010 1
15:50:33
15:50:33
2010 2
15:50:33
15:50:33
2010 3
15:50:33
15:50:33
2010 4
15:50:33
15:50:33
2011 1
15:50:33
15:50:33
2011 2
15:50:33
15:50:33
2011 3
15:50:33
15:50:33
2011 4
15:50:33
15:50:33
2012 1
15:50:33
15:50:33
2012 2
15:50:33
15:50:33
2012 3
15:50:33
15:50:33
2012 4
15:50:33
15:50:33
2013 1
15:50:33
15:50:33
2013 2
15:50:33
15:50:33
2013 3
15:50:33
15:50:33
2013 4
15:50:33
15:50:33
2014 1
15:50:33
15:50:33
2014 2
15:50:33
15:50:33
2014 3
15:50:33
15:50:33
2014 4
15:50:33
15:50:33
2015 1
15:50:33
15:50:33
2015 2
15:50:33
15:50:33
2015 3
15:50:33
15:50:33
2015 4
15:50:33
15:50:33


In [16]:
def convert_parquet_final_printable(
    datapath='/Users/juliusgruber/Downloads/historical_data_2006/historical_data_time_2006Q1.txt',
    outputpath="data/hist_data_time_2006_Q1.parquet",
):
    """Convert one performance file to Parquet, printing and returning the result.

    Same pipeline as ``convert_parquet_final``, but the labelled frame is printed before
    the COUNT column is added, and the final frame is returned for inspection.

    Steps:
      1. Read ``datapath`` (pipe-delimited, no header row) and name the columns from
         ``column_names``. When the module-level ``debug`` flag is truthy only the first
         10,000 rows are read.
      2. Keep the rows whose CURRENT_INTEREST_RATE is present and whose REMAINING_MONTHS
         equals 360, restricted to a small set of feature columns.
      3. Add TARGET: 1.0 if the loan has any record in the file with ZERO_BALANCE_CODE == 1
         (treated as "prepaid" by this notebook), otherwise 0.0. Print the frame.
      4. Add COUNT: the number of records the loan has in the file as read.
      5. Write the result to ``outputpath`` as Parquet.

    Parameters
    ----------
    datapath : str
        Path of the pipe-delimited input text file.
    outputpath : str
        Path of the Parquet file to write. The parent directory must already exist.

    Returns
    -------
    pandas.DataFrame
        The frame that was written: the feature columns plus TARGET and COUNT.
    """
    get_now()
    # nrows=None reads the whole file; debug mode caps the read for quick iteration.
    df = pd.read_csv(datapath, sep='|', header=None, nrows=10000 if debug else None, low_memory=False)
    get_now()
    df.columns = column_names

    loan_ids = df["LOAN_SEQUENCE_NUMBER"]

    # Both lookups are computed on the FULL frame, before any row filtering.
    # unique() keeps the label a pure membership test, so a loan with several code-1
    # records cannot multiply its output rows.
    prepaid_ids = loan_ids[df["ZERO_BALANCE_CODE"] == 1].unique()
    records_per_loan = loan_ids.value_counts()

    relevant_features = ["LOAN_SEQUENCE_NUMBER", "MONTHLY_REPORTING_PERIOD", "CURRENT_ACTUAL_UPB",
                         "ZERO_BALANCE_CODE", "CURRENT_INTEREST_RATE", "CURRENT_MONTH_REPURCHASE_FLAG"]
    keep_row = df["CURRENT_INTEREST_RATE"].notna() & (df["REMAINING_MONTHS"] == 360)
    final_df = df.loc[keep_row, relevant_features].reset_index(drop=True)

    kept_ids = final_df["LOAN_SEQUENCE_NUMBER"]
    # Loans without a code-1 record get an explicit 0.0. The previous version called
    # fillna({'indicator': 0}) on a column named TARGET, which left those labels as NaN.
    merged_df = final_df.assign(TARGET=kept_ids.isin(prepaid_ids).astype(float))
    print(merged_df)

    # Number of records per loan in the file as read.
    # No extra dropna on CURRENT_INTEREST_RATE is needed: keep_row already requires it.
    merged_df = merged_df.assign(COUNT=kept_ids.map(records_per_loan))

    merged_df.to_parquet(outputpath)
    return merged_df

In [21]:
# Run the conversion once with its default input/output paths and keep the returned frame.
df =convert_parquet_final_printable()

15:50:36
15:50:36
    LOAN_SEQUENCE_NUMBER  MONTHLY_REPORTING_PERIOD  CURRENT_ACTUAL_UPB  \
0           F06Q10000001                    200603            130000.0   
1           F06Q10000002                    200603            214000.0   
2           F06Q10000003                    200603             81000.0   
3           F06Q10000005                    200604            296000.0   
4           F06Q10000006                    200602            255000.0   
..                   ...                       ...                 ...   
105         F06Q10000155                    200602            158000.0   
106         F06Q10000156                    200602            176000.0   
107         F06Q10000157                    200603             93000.0   
108         F06Q10000158                    200602            100000.0   
109         F06Q10000160                    200602             72000.0   

     ZERO_BALANCE_CODE  CURRENT_INTEREST_RATE  CURRENT_MONTH_REPURCHASE_FLAG  \
0            

In [22]:
# Display the frame returned by convert_parquet_final_printable().
df

Unnamed: 0,LOAN_SEQUENCE_NUMBER,MONTHLY_REPORTING_PERIOD,CURRENT_ACTUAL_UPB,ZERO_BALANCE_CODE,CURRENT_INTEREST_RATE,CURRENT_MONTH_REPURCHASE_FLAG,indicator,COUNT
0,F06Q10000001,200603,130000.0,,6.500,130000.0,1.0,39
1,F06Q10000002,200603,214000.0,,6.250,214000.0,1.0,36
2,F06Q10000003,200603,81000.0,,6.375,81000.0,1.0,75
3,F06Q10000005,200604,296000.0,,6.250,296000.0,1.0,52
4,F06Q10000006,200602,255000.0,,6.250,255000.0,1.0,87
...,...,...,...,...,...,...,...,...
105,F06Q10000155,200602,158000.0,,6.500,158000.0,1.0,28
106,F06Q10000156,200602,176000.0,,6.250,176000.0,1.0,84
107,F06Q10000157,200603,93000.0,,6.500,93000.0,0.0,211
108,F06Q10000158,200602,100000.0,,5.375,100000.0,1.0,47
