# Notebook for Extracting CoreLogic Data
Emily Philippides

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import gc 
import time

In [None]:
pd.__version__

In [None]:
import platform
print(platform.architecture())

In [None]:
%cd E:\Thesis

In [None]:
print('STARTING')

# Helper Functions

In [None]:
# Code execution timer
# Code taken from: https://www.codespeedy.com/how-to-create-a-stopwatch-in-python/
def time_convert(sec):
    """Format an elapsed duration, given in seconds, as a stopwatch string.

    Args:
        sec: Elapsed time in seconds (int or float).

    Returns:
        A string of the form ``"Time Lapsed = H:M:S"``. Each component is
        truncated to an integer and is not zero-padded.
    """
    total_minutes, seconds = divmod(sec, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "Time Lapsed = {}:{}:{}".format(int(hours), int(minutes), int(seconds))

start_time = time.time()

In [None]:
# Properly encode 'missing' values as NaN
def edit_data(df, cols):
    """Recode the 'missing' markers of known coded columns as NaN.

    Only columns that appear in ``cols`` AND in one of the two lists below
    are touched. ``df`` is modified in place and also returned. Progress is
    printed, ending with the time elapsed since the module-level
    ``start_time``.

    Args:
        df: DataFrame to clean; must contain every selected column.
        cols: Iterable of candidate column names (duplicates are ignored).

    Returns:
        The same DataFrame object, with the markers replaced by NaN.
    """
    print('Editing...')

    # A caller may list a column twice; process each one only once.
    requested = list(dict.fromkeys(cols))

    # Convert categorical variables 'missing' values to NaN
    cat = ['mba_delinquency_status',  # X = Unknown/Missing LPI date
           'orig_active_status',
           'current_investor_code',  # U = No Info
           'current_product_type',  # NULL = No Info
           'mba_worst_ever',  # U = No Info
           'property_type',  # U = No Info
           'occupancy_type',  # U = No Info
           'product_type',  # U = No Info
           'loan_type',  # U = No Info
           'loan_purpose',  # U = No Info
           'payment_frequency',  # U = No Info
           'channel',  # U = No Info
           'documentation_type',  # U = No Info
           'gse_eligible_flag']  # U = No Info

    for col in (c for c in requested if c in cat):
        print('-{}'.format(col))
        # Only the first marker present is recoded, checked in the order
        # U, X, NULL (same precedence as the previous if/elif chain).
        # NOTE(review): a column holding both 'U' and 'X' keeps its 'X'
        # values - confirm that this is intended.
        for marker in ('U', 'X', 'NULL'):
            is_marker = df[col] == marker
            if is_marker.any():
                df.loc[is_marker, col] = np.nan
                break

    # Convert binary variables 'missing' values to NaN
    dummies = ['prepay_penalty_flag',
               'collateral_type',
               'product_type_category',
               'loan_purpose_category',
               'mortgage_insurance_flag',
               'active_status',  # no NA
               'bk_flag',  # no NA
               'buydown_flag',
               'convertible_flag',  # no NA
               'pool_insurance_flag',
               'negative_amortization_flag',
               'io_flag',
               'paid_off_flag',
               'inferred_collateral_type']

    for col in (c for c in requested if c in dummies):
        print('-{}'.format(col))
        is_unknown = df[col] == 'U'
        if is_unknown.any():
            df.loc[is_unknown, col] = np.nan

        # Ensure binary variables are indeed binary. NaN is excluded from the
        # count (it is the missing marker, not a third category), and values
        # keep their original dtype so the comparison below really matches.
        observed = df[col].dropna().unique().tolist()
        if len(observed) > 2:
            # Keep the first two values encountered; null every other one.
            extras = observed[2:]
            print("ERROR!!!!!! More than 2 unique values")
            print(observed, '| Removing...', extras)
            df.loc[df[col].isin(extras), col] = np.nan

    print('Done editing. {} \n'.format(time_convert(time.time() - start_time)))
    return df

In [None]:
# Count number of performance instances for each unique loan
def add_loan_counts(df):
    """Attach to every row the number of rows sharing its ``loan_id``.

    The input frame is left untouched: the counts come from
    ``groupby(...).size()`` rather than from a temporary helper column
    written into the caller's DataFrame.

    Args:
        df: DataFrame with a ``loan_id`` column.

    Returns:
        A new DataFrame with the columns of ``df`` followed by ``counts``.
        Rows whose ``loan_id`` is NaN are absent from the result, because
        groupby skips NaN keys and the merge is an inner join.
    """
    counts_df = df.groupby(by=['loan_id']).size().reset_index(name='counts')
    df = df.merge(counts_df, on='loan_id', how='inner', validate='many_to_one')
    print('Done merging \n')
    return df

In [None]:
# Get range of values of each column in specified dataframe
def get_ranges(df):
    """Print the min/max of every numeric column and report the negative ones.

    Args:
        df: DataFrame to inspect; non-numeric columns are ignored.

    Returns:
        List of the numeric column names whose minimum is below zero, in
        column order.
    """
    numeric = df.select_dtypes(include=np.number)
    negative_columns = []
    for name in numeric.columns:
        lowest = numeric[name].min()
        highest = numeric[name].max()
        print('{} range of values: [{}, {}]'.format(name, lowest, highest))
        if lowest < 0:
            negative_columns.append(name)
    return negative_columns

In [None]:
# Remove rows with strange (e.g. negative) values in specified columns
def clean_rows(df, neg_cols):
    """Drop the rows holding a negative value in any of the given columns.

    For each column the offending rows are shown and then removed. Only
    strictly negative values are removed: rows whose value is zero or NaN are
    kept (the previous ``> 0`` filter silently discarded those as well, even
    though only the negative rows were shown).

    Args:
        df: DataFrame to filter; it is not modified.
        neg_cols: Names of the columns to check.

    Returns:
        The filtered DataFrame (``df`` itself when ``neg_cols`` is empty).
    """
    try:
        show = display  # rich renderer provided by IPython/Jupyter
    except NameError:
        show = print  # plain-Python fallback when run outside a notebook

    for col in neg_cols:
        print(col)
        is_negative = df[col] < 0
        show(df[is_negative])
        df = df[~is_negative]

    show(df)
    return df

In [None]:
# Count the number of null values in each column and each row
def count_null(df, thresh):
    """Print a summary of how many NaN values the rows and columns hold.

    Reports the largest NaN count found in any row / column, and how many
    rows / columns have a NaN share strictly above ``thresh``.

    Args:
        df: DataFrame to summarise.
        thresh: Fraction (e.g. 0.5) used as the strict cut-off; it is printed
            as a whole-number percentage.

    Returns:
        None.
    """
    print('***{}***'.format(get_df_name(df)))
    n_rows, n_cols = df.shape
    print('Columns = {}. Rows = {}.'.format(n_cols, n_rows))
    pct = str(int(thresh*100))

    is_missing = df.isnull()

    # Row-wise: NaN count per row, then the share of columns it represents
    nan_per_row = is_missing.sum(axis=1).values
    rows_over = int((nan_per_row/n_cols > thresh).sum())
    print('There are no rows with more than {} NaN columns. {} rows have more than {}% NaN columns.'.format(nan_per_row.max(), rows_over, pct))

    # Column-wise: NaN count per column, then the share of rows it represents
    nan_per_col = is_missing.sum(axis=0).values
    cols_over = int((nan_per_col/n_rows > thresh).sum())
    print('There are no columns with more than {} NaN rows. {} columns have more than {}% NaN rows. \n'.format(nan_per_col.max(), cols_over, pct))

    return None

In [None]:
# Remove columns with > x NaN values in the rows
def drop_columns(df, thresh):
    """Drop the columns whose NaN count exceeds a share of the row count.

    A column is removed when its number of NaN values is strictly greater
    than ``len(df) * thresh``. The shape is printed before and after.

    Args:
        df: DataFrame to prune; it is not modified.
        thresh: Fraction of rows (e.g. 0.75) used as the strict cut-off.

    Returns:
        A new DataFrame without the sparse columns.
    """
    print('***{}***'.format(get_df_name(df)))
    print('Shape before: {}'.format(df.shape))
    max_missing = len(df) * thresh
    nan_counts = df.isna().sum()
    too_sparse = df.columns[nan_counts.values > max_missing]
    df = df.drop(columns=too_sparse)
    print('Shape after: {} \n'.format(df.shape))
    return df

# Origination Data

In [None]:
def extract_origination_data(filename):
    """Read the selected origination columns and recode their missing markers.

    The file is read as pipe-delimited text, restricted to the columns listed
    below, and then passed through ``edit_data``. Progress and the time
    elapsed since the module-level ``start_time`` are printed.

    Args:
        filename: Path of the pipe-delimited origination file.

    Returns:
        DataFrame holding the selected columns, as returned by ``edit_data``.
    """
    print('Extracting {}'.format(filename))

    # Each column is listed once: 'payment_frequency' used to appear twice,
    # which made edit_data process it a second time.
    cols = ['loan_id', 'origination_date', 'property_zip', 'state', 'property_type',
            'number_of_units', 'occupancy_type', 'original_balance',
            'sale_price', 'appraised_value', 'product_type', 'original_term',
            'initial_interest_rate', 'back_end_ratio', 'loan_type',
            'loan_purpose', 'payment_frequency', 'channel', 'buydown_flag',
            'documentation_type', 'convertible_flag', 'pool_insurance_flag', 'original_ltv',
            'negative_amortization_flag', 'margin', 'periodic_rate_cap',
            'periodic_rate_floor', 'lifetime_rate_cap', 'lifetime_rate_floor',
            'rate_reset_frequency', 'pay_reset_frequency', 'first_rate_reset_period',
            'fico_score_at_origination', 'prepay_penalty_flag', 'prepay_penalty_term',
            'combined_ltv_at_origination', 'cbsa', 'io_term', 'io_flag',
            'msa', 'paid_off_flag', 'inferred_collateral_type', 'collateral_type',
            'orig_active_status', 'period', 'product_type_category', 'loan_purpose_category',
            'mortgage_insurance_flag', 'gse_eligible_flag']

    origination_data = pd.read_csv(filename, sep='|', low_memory=False, usecols=cols)

    print('Done extracting. {}'.format(time_convert(time.time() - start_time)))

    origination_data = edit_data(origination_data, cols)

    return origination_data

In [None]:
# Load origination records for all mortgages originated 2008-2010
# (pipe-delimited file, path relative to the current working directory; the
# selected columns get their 'missing' markers recoded to NaN by edit_data)
inactive_origination_2008_2010 = extract_origination_data('Inactive_Origination_Firsts_2008_2010.txt')

# Load origination records for all mortgages originated December 2013 - July 2020
active_origination_202007 = extract_origination_data('Recent_Origination_Firsts_202007.txt')

In [None]:
# Merge dataframes
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; with
# default arguments it stacks the rows the same way (original index labels kept).
all_origination = pd.concat([inactive_origination_2008_2010, active_origination_202007])
print('\nDone appending origination data. {}'.format(time_convert(time.time() - start_time)))
# Release the two source frames now that their rows live in all_origination
del inactive_origination_2008_2010
del active_origination_202007
gc.collect()

In [None]:
# Save dataframe
# all_origination.to_csv('ORIGINATION.csv')
# print(time_convert(time.time() - start_time))

# Full Data
Load performance data and merge with origination data

In [None]:
def extract_performance_data(year, all_origination):
    """Load one year of performance records and join them to origination data.

    Reads ``Performance_Firsts_<year>/Performance_Firsts_<year>.txt`` (pipe
    delimited, in chunks of 20,000,000 rows), tags every row with the year,
    inner-joins the result with ``all_origination`` on ``loan_id`` and recodes
    the 'missing' markers of the performance columns through ``edit_data``.

    Args:
        year: Year of the performance file, as a string or an int.
        all_origination: Origination DataFrame with one row per ``loan_id``
            (enforced by ``validate='many_to_one'``).

    Returns:
        The merged DataFrame, as returned by ``edit_data``.

    Raises:
        ValueError: If the performance file yields no data chunk.
    """
    year = str(year)  # accept 2019 as well as '2019'

    print('{} performance data'.format(year))

    cols = ['loan_id', 'current_balance', 'current_interest_rate',
            'total_payment_due', 'scheduled_principal', 'scheduled_monthly_pi',
            'mba_delinquency_status', 'mba_days_delinquent', 'active_status',
            'period_of_payment', 'current_investor_code', 'current_product_type',
            'loan_age', 'mba_worst_ever', 'bk_flag']

    name = 'Performance_Firsts_' + year
    path = name + '/' + name + '.txt'

    # Collect the chunks and concatenate once: concatenating inside the loop
    # re-copies every row accumulated so far on each iteration.
    chunks = list(pd.read_csv(path, sep='|', usecols=cols, low_memory=False, chunksize=20000000))
    if not chunks:
        raise ValueError('No data could be read from {}'.format(path))
    performance_data = chunks[0] if len(chunks) == 1 else pd.concat(chunks)
    del chunks

    performance_data['year'] = int(year)

    print('Done extracting. {}'.format(time_convert(time.time() - start_time)))

    merged_df = performance_data.merge(all_origination, on='loan_id', how='inner', validate='many_to_one')

    print('Done merging. {}'.format(time_convert(time.time() - start_time)))

    # Free the unmerged copy before edit_data runs over the merged frame
    del performance_data
    gc.collect()

    merged_df = edit_data(merged_df, cols)

    return merged_df

In [None]:
# Load performance data for all active mortgages between 2008 and 2019
# Each call reads one year's performance file and inner-joins it with
# all_origination on loan_id. The twelve resulting frames are all alive at
# once until they are appended together and deleted in the following cell.
data_2019 = extract_performance_data('2019', all_origination)
data_2018 = extract_performance_data('2018', all_origination)
data_2017 = extract_performance_data('2017', all_origination)
data_2016 = extract_performance_data('2016', all_origination)
data_2015 = extract_performance_data('2015', all_origination)
data_2014 = extract_performance_data('2014', all_origination)
data_2013 = extract_performance_data('2013', all_origination)
data_2012 = extract_performance_data('2012', all_origination)
data_2011 = extract_performance_data('2011', all_origination)
data_2010 = extract_performance_data('2010', all_origination)
data_2009 = extract_performance_data('2009', all_origination)
data_2008 = extract_performance_data('2008', all_origination)

In [None]:
# Append all yearly data into one dataframe
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. The
# frames are still stacked one year at a time, newest first, and each yearly
# frame is deleted as soon as its rows have been copied into merged_df.
merged_df = pd.concat([data_2019, data_2018])
print('Done appending 18-19 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2019
del data_2018
gc.collect()

merged_df = pd.concat([merged_df, data_2017])
print('Done appending 17 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2017
gc.collect()

merged_df = pd.concat([merged_df, data_2016])
print('Done appending 16 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2016
gc.collect()

merged_df = pd.concat([merged_df, data_2015])
print('Done appending 15 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2015
gc.collect()

merged_df = pd.concat([merged_df, data_2014])
print('Done appending 14 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2014
gc.collect()

merged_df = pd.concat([merged_df, data_2013])
print('Done appending 13 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2013
gc.collect()

merged_df = pd.concat([merged_df, data_2012])
print('Done appending 12 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2012
gc.collect()

merged_df = pd.concat([merged_df, data_2011])
print('Done appending 11 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2011
gc.collect()

merged_df = pd.concat([merged_df, data_2010])
print('Done appending 10 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2010
gc.collect()

merged_df = pd.concat([merged_df, data_2009])
print('Done appending 09 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2009
gc.collect()

merged_df = pd.concat([merged_df, data_2008])
print('Done appending 08 data. {} \n'.format(time_convert(time.time() - start_time)))
del data_2008
gc.collect()

In [None]:
# Remove columns with > 75% NaN values in the rows
# (drop_columns uses a strict comparison: a column at exactly 75% NaN is kept)
merged_df = drop_columns(merged_df, 0.75)

In [None]:
# Save dataframe
# merged_df.to_csv('MERGED.csv')
# print(time_convert(time.time() - start_time))

# Save Smaller Chunks
By state and origination date

In [None]:
%cd F:\Thesis

In [None]:
# Get all unique origination dates (formatted as YYYYMM)
# NOTE(review): astype(int) raises if the column still holds NaN - confirm
# that no missing origination dates remain at this point.
merged_df['origination_date'] = merged_df['origination_date'].astype(int)
unique_dates = merged_df['origination_date'].unique()
# Keep only the codes above 200800 (presumably January 2008 onward, given the
# YYYYMM format - verify against the data)
unique_dates = unique_dates[unique_dates > 200800]
print(unique_dates)

In [None]:
# Split and save dataframe by state, origination date
# The state list and the per-date subset are computed once per date instead of
# once per (date, state) pair, so the full frame is scanned len(unique_dates)
# times rather than len(unique_dates) * number-of-states times. The files
# written are unchanged, including the header-only files for combinations that
# have no rows.
states = merged_df['state'].unique()
for date in unique_dates:
    rows_for_date = merged_df[merged_df['origination_date'] == date]
    for state in states:
        file_name = 'MERGED_{}_{}.csv'.format(state, date)
        rows_for_date[rows_for_date['state'] == state].to_csv(file_name)

In [None]:
print('DONE')