## Reading data

In [33]:
from pyspark.sql.types import *
import pandas as pd
pd.set_option('display.max_columns', 100)

acquisition_schema = StructType([
    StructField("loan_identifier", StringType(), True),
    StructField("channel", StringType(), True),
    StructField("seller_name", StringType(), True),
    StructField("original_interest_rate", DoubleType(), True),
    StructField("original_upb", DoubleType(), True),
    StructField("origin_loan_term", DoubleType(), True),
    StructField("original_date", StringType(), True),
    StructField("first_payment_date", StringType(), True),
    StructField("original_loan_to_value", DoubleType(), True),
    StructField("original_combined_loan_to_value", DoubleType(), True),
    StructField("number_of_borrowers", DoubleType(), True),
    StructField("debt_to_income_ratio", DoubleType(), True),
    StructField("borrower_credit_score", DoubleType(), True),
    StructField("first_time_homebuyer", StringType(), True),
    StructField("loan_purpose", StringType(), True),
    StructField("property_type", StringType(), True),
    StructField("number_of_units", StringType(), True),
    StructField("occupancy_status", StringType(), True),
    StructField("property_state", StringType(), True),
    StructField("zip", StringType(), True),
    StructField("insurance_percentage", DoubleType(), True),
    StructField("product_type", StringType(), True),
    StructField("co_borrower_credit_score", DoubleType(), True),
    StructField("insurance_type", DoubleType(), True),
    StructField("relocation_mortrage", StringType(), True)
])

acquisition = spark.read.csv(
    './data/Acquisition_2000Q1.txt',
    schema=acquisition_schema,
    sep='|')

In [34]:
performance_schema = StructType([
    StructField("loan_idetifier", DoubleType(), True),
    StructField("monthly_reporting_period", StringType(), True),
    StructField("servicer_name", StringType(), True),
    StructField("current_interst_rate", DoubleType(), True),
    StructField("current_upb", DoubleType(), True),
    StructField("loan_age", DoubleType(), True),
    StructField("remaining_month_to_maturity", DoubleType(), True),
    StructField("adjusted_remaining_month_to_maturity", DoubleType(), True),
    StructField("maturity_date", StringType(), True),
    StructField("metropolitan", StringType(), True),
    StructField("current_loan_delinquency_status", StringType(), True),
    StructField("modification_flag", StringType(), True),
    StructField("zero_balance_code", StringType(), True),
    StructField("zero_balance_effective_date", StringType(), True),
    StructField("last_paid_installment_date", StringType(), True),
    StructField("foreclosure_date", StringType(), True),
    StructField("disposition_date", StringType(), True),
    StructField("foreclosure_cost", DoubleType(), True),
    StructField("property_repair_cost", DoubleType(), True),
    StructField("asset_recovery_cost", DoubleType(), True),
    StructField("miscellaneous_expenses", DoubleType(), True),
    StructField("taxes_for_holding_property", DoubleType(), True),
    StructField("net_sale_proceeds", DoubleType(), True),
    StructField("credit_enchancement_proceeds", DoubleType(), True),
    StructField("repurchuse_make_whole_proceeds", DoubleType(), True),
    StructField("other_foreclosure_proceed", DoubleType(), True),
    StructField("non_interest_bearing_upb", DoubleType(), True),
    StructField("principle_forgiveness", DoubleType(), True),
    StructField("repurchase_make_whole_proceed", StringType(), True),
    StructField("foreclosure_principal_write_off_amount", DoubleType(), True),
    StructField("servicing_activity_indicator", StringType(), True)
])

performance = spark.read.csv('./data/Performance_2000Q1.txt', schema=performance_schema, sep='|')

In [35]:
acquisition.limit(5).toPandas()

Unnamed: 0,loan_identifier,channel,seller_name,original_interest_rate,original_upb,origin_loan_term,original_date,first_payment_date,original_loan_to_value,original_combined_loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,first_time_homebuyer,loan_purpose,property_type,number_of_units,occupancy_status,property_state,zip,insurance_percentage,product_type,co_borrower_credit_score,insurance_type,relocation_mortrage
0,100007365142,R,"JPMORGAN CHASE BANK, NA",8.0,75000.0,360.0,12/1999,02/2000,79.0,,1.0,62.0,763.0,N,R,SF,1,P,PA,173,,FRM,,,N
1,100011322040,C,AMTRUST BANK,7.75,123000.0,360.0,11/1999,01/2000,80.0,,1.0,28.0,750.0,N,P,SF,1,P,MO,630,,FRM,,,N
2,100015192562,R,OTHER,8.5,51000.0,360.0,02/2000,04/2000,95.0,,1.0,27.0,686.0,N,P,SF,1,P,GA,316,25.0,FRM,,1.0,N
3,100015874399,C,"CITIMORTGAGE, INC.",8.75,242000.0,360.0,02/2000,04/2000,95.0,,1.0,47.0,706.0,N,P,SF,1,P,FL,335,30.0,FRM,,1.0,N
4,100017922445,C,AMTRUST BANK,8.25,240000.0,360.0,12/1999,02/2000,77.0,,2.0,19.0,737.0,N,P,SF,1,P,MI,483,,FRM,731.0,,N


In [36]:
performance.limit(5).toPandas()

Unnamed: 0,loan_idetifier,monthly_reporting_period,servicer_name,current_interst_rate,current_upb,loan_age,remaining_month_to_maturity,adjusted_remaining_month_to_maturity,maturity_date,metropolitan,current_loan_delinquency_status,modification_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosure_date,disposition_date,foreclosure_cost,property_repair_cost,asset_recovery_cost,miscellaneous_expenses,taxes_for_holding_property,net_sale_proceeds,credit_enchancement_proceeds,repurchuse_make_whole_proceeds,other_foreclosure_proceed,non_interest_bearing_upb,principle_forgiveness,repurchase_make_whole_proceed,foreclosure_principal_write_off_amount,servicing_activity_indicator
0,100007400000.0,01/01/2000,,8.0,,0.0,360.0,359.0,01/2030,0,0,N,,,,,,,,,,,,,,,,,,,
1,100007400000.0,02/01/2000,,8.0,,1.0,359.0,358.0,01/2030,0,0,N,,,,,,,,,,,,,,,,,,,
2,100007400000.0,03/01/2000,,8.0,,2.0,358.0,357.0,01/2030,0,0,N,,,,,,,,,,,,,,,,,,,
3,100007400000.0,04/01/2000,,8.0,,3.0,357.0,356.0,01/2030,0,0,N,,,,,,,,,,,,,,,,,,,
4,100007400000.0,05/01/2000,,8.0,,4.0,356.0,355.0,01/2030,0,0,N,,,,,,,,,,,,,,,,,,,


## Calculating the zero balance code binary class