In [1]:
import zipfile
import pandas as pd
import itertools
import os.path
from progress_bar import ProgressBar
import numpy as np
from distance_correlation.distcorr import distcorr_mem_efficient as distcorr

In [2]:
#################### CONSTANTS
# The headers aren't in the file, so the column names are supplied here, in file order.
HEADER = (
    "LOAN_ID", "Monthly.Rpt.Prd", "Servicer.Name", "LAST_RT", "LAST_UPB",
    "Loan.Age", "Months.To.Legal.Mat", "Adj.Month.To.Mat", "Maturity.Date", "MSA",
    "Delq.Status", "MOD_FLAG", "Zero.Bal.Code", "ZB_DTE", "LPI_DTE",
    "FCC_DTE", "DISP_DT", "FCC_COST", "PP_COST", "AR_COST",
    "IE_COST", "TAX_COST", "NS_PROCS", "CE_PROCS", "RMW_PROCS",
    "O_PROCS", "NON_INT_UPB", "PRIN_FORG_UPB", "REPCH_FLAG",
)
# One dtype per HEADER entry, in the same order (TYPES[i] is the dtype of HEADER[i]).
TYPES = (
    np.str_, np.str_, np.str_, np.float64, np.float64,
    np.float64, np.float64, np.float64, np.str_, np.str_,
    np.str_, np.str_, np.str_, np.str_, np.str_,
    np.str_, np.str_, np.float64, np.float64, np.float64,
    np.float64, np.float64, np.float64, np.float64, np.float64,
    np.float64, np.float64, np.float64, np.str_,
)
FIELDS_OF_INTEREST = ["LOAN_ID", "Monthly.Rpt.Prd", "Loan.Age", "Months.To.Legal.Mat", "Delq.Status", "MSA"]
# Zero-based HEADER positions of the date-like columns:
#   1 Monthly.Rpt.Prd, 8 Maturity.Date, 13 ZB_DTE, 14 LPI_DTE, 15 FCC_DTE, 16 DISP_DT.
# Index 17 (FCC_COST, a float cost column) used to be listed here by mistake.
DATE_COLUMNS = [1, 8, 13, 14, 15, 16]
# Column name -> dtype mapping, passed to pandas.read_csv via its `dtype` argument.
CONVERTERS = dict(zip(HEADER, TYPES))

SEP = "|" # The csv divider

DATA_DIRECTORY = "data"
DATA_FILE_TEMPLATE = "Performance_{}Q{}.txt"
ZIP_FILE_TEMPLATE = "{}Q{}.zip"

# Every (year, quarter) pair from 2000 Q1 up to and including 2015 Q3.
QUARTERS = list(itertools.product(range(2000,2016), range(1,5)))[:-1] #2015 Q4 doesn't exist

In [3]:
def read_zipped_data(year, quarter):
    """Read one quarter of performance data straight out of its zip archive.

    The archive path is DATA_DIRECTORY joined with ZIP_FILE_TEMPLATE, and the
    member read from it is named by DATA_FILE_TEMPLATE; both templates are
    filled in with `year` and `quarter`.

    Parameters
    ----------
    year, quarter
        Values substituted into the file-name templates (the notebook passes
        the integer pairs stored in QUARTERS).

    Returns
    -------
    pandas.DataFrame
        The parsed file, with columns named by HEADER and dtypes taken from
        CONVERTERS.
    """
    archive_path = os.path.join(DATA_DIRECTORY, ZIP_FILE_TEMPLATE.format(year, quarter))
    member_name = DATA_FILE_TEMPLATE.format(year, quarter)
    # Manage the ZipFile itself as a context manager too; otherwise only the
    # member handle is closed and the archive's file handle leaks on every call.
    with zipfile.ZipFile(archive_path) as archive, archive.open(member_name, 'r') as f:
        # Date parsing stays disabled, as before:
        #   parse_dates=DATE_COLUMNS, infer_datetime_format=True
        data = pd.read_csv(f, sep=SEP, names=HEADER, dtype=CONVERTERS)
    return data

In [None]:
def pairwise(series):
    """Return an iterator over every unordered pair of items in `series`.

    Each element yielded is a 2-tuple of ``(label, value)`` items, i.e.
    ``((label_a, value_a), (label_b, value_b))``, produced in the order
    itertools.combinations generates them. Fewer than two items yields nothing.
    """
    # Series.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
    # .items() yields the same (label, value) tuples.
    return itertools.combinations(series.items(), 2)

def delq_dc_pairwise(data):
    """Distance correlation between the two sides of every pair of values in `data`.

    Walks all pairs produced by pairwise(data), collects the value of the
    first item of each pair into one list and the value of the second item
    into another, and returns distcorr of those two lists.
    """
    first_values, second_values = [], []
    for (_, left_value), (_, right_value) in pairwise(data):
        first_values.append(left_value)
        second_values.append(right_value)
    return distcorr(first_values, second_values)

In [5]:
# We want to perform a distance correlation, but what should the (X, Y) samples be?
# Should X be the set of observations in some MSA
#   at time T, with Y being T?
# Should X be a single observation over all of its available times, with Y some other observation in the same MSA?
# Should an (x, y) observation be a pair of loans at a single time T?
#   Would it make sense to aggregate those over all T in a particular MSA? This actually gives a new time
#   series D_t of distance correlations over time, from which we can take meaningful averages and the like.
#   This takes a huge amount of time and space, though: the pairwise step is n**2, and then DC is n**2 as well.
#   It may be feasible for a small MSA and short time frames, though.
#

In [12]:
# Load the first quarter in QUARTERS, keeping only the three columns used below.
one_quarter_data = read_zipped_data(*QUARTERS[0])[["MSA","Monthly.Rpt.Prd","Delq.Status"]]
# Drop rows whose status is the literal 'X', which pd.to_numeric cannot parse.
# The explicit .copy() makes the filtered frame independent, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
one_quarter_data = one_quarter_data[~one_quarter_data['Delq.Status'].astype(str).isin(['X'])].copy()
one_quarter_data['Delq.Status'] = pd.to_numeric(one_quarter_data['Delq.Status'])

In [13]:
# One group per (MSA, reporting period) combination.
grouped = one_quarter_data.groupby(["MSA", "Monthly.Rpt.Prd"])

In [14]:
# Apply delq_dc_pairwise to each (MSA, reporting period) group.
# The recorded run of this cell ended in a KeyboardInterrupt.
dc_time_series_by_MSA = grouped.agg(delq_dc_pairwise)

0       0.0
591     0.0
2473    0.0
2489    0.0
3380    0.0
Name: Delq.Status, dtype: float64



KeyboardInterrupt: 

In [None]:
# BUILD a big dataframe... this is too slow and too hungry
# The loop below is kept inside a string literal, so none of it is executed.
# NOTE(review): it grows the frame with DataFrame.append once per quarter;
# DataFrame.append was removed in pandas 2.0. If this is ever revived, collect
# the per-quarter frames in a list and call pd.concat once instead.
'''
data = pd.DataFrame()
i=0
pbar = ProgressBar(len(QUARTERS))
for year, quarter in QUARTERS:
    pbar.animate(i)
    data = data.append(read_zipped_data(year, quarter), ignore_index=True)
    i += 1
'''