In [1]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf
import datetime

In [2]:
# Print the current UTC time in ISO 8601 format.
# Note: utcnow() returns a naive datetime, so the printed string carries no UTC offset.
print(datetime.datetime.utcnow().isoformat())

2019-11-23T03:48:55.403003


In [3]:
# Show the contents of start_dask.yaml (shell escape; expects the file in the
# kernel's working directory).
!cat start_dask.yaml

device_memory_limit_gib: 26.0
docker_image: claudiofahey/rapidsai:a359097c3c18a534b91557d5abe772c73ef57d11de3dfb632e1516b0a01745f1
host:
  - 10.200.11.12
  - 10.200.11.13
memory_limit_gib: 64.0


In [4]:
# Toggle: True creates a cluster from this notebook; False connects to an
# already-running scheduler in the next cell.
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    # Imported here so dask_cuda is only needed when the local cluster is used.
    from dask_cuda import LocalCUDACluster
    # ip='0.0.0.0' presumably makes the scheduler listen on all interfaces — TODO confirm.
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [5]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
if not use_local_cuda_cluster:
    # Hard-coded scheduler host:port; the host matches the first entry under
    # `host:` in the start_dask.yaml output shown earlier.
    scheduler_address = '10.200.11.12:8786'
    client = Client(address=scheduler_address)

In [6]:
# Display the client summary (scheduler address, dashboard link, worker totals).
client

0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 32  Cores: 160  Memory: 2.20 TB


In [7]:
import cudf

In [8]:
%%time
# Restart all workers. This also clears GPU memory.
# The call is the cell's last expression, so its return value (the client
# summary) is rendered below the timing.
client.restart()

CPU times: user 21.4 ms, sys: 336 µs, total: 21.7 ms
Wall time: 18.7 s


0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 32  Cores: 160  Memory: 2.20 TB


In [23]:
def gpu_load_performance_data(performance_path, **kwargs):
    """Load mortgage performance data from Parquet as a Dask-cuDF DataFrame.

    Column names are taken from the Parquet schema; no renaming is applied.

    Parameters
    ----------
    performance_path : str or list of str
        Path(s) handed to ``dask_cudf.read_parquet``.
    **kwargs
        Extra keyword arguments forwarded unchanged to
        ``dask_cudf.read_parquet`` (for example ``columns=[...]``).
        Previously these were accepted but silently ignored.

    Returns
    -------
    GPU DataFrame
        The ``dask_cudf`` DataFrame returned by ``dask_cudf.read_parquet``.
    """
    return dask_cudf.read_parquet(performance_path, **kwargs)

In [24]:
# Collect the performance files to load by expanding each glob pattern
# (relative to data_dir) in order.
data_dir = '/mnt/isilon1/data/mortgage'
perf_patterns = [
    '/perf-snappy.parquet/part.0.parquet',
    #'/perf/Performance_*',
    #'/perf/Performance_2016*.txt',
]
perf_file = [
    path
    for pattern in perf_patterns
    for path in glob.glob(data_dir + pattern)
]
# Number of files matched.
len(perf_file)

1

In [18]:
# Read the part file directly with cudf (no Dask) and display it.
# NOTE(review): in the rendered output the string/date columns (servicer,
# maturity_date, servicing_activity_indicator) look different from the
# dask_cudf head() output further down — confirm whether this is a display or
# a decoding problem.
cudf.read_parquet(data_dir + '/perf-snappy.parquet/part.0.parquet')

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,...,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100000040778,2017-01-01,,3.875,234856.62,6.0,354.0,352.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
1,100000040778,2017-02-01,,3.875,234470.9,7.0,353.0,351.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
2,100000040778,2017-03-01,,3.875,234083.94,8.0,352.0,350.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
3,100000040778,2017-04-01,310110673,3.875,233720.67,9.0,351.0,349.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,933513450
4,100000040778,2017-05-01,,3.875,233356.22,10.0,350.0,348.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
5,100000040778,2017-06-01,,3.875,232983.66,11.0,349.0,346.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
6,100000040778,2016-07-01,1240500859,3.875,,0.0,360.0,359.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
7,100000040778,2017-07-01,,3.875,232616.83,12.0,348.0,345.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
8,100000040778,2016-08-01,,3.875,,1.0,359.0,358.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200
9,100000040778,2017-08-01,,3.875,232116.94,13.0,347.0,344.0,1910-05-25 17:31:44,40140.0,...,,,,,,,,,,2313200


In [26]:
%%time
# Build the Dask-cuDF frame for a single part file.
# NOTE(review): the path is spelled out again here; the `perf_file` list built
# in the file-list cell is not used by any visible cell.
perf_ddf = gpu_load_performance_data(data_dir + '/perf-snappy.parquet/part.0.parquet')

CPU times: user 139 ms, sys: 0 ns, total: 139 ms
Wall time: 140 ms


In [27]:
# Display the frame's structure (column names, dtypes, partition count).
perf_ddf

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_delinquency_status,mod_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosed_after,disposition_date,foreclosure_costs,prop_preservation_and_repair_costs,asset_recovery_costs,misc_holding_expenses,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,int64,object,object,float64,float64,float64,float64,float64,object,float64,int32,object,object,object,object,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [28]:
# Preview the first rows of the frame.
perf_ddf.head()

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,...,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100000040778,01/01/2017,,3.875,234856.62,6.0,354.0,352.0,07/2046,40140.0,...,,,,,,,,,,N
1,100000040778,02/01/2017,,3.875,234470.9,7.0,353.0,351.0,07/2046,40140.0,...,,,,,,,,,,N
2,100000040778,03/01/2017,,3.875,234083.94,8.0,352.0,350.0,07/2046,40140.0,...,,,,,,,,,,N
3,100000040778,04/01/2017,NEW RESIDENTIAL MORTGAGE LLC,3.875,233720.67,9.0,351.0,349.0,07/2046,40140.0,...,,,,,,,,,,Y
4,100000040778,05/01/2017,,3.875,233356.22,10.0,350.0,348.0,07/2046,40140.0,...,,,,,,,,,,N


In [30]:
%%time
# Read from files into GPU memory.
# persist() submits the work to the cluster and returns a frame backed by the
# in-progress/finished results; rebinding perf_ddf makes later cells use it.
perf_ddf = perf_ddf.persist()
# Block until the persisted partitions are done so %%time covers the load.
wait(perf_ddf)

CPU times: user 72.5 ms, sys: 6.39 ms, total: 78.9 ms
Wall time: 76.4 ms


In [31]:
#perf_ddf.dask

In [32]:
%%time
# Time head() again, now that perf_ddf has been persisted.
perf_ddf.head()

CPU times: user 111 ms, sys: 3.38 ms, total: 115 ms
Wall time: 229 ms


Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,...,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,100000040778,01/01/2017,,3.875,234856.62,6.0,354.0,352.0,07/2046,40140.0,...,,,,,,,,,,N
1,100000040778,02/01/2017,,3.875,234470.9,7.0,353.0,351.0,07/2046,40140.0,...,,,,,,,,,,N
2,100000040778,03/01/2017,,3.875,234083.94,8.0,352.0,350.0,07/2046,40140.0,...,,,,,,,,,,N
3,100000040778,04/01/2017,NEW RESIDENTIAL MORTGAGE LLC,3.875,233720.67,9.0,351.0,349.0,07/2046,40140.0,...,,,,,,,,,,Y
4,100000040778,05/01/2017,,3.875,233356.22,10.0,350.0,348.0,07/2046,40140.0,...,,,,,,,,,,N


In [33]:
%%time
# Total number of rows in the frame.
len(perf_ddf)

CPU times: user 6.03 ms, sys: 1.47 ms, total: 7.5 ms
Wall time: 15.8 ms


508101

In [36]:
%%time
# Maximum interest_rate per servicer. compute() materialises the whole result;
# head(2) then limits the display to two rows.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 97.3 ms, sys: 14.8 ms, total: 112 ms
Wall time: 146 ms


PROVIDENT FUNDING ASSOCIATES, L.P.    4.375
FREEDOM MORTGAGE CORP.                5.000
Name: interest_rate, dtype: float64

In [37]:
%%time
# Same per-servicer max(interest_rate) query as the previous cell; presumably
# repeated to check that the timing is stable.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 101 ms, sys: 1.9 ms, total: 103 ms
Wall time: 136 ms


PROVIDENT FUNDING ASSOCIATES, L.P.    4.375
FREEDOM MORTGAGE CORP.                5.000
Name: interest_rate, dtype: float64

In [40]:
%%time
# Same per-servicer max(interest_rate) query again; presumably a further
# repeat for timing comparison.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 104 ms, sys: 37 µs, total: 104 ms
Wall time: 138 ms


PROVIDENT FUNDING ASSOCIATES, L.P.    4.375
FREEDOM MORTGAGE CORP.                5.000
Name: interest_rate, dtype: float64

In [41]:
# Sum of loan_id over every row, used as a simple checksum of the loaded data.
checksum = perf_ddf['loan_id'].sum().compute()
checksum

62018971513503530

In [42]:
# compare to checksum from mortgage_etl_4.ipynb.
# A result of 0 means the checksums match.
# NOTE(review): the recorded result is non-zero; presumably because only
# part.0.parquet is loaded here while the reference covers more data — confirm.
checksum - 6573355020803881490

-6511336049290377960

In [22]:
# Print the current UTC time in ISO 8601 format (naive datetime, no offset in the string).
print(datetime.datetime.utcnow().isoformat())

2019-11-23T01:37:40.137584


In [23]:
# Show GPU status via the shell; this reports the machine the notebook kernel
# runs on, not necessarily every worker host.
!nvidia-smi

Sat Nov 23 01:37:40 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:34:00.0 Off |                    0 |
| N/A   36C    P0    70W / 350W |  13836MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   35C    P0    67W / 350W |  13712MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM3...  On   | 00000000:39:00.0 Off |                    0 |
| N/A   