In [1]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf
import datetime

In [2]:
print(datetime.datetime.utcnow().isoformat())

2019-11-23T01:36:37.828785


In [3]:
!cat start_dask.yaml

device_memory_limit_gib: 26.0
docker_image: claudiofahey/rapidsai:0.10-cuda10.0-runtime-ubuntu18.04-custom
host:
  - 10.200.11.12
  - 10.200.11.13
memory_limit_gib: 64.0


In [4]:
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [5]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
if not use_local_cuda_cluster:
    scheduler_address = '10.200.11.12:8786'
    client = Client(address=scheduler_address)

In [6]:
client

0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 32  Cores: 160  Memory: 2.20 TB


In [7]:
import cudf

In [8]:
%%time
# Restart all workers. This also clears GPU memory.
client.restart()

distributed.client - ERROR - Restart timed out after 20.000000 seconds


CPU times: user 21.7 ms, sys: 0 ns, total: 21.7 ms
Wall time: 20 s


0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [9]:
def gpu_load_performance_data(performance_path, **kwargs):
    """ Loads performance data

    Returns
    -------
    GPU DataFrame
    """
    
    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate", "current_actual_upb",
        "loan_age", "remaining_months_to_legal_maturity", "adj_remaining_months_to_maturity",
        "maturity_date", "msa", "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date", "foreclosed_after",
        "disposition_date", "foreclosure_costs", "prop_preservation_and_repair_costs",
        "asset_recovery_costs", "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds", "other_foreclosure_proceeds",
        "non_interest_bearing_upb", "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator"
    ]

    ddf = dask_cudf.read_orc(performance_path)
    # Fix column names from ORC file
    ddf = ddf.rename(columns=dict(zip(ddf.columns, cols)))
    return ddf

In [10]:
# Identify list of files to load.
data_dir = '/mnt/isilon1/data/mortgage'
perf_file = []
perf_file += glob.glob(data_dir + '/perf-snappy.orc/*')
#perf_file += glob.glob(data_dir + '/perf/Performance_*')
#perf_file += glob.glob(data_dir + '/perf/Performance_2016*.txt')
len(perf_file)

96

In [11]:
%%time
perf_ddf = gpu_load_performance_data(perf_file)

CPU times: user 1.26 s, sys: 1.32 s, total: 2.58 s
Wall time: 9.55 s


In [12]:
perf_ddf

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_delinquency_status,mod_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosed_after,disposition_date,foreclosure_costs,prop_preservation_and_repair_costs,asset_recovery_costs,misc_holding_expenses,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,int64,datetime64[ns],int32,float64,float64,float64,float64,float64,datetime64[ns],float64,int32,int32,int32,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,float64,int32
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [13]:
%%time
# Read from files into GPU memory.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

CPU times: user 638 ms, sys: 20.3 ms, total: 658 ms
Wall time: 25.7 s


In [14]:
#perf_ddf.dask

In [15]:
%%time
perf_ddf.head()

CPU times: user 90 ms, sys: 2.06 ms, total: 92.1 ms
Wall time: 914 ms


Unnamed: 0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,...,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
0,709011185621,2002-05-01,,7.0,,2.0,358.0,358.0,2032-03-01,12060.0,...,,,,,,,,,,
1,709011185621,2003-05-01,,7.0,91594.85,14.0,346.0,346.0,2032-03-01,12060.0,...,,,,,,,,,,
2,709011185621,2004-05-01,,7.0,90573.25,26.0,334.0,334.0,2032-03-01,12060.0,...,,,,,,,,,,
3,709011185621,2005-05-01,,7.0,89477.78,38.0,322.0,322.0,2032-03-01,12060.0,...,,,,,,,,,,
4,709011185621,2006-05-01,,7.0,88303.13,50.0,310.0,310.0,2032-03-01,12060.0,...,,,,,,,,,,


In [16]:
%%time
len(perf_ddf)

CPU times: user 26.1 ms, sys: 0 ns, total: 26.1 ms
Wall time: 154 ms


1890353680

In [17]:
%%time
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 243 ms, sys: 35.8 ms, total: 278 ms
Wall time: 1.88 s


servicer
2129140583    4.875
2139325342    3.875
Name: interest_rate, dtype: float64

In [18]:
%%time
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 159 ms, sys: 32.5 ms, total: 192 ms
Wall time: 1.41 s


servicer
-407380        6.500
 2131178196    5.875
Name: interest_rate, dtype: float64

In [19]:
%%time
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 183 ms, sys: 10.4 ms, total: 193 ms
Wall time: 1.46 s


servicer
2131392767    3.875
1091397644    4.500
Name: interest_rate, dtype: float64

In [20]:
checksum = perf_ddf['loan_id'].sum().compute()
checksum

6573355020803881490

In [21]:
# compare to checksum from mortgage_etl_4.ipynb.
checksum - 6573355020803881490

0

In [22]:
print(datetime.datetime.utcnow().isoformat())

2019-11-23T01:37:40.137584


In [23]:
!nvidia-smi

Sat Nov 23 01:37:40 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:34:00.0 Off |                    0 |
| N/A   36C    P0    70W / 350W |  13836MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   35C    P0    67W / 350W |  13712MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM3...  On   | 00000000:39:00.0 Off |                    0 |
| N/A   