In [33]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf
import datetime

In [34]:
# Print the current UTC time in ISO 8601 form. utcnow() returns a naive datetime,
# so the printed string carries no UTC offset suffix.
print(datetime.datetime.utcnow().isoformat())

2019-12-03T01:24:51.908569


In [35]:
# Dump start_dask.yaml into the notebook output so the settings it contains are
# recorded alongside the results.
!cat start_dask.yaml

device_memory_limit_gib: 26.0

# Use below for Parquet. Use build_docker_nightly.sh.
#docker_image: claudiofahey/rapidsai:a359097c3c18a534b91557d5abe772c73ef57d11de3dfb632e1516b0a01745f1

# Use below for ORC. Use build_docker_010.sh.
docker_image: claudiofahey/rapidsai:0.10-cuda10.0-runtime-ubuntu18.04-custom

host:
  - 10.200.11.12
  - 10.200.11.13
  - 10.200.11.11
memory_limit_gib: 64.0


In [36]:
# Cluster selector: True starts a LocalCUDACluster in this cell; False leaves
# `client` to be created by the `if not use_local_cuda_cluster:` cell instead.
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    # Imported inside the branch, so dask_cuda is only required when this path is taken.
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [37]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
if not use_local_cuda_cluster:
    # host:port of the scheduler to connect to (hard-coded for this environment).
    scheduler_address = '10.200.11.12:8786'
    client = Client(address=scheduler_address)

In [38]:
# Display the client summary (scheduler address, dashboard URL, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 48  Cores: 240  Memory: 3.30 TB


In [39]:
import cudf

In [40]:
%%time
# Restart all workers. This also clears GPU memory.
# The call is the cell's last expression, so its return value (the client
# summary) is displayed after the timing.
client.restart()

CPU times: user 30 ms, sys: 14 ms, total: 44 ms
Wall time: 18.7 s


0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 48  Cores: 240  Memory: 3.30 TB


In [41]:
def gpu_load_performance_data(performance_path, **kwargs):
    """Load mortgage performance data from ORC into a dask_cudf DataFrame.

    The column names stored in the ORC files are replaced, by position, with
    the performance column names listed in ``cols`` below.

    Parameters
    ----------
    performance_path
        Path, or list of paths, handed unchanged to ``dask_cudf.read_orc``
        (this notebook passes a list of files).
    **kwargs
        Accepted so existing call sites keep working; currently ignored.

    Returns
    -------
    dask_cudf DataFrame whose columns are named after ``cols``.

    Raises
    ------
    ValueError
        If the number of columns read from the ORC data differs from the
        number of expected names. Positional renaming would otherwise
        silently mislabel or skip columns.
    """

    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate", "current_actual_upb",
        "loan_age", "remaining_months_to_legal_maturity", "adj_remaining_months_to_maturity",
        "maturity_date", "msa", "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date", "foreclosed_after",
        "disposition_date", "foreclosure_costs", "prop_preservation_and_repair_costs",
        "asset_recovery_costs", "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds", "other_foreclosure_proceeds",
        "non_interest_bearing_upb", "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator"
    ]

    ddf = dask_cudf.read_orc(performance_path)

    # zip() stops at the shorter sequence, so a column-count mismatch would go
    # unnoticed and produce wrongly named (or unrenamed) columns. Fail loudly.
    found = list(ddf.columns)
    if len(found) != len(cols):
        raise ValueError(
            "expected %d performance columns but the ORC data has %d: %r"
            % (len(cols), len(found), found)
        )

    # Fix column names from ORC file
    ddf = ddf.rename(columns=dict(zip(found, cols)))
    return ddf

In [42]:
# Identify list of files to load.
data_dir = '/mnt/isilon1/data/mortgage'
perf_file = []
# Alternative datasets: uncomment one of these instead of the active line below.
#perf_file += glob.glob(data_dir + '/perf-snappy.orc/*')
#perf_file += glob.glob(data_dir + '/perf-from-spark-4.00x-48p-2048MiB-snappy.orc/*.orc')
perf_file += glob.glob(data_dir + '/perf-from-spark-3.00x-48p-2048MiB-snappy.orc/*.orc')
# glob.glob() returns matches in filesystem-dependent order. Sort so the list
# handed to the loader (and the single-file subset below) is the same on every run.
perf_file.sort()
# Uncomment to run against a single file only.
#perf_file = perf_file[0:1]
# Number of files found, displayed as a sanity check.
len(perf_file)

48

In [43]:
%%time
# Create the dask_cudf DataFrame over every file in perf_file.
# NOTE(review): Dask collections are normally lazy, so this presumably only
# builds the task graph and does not load the data yet; confirm.
perf_ddf = gpu_load_performance_data(perf_file)

CPU times: user 1.16 s, sys: 623 ms, total: 1.78 s
Wall time: 1.83 s


In [30]:
# Display the DataFrame's structure: column names, dtypes and npartitions.
perf_ddf

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_delinquency_status,mod_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosed_after,disposition_date,foreclosure_costs,prop_preservation_and_repair_costs,asset_recovery_costs,misc_holding_expenses,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
npartitions=96,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,int64,datetime64[ns],object,float64,float64,float64,float64,float64,object,float64,int32,object,object,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [45]:
%%time
# persist() asks the cluster to materialize the partitions and keep them on the
# workers; wait() blocks until that work has finished, so the timing covers the
# whole load rather than just the submission.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

CPU times: user 110 ms, sys: 465 µs, total: 111 ms
Wall time: 106 ms


In [46]:
%%time
# Benchmark query: maximum interest_rate per servicer. compute() brings the
# aggregated result back to the client; head(2) shows only its first two rows.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 153 ms, sys: 8.44 ms, total: 161 ms
Wall time: 9.65 s


QUICKEN LOANS INC.     8.875
FLAGSTAR BANK, FSB    10.625
Name: interest_rate, dtype: float64

In [50]:
# Drop this notebook's reference to perf_ddf before the next cell loads it again.
del perf_ddf

In [51]:
%%time
# Re-create the dask_cudf DataFrame over every file in perf_file for the next timed run.
perf_ddf = gpu_load_performance_data(perf_file)

CPU times: user 1.1 s, sys: 677 ms, total: 1.77 s
Wall time: 1.8 s


In [52]:
%%time
# Materialize the partitions on the workers and block until done, so the
# groupby that follows runs against data that is already loaded.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

CPU times: user 339 ms, sys: 2.15 ms, total: 341 ms
Wall time: 6.74 s


In [54]:
# NOTE: `del perf_ddf` used to live in this cell. The cell sits *above* the
# groupby cell that still reads perf_ddf (it was executed as In [54], after that
# groupby ran as In [53]), so a top-to-bottom "Restart & Run All" failed with
# NameError. No explicit delete is needed: the reference is dropped when the
# next load cell rebinds perf_ddf.

In [53]:
%%time
# Same benchmark query as before (max interest_rate per servicer), this time
# after the persist()/wait() cell, to time the query on its own.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 122 ms, sys: 4.79 ms, total: 127 ms
Wall time: 1.5 s


QUICKEN LOANS INC.     8.875
FLAGSTAR BANK, FSB    10.625
Name: interest_rate, dtype: float64

In [16]:
%%time
# Rebind perf_ddf to a freshly created dask_cudf DataFrame over perf_file.
perf_ddf = gpu_load_performance_data(perf_file)

CPU times: user 1.17 s, sys: 584 ms, total: 1.76 s
Wall time: 1.81 s


In [45]:
%%time
# Materialize the partitions on the workers and block until they are ready.
# NOTE(review): this cell and its recorded output are identical to the earlier
# In [45] cell; it looks like a copy kept from a previous run. Confirm whether
# it is still wanted here.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

CPU times: user 110 ms, sys: 465 µs, total: 111 ms
Wall time: 106 ms


In [17]:
%%time
# Benchmark query: maximum interest_rate per servicer, first two rows shown.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

CPU times: user 362 ms, sys: 15.1 ms, total: 377 ms
Wall time: 9.61 s


QUICKEN LOANS INC.     8.875
FLAGSTAR BANK, FSB    10.625
Name: interest_rate, dtype: float64

In [58]:
%%time
# Run the full load -> (optional persist) -> groupby sequence three times so the
# total wall time covers repeated end-to-end queries.
# Set to True to materialize the data on the workers before querying.
persist = False
for i in range(3):
    print('i=%d' % i)
    perf_ddf = gpu_load_performance_data(perf_file)
    if persist:
        perf_ddf = perf_ddf.persist()
        wait(perf_ddf)
    result_df = perf_ddf.groupby(['servicer'])['interest_rate'].max().compute()
    # One row per servicer group, so this prints the number of distinct servicers.
    print(len(result_df))
    # Drop the reference before the next iteration creates a new DataFrame.
    del perf_ddf

i=0
46
i=1
46
i=2
46
CPU times: user 4.4 s, sys: 1.97 s, total: 6.37 s
Wall time: 34.7 s


In [64]:
%%time
# Construct 100 separate dask_cudf DataFrames over the same file list; the next
# cell runs one query per DataFrame. Each construction repeats the read_orc
# setup, so this cell is slow (see the recorded wall time).
perf_ddfs = [gpu_load_performance_data(perf_file) for _ in range(100)]

CPU times: user 1min 57s, sys: 1min 4s, total: 3min 2s
Wall time: 3min 7s


In [65]:
%%time
# Run the benchmark query once per pre-built DataFrame and print the number of
# servicer groups returned each time.
for perf_ddf in perf_ddfs:
    result_df = perf_ddf.groupby(['servicer'])['interest_rate'].max().compute()
    print(len(result_df))
    # Only unbinds the loop name; the perf_ddfs list still references every
    # DataFrame, so this does not release anything by itself.
    del perf_ddf

46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
46
CPU times: user 38.6 s, sys: 2.32 s, total: 40.9 s
Wall time: 18min 48s


In [19]:
# Print the current UTC time in ISO 8601 form (naive datetime, no offset suffix).
# NOTE(review): the recorded output and the In [19] counter predate the cells
# above, so this output appears to come from an earlier run. Confirm.
print(datetime.datetime.utcnow().isoformat())

2019-12-03T01:19:57.373627


In [20]:
# Show GPU status (per-GPU memory use and utilization) on the machine running this kernel.
!nvidia-smi

Tue Dec  3 01:19:58 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:34:00.0 Off |                    0 |
| N/A   37C    P0    85W / 350W |    965MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   36C    P0    82W / 350W |    490MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM3...  On   | 00000000:39:00.0 Off |                    0 |
| N/A   