In [1]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf
import datetime

In [2]:
# Print the current UTC time in ISO 8601 format. The same statement is repeated
# near the end of the notebook, so the two outputs bracket the run.
print(datetime.datetime.utcnow().isoformat())

2019-11-28T05:13:03.063038


In [3]:
# Print the contents of start_dask.yaml so the cluster settings are recorded in the notebook output.
!cat start_dask.yaml

device_memory_limit_gib: 26.0

# Use below for Parquet. Use build_docker_nightly.sh.
docker_image: claudiofahey/rapidsai:a359097c3c18a534b91557d5abe772c73ef57d11de3dfb632e1516b0a01745f1

# Use below for ORC. Use build_docker_010.sh.
#docker_image: claudiofahey/rapidsai:0.10-cuda10.0-runtime-ubuntu18.04-custom

host:
  - 10.200.11.12
  - 10.200.11.13
  - 10.200.11.11
memory_limit_gib: 64.0


In [4]:
# Switch between the two ways this notebook obtains a Dask client:
#   True  -> start a LocalCUDACluster from this notebook (this cell)
#   False -> connect to an already-running scheduler (next cell)
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    # Imported inside the branch, so dask_cuda is only required when this path is taken.
    from dask_cuda import LocalCUDACluster
    # NOTE(review): ip='0.0.0.0' presumably makes the cluster listen on all interfaces — confirm.
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [5]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
if not use_local_cuda_cluster:
    # Scheduler host:port. 10.200.11.12 is the first host listed in start_dask.yaml.
    scheduler_address = '10.200.11.12:8786'
    client = Client(address=scheduler_address)

In [6]:
# Display the client summary (scheduler address, dashboard link, worker/core/memory totals).
client

0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 48  Cores: 240  Memory: 3.30 TB


In [7]:
import cudf

In [8]:
%%time
# Restart all workers. This also clears GPU memory.
#client.restart()

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.39 µs


In [9]:
def gpu_load_performance_data(performance_path, **kwargs):
    """Load mortgage performance data from ORC file(s) with dask_cudf.

    The data is read with ``dask_cudf.read_orc`` and the resulting columns are
    renamed *positionally* to the performance-data column names listed in
    ``cols`` below.

    Parameters
    ----------
    performance_path
        Path, or list of paths, handed unchanged to ``dask_cudf.read_orc``
        (this notebook passes a list of file names).
    **kwargs
        Extra keyword arguments forwarded to ``dask_cudf.read_orc``.
        Options that change the number of columns returned will trip the
        column-count check described under "Raises".

    Returns
    -------
    GPU DataFrame
        The object returned by ``dask_cudf.read_orc`` with its columns renamed.

    Raises
    ------
    ValueError
        If the number of columns read differs from the number of expected
        names. The rename is positional, so a mismatch would otherwise label
        columns incorrectly without any error.
    """
    cols = [
        "loan_id", "monthly_reporting_period", "servicer", "interest_rate", "current_actual_upb",
        "loan_age", "remaining_months_to_legal_maturity", "adj_remaining_months_to_maturity",
        "maturity_date", "msa", "current_loan_delinquency_status", "mod_flag", "zero_balance_code",
        "zero_balance_effective_date", "last_paid_installment_date", "foreclosed_after",
        "disposition_date", "foreclosure_costs", "prop_preservation_and_repair_costs",
        "asset_recovery_costs", "misc_holding_expenses", "holding_taxes", "net_sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_make_whole_proceeds", "other_foreclosure_proceeds",
        "non_interest_bearing_upb", "principal_forgiveness_upb", "repurchase_make_whole_proceeds_flag",
        "foreclosure_principal_write_off_amount", "servicing_activity_indicator"
    ]

    # Forward caller options instead of silently discarding them.
    ddf = dask_cudf.read_orc(performance_path, **kwargs)

    # zip() stops at the shorter sequence, so check the lengths first; a
    # partial positional rename would silently mislabel the data.
    source_cols = list(ddf.columns)
    if len(source_cols) != len(cols):
        raise ValueError(
            "Expected %d performance columns but the ORC data has %d: %r"
            % (len(cols), len(source_cols), source_cols)
        )

    # Fix column names from ORC file
    ddf = ddf.rename(columns=dict(zip(source_cols, cols)))
    return ddf

In [15]:
# Identify list of files to load.
data_dir = '/mnt/isilon1/data/mortgage'
perf_file = []
#perf_file += glob.glob(data_dir + '/perf-snappy.orc/*')
# glob.glob returns matches in arbitrary order; sort them so the slice below
# selects the same file on every run and on every machine.
perf_file += sorted(glob.glob(data_dir + '/from-spark.orc/*.orc'))
# Keep only the first file.
perf_file = perf_file[0:1]
len(perf_file)

1

In [16]:
%%time
# Build the dask_cudf DataFrame for the selected file(s).
perf_ddf = gpu_load_performance_data(perf_file)

CPU times: user 302 ms, sys: 693 ms, total: 995 ms
Wall time: 1.02 s


In [17]:
# Display the DataFrame structure (column names, dtypes, npartitions).
perf_ddf

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_delinquency_status,mod_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosed_after,disposition_date,foreclosure_costs,prop_preservation_and_repair_costs,asset_recovery_costs,misc_holding_expenses,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
npartitions=24,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,int64,datetime64[ns],object,float64,float64,float64,float64,float64,object,float64,int32,object,object,datetime64[ns],datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [19]:
# NOTE(review): 24 matches npartitions of perf_ddf shown above; 1042 is not
# explained anywhere in this notebook — presumably the input size (MB?), making
# the result a per-partition size. TODO confirm and replace with named values.
1042/24

43.416666666666664

In [None]:
# Preview the first rows of the performance data.
perf_ddf.head()

In [None]:
%%time
# Read from files into GPU memory.
# persist() starts the load on the cluster and wait() blocks until it has
# finished, so the %%time figure for this cell covers the whole load.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

In [None]:
# Display the DataFrame structure again, now that it has been persisted.
perf_ddf

In [None]:
#perf_ddf.dask

In [None]:
%%time
# Time head() on the persisted DataFrame.
perf_ddf.head()

In [None]:
%%time
# Time a row count of the persisted DataFrame.
len(perf_ddf)

In [None]:
%%time
# Maximum interest_rate per servicer; only the first two result rows are shown.
# The identical query is repeated in the next two cells, presumably to compare
# timings across successive runs.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

In [None]:
%%time
# Second timed run of the same max-interest_rate-per-servicer query.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

In [None]:
%%time
# Third timed run of the same max-interest_rate-per-servicer query.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

In [None]:
# Sum of all loan_id values, used as a simple checksum of the loaded data.
checksum = perf_ddf['loan_id'].sum().compute()
checksum

In [None]:
# compare to checksum from mortgage_etl_4.ipynb.
# A result of 0 means the two checksums are equal.
# NOTE(review): this notebook loads only perf_file[0:1] — confirm the reference
# value below was computed over the same file(s).
checksum - 6573355020803881490

In [None]:
# Print the current UTC time in ISO 8601 format; together with the identical
# statement near the top of the notebook this brackets the run.
print(datetime.datetime.utcnow().isoformat())

In [None]:
# Run nvidia-smi on the machine hosting the notebook kernel.
!nvidia-smi