In [None]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf
import datetime

In [None]:
# Log the current time (UTC) before the workload runs.
# Uses a timezone-aware value: datetime.utcnow() returns a naive datetime and is
# deprecated since Python 3.12. The printed ISO string now carries a "+00:00" offset.
print(datetime.datetime.now(datetime.timezone.utc).isoformat())

In [None]:
# Print start_dask.yaml (from the notebook's working directory) so that its contents
# are captured in the notebook output.
!cat start_dask.yaml

In [None]:
# Switch between the two ways of obtaining `client`: True starts a cluster from this
# notebook; False leaves it to the `if not use_local_cuda_cluster` cell.
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    # Imported only on this path, so dask_cuda is not needed when the flag is False.
    from dask_cuda import LocalCUDACluster
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [None]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
# Runs only when use_local_cuda_cluster is False.
if not use_local_cuda_cluster:
    # Hard-coded scheduler host:port; edit to match the scheduler being targeted.
    scheduler_address = '10.200.11.12:8786'
    client = Client(address=scheduler_address)

In [None]:
# Show the client object as the cell output to confirm which cluster is in use.
client

In [None]:
import cudf

In [None]:
%%time
# Restart all workers. This also clears GPU memory.
# %%time reports how long the restart takes.
client.restart()

In [None]:
def gpu_load_performance_data(performance_path, **kwargs):
    """Load performance data from ORC file(s) with dask_cudf.

    Column names are used exactly as stored in the ORC files; no renaming is
    applied.

    Parameters
    ----------
    performance_path : str or list of str
        Path(s) of the ORC file(s) to read; passed unchanged to
        ``dask_cudf.read_orc``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``dask_cudf.read_orc``
        (for example ``columns=[...]`` to read only some columns). They were
        previously accepted but silently ignored.

    Returns
    -------
    GPU DataFrame
        The distributed DataFrame object returned by ``dask_cudf.read_orc``.
    """
    return dask_cudf.read_orc(performance_path, **kwargs)

In [None]:
# Identify list of files to load.
data_dir = '/mnt/isilon1/data/mortgage'
perf_file = []
# Alternative datasets; uncomment a line to include it in the load.
#perf_file += glob.glob(data_dir + '/perf-snappy.orc/*')
#perf_file += glob.glob(data_dir + '/perf-from-spark-4.00x-48p-2048MiB-snappy.orc/*.orc')
#perf_file += glob.glob(data_dir + '/perf-from-spark-3.00x-48p-2048MiB-snappy.orc/*.orc')
perf_file += glob.glob(data_dir + '/perf-no-strings-0.10x-48p-2048MiB-snappy.orc/*.orc')
# glob returns paths in a filesystem-dependent order; sort so that the file list
# (and therefore the partition order) is the same on every run.
perf_file.sort()
# Uncomment to load only the first file.
#perf_file = perf_file[0:1]
# Number of files selected, shown as the cell output.
len(perf_file)

In [None]:
%%time
# Build the performance-data collection from the file list. Timed on its own,
# separately from the persist() step in the following cell.
perf_ddf = gpu_load_performance_data(perf_file)

In [None]:
# Display the collection's repr as the cell output.
perf_ddf

In [None]:
%%time
# persist() submits the collection to the cluster and wait() blocks until it has
# finished, so the %%time figure covers the complete load.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

In [None]:
%%time
# Maximum interest_rate per servicer. compute() returns the result to the notebook
# and head(2) keeps only the first two rows for display.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute().head(2)

In [None]:
# Unbind perf_ddf before repeating the load.
# NOTE(review): presumably this lets the cluster release the persisted data; confirm.
del perf_ddf

In [None]:
%%time
# Second construction of the collection from the same file list
# (presumably to compare its timing with the first run).
perf_ddf = gpu_load_performance_data(perf_file)

In [None]:
%%time
# Second persist of the same data; wait() blocks until it has finished so that
# %%time covers the complete load.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

In [None]:
# Unbind perf_ddf again before the repeated-run cells that follow.
del perf_ddf

In [None]:
%%time
# Run load + groupby three times in a row; %%time measures the whole loop.
# With persist=False the persist()/wait() step is skipped, so the data is not
# persisted on the cluster before the groupby is computed.
persist = False
for i in range(3):
    print('i=%d' % i)
    perf_ddf = gpu_load_performance_data(perf_file)
    if persist:
        perf_ddf = perf_ddf.persist()
        wait(perf_ddf)
    # Maximum interest_rate per servicer, brought back to the notebook.
    result_df = perf_ddf.groupby(['servicer'])['interest_rate'].max().compute()
    # Number of rows in the result (one per servicer group).
    print(len(result_df))
    del perf_ddf

In [None]:
%%time
# Create 100 separate collections over the same file list up front, so that this
# construction step is timed separately from the computations in the next cell.
perf_ddfs = [gpu_load_performance_data(perf_file) for _ in range(100)]

In [None]:
%%time
# Compute the per-servicer maximum interest_rate for every prepared collection.
for perf_ddf in perf_ddfs:
    result_df = perf_ddf.groupby(['servicer'])['interest_rate'].max().compute()
    print(len(result_df))
    # NOTE(review): this only unbinds the loop variable; perf_ddfs still references
    # every collection, so nothing is released here.
    del perf_ddf

In [None]:
# Log the current time (UTC) after the workload has run.
# Uses a timezone-aware value: datetime.utcnow() returns a naive datetime and is
# deprecated since Python 3.12. The printed ISO string now carries a "+00:00" offset.
print(datetime.datetime.now(datetime.timezone.utc).isoformat())

In [None]:
# Run nvidia-smi on the machine hosting this notebook kernel and capture its output.
!nvidia-smi