In [1]:
from collections import OrderedDict
import os
import glob
import numpy as np
import pandas as pd
import dask
from dask.delayed import delayed
from dask.distributed import Client, wait
import dask_cudf

In [2]:
# Switch between the two cluster setups in this notebook: True starts a
# LocalCUDACluster from this kernel; False skips this cell's body so that the
# client is created by the cell that connects to an existing scheduler.
use_local_cuda_cluster = False
# Use below for a local-only CUDA cluster
if use_local_cuda_cluster:
    # Imported only on this path, so dask_cuda is not required for the remote setup.
    from dask_cuda import LocalCUDACluster
    # NOTE(review): ip='0.0.0.0' presumably makes the cluster listen on all
    # network interfaces - confirm that is intended for a local-only run.
    cluster = LocalCUDACluster(ip='0.0.0.0')
    client = Client(cluster)

In [3]:
# Use below for a multi-host multi-GPU CUDA cluster started with start_dask.py.
if not use_local_cuda_cluster:
    # The scheduler location is site-specific, so allow it to be overridden from
    # the environment (DASK_SCHEDULER_ADDRESS) without editing the notebook.
    # The previously hard-coded address remains the default.
    scheduler_address = os.environ.get('DASK_SCHEDULER_ADDRESS', '10.200.11.12:8786')
    client = Client(address=scheduler_address)

In [4]:
# Display the client summary (scheduler address, dashboard URL, workers, cores, memory).
client

0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 32  Cores: 160  Memory: 2.20 TB


In [5]:
%%time
# Restart all workers. This also clears GPU memory.
# NOTE(review): anything previously persisted on the workers is presumably lost
# after this call - re-run the load cells below afterwards.
client.restart()

CPU times: user 17 ms, sys: 1.42 ms, total: 18.4 ms
Wall time: 19.3 s


0,1
Client  Scheduler: tcp://10.200.11.12:8786  Dashboard: http://10.200.11.12:8787/status,Cluster  Workers: 32  Cores: 160  Memory: 2.20 TB


In [6]:
# Shell escape: print the GPU status reported by nvidia-smi on the machine running this kernel.
!nvidia-smi

Fri Nov 22 17:59:50 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM3...  On   | 00000000:34:00.0 Off |                    0 |
| N/A   35C    P0    70W / 350W |    322MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   34C    P0    68W / 350W |    322MiB / 32480MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM3...  On   | 00000000:39:00.0 Off |                    0 |
| N/A   

In [7]:
# Below from mortgage/E2E.ipynb.
def gpu_load_performance_csv(performance_path, **kwargs):
    """Load mortgage performance data through ``dask_cudf.read_csv``.

    The input is read as headerless, pipe-delimited text. Column names and
    dtypes both come from the single 31-column schema defined below, so the
    two can never drift out of sync.

    Parameters
    ----------
    performance_path
        Path(s) of the performance file(s), handed unchanged to
        ``dask_cudf.read_csv``. This notebook passes a list of file paths.
    **kwargs
        Extra keyword arguments forwarded to ``dask_cudf.read_csv``. A keyword
        that collides with a default set here (``names``, ``delimiter``,
        ``dtype``) replaces that default.

    Returns
    -------
    GPU DataFrame
        The object returned by ``dask_cudf.read_csv``.
    """

    # Ordered schema: key order is the column order in the file, values are the
    # dtype strings passed to the reader.
    dtypes = OrderedDict([
        ("loan_id", "int64"),
        ("monthly_reporting_period", "date"),
        ("servicer", "category"),
        ("interest_rate", "float64"),
        ("current_actual_upb", "float64"),
        ("loan_age", "float64"),
        ("remaining_months_to_legal_maturity", "float64"),
        ("adj_remaining_months_to_maturity", "float64"),
        ("maturity_date", "date"),
        ("msa", "float64"),
        ("current_loan_delinquency_status", "int32"),
        ("mod_flag", "category"),
        ("zero_balance_code", "category"),
        ("zero_balance_effective_date", "date"),
        ("last_paid_installment_date", "date"),
        ("foreclosed_after", "date"),
        ("disposition_date", "date"),
        ("foreclosure_costs", "float64"),
        ("prop_preservation_and_repair_costs", "float64"),
        ("asset_recovery_costs", "float64"),
        ("misc_holding_expenses", "float64"),
        ("holding_taxes", "float64"),
        ("net_sale_proceeds", "float64"),
        ("credit_enhancement_proceeds", "float64"),
        ("repurchase_make_whole_proceeds", "float64"),
        ("other_foreclosure_proceeds", "float64"),
        ("non_interest_bearing_upb", "float64"),
        ("principal_forgiveness_upb", "float64"),
        ("repurchase_make_whole_proceeds_flag", "category"),
        ("foreclosure_principal_write_off_amount", "float64"),
        ("servicing_activity_indicator", "category")
    ])

    # Defaults reproduce the original call exactly when no kwargs are given.
    read_options = {
        "names": list(dtypes.keys()),
        "delimiter": '|',
        "dtype": list(dtypes.values()),
    }
    # Previously **kwargs was accepted but silently dropped; caller-supplied
    # options are now honoured and take precedence over the defaults.
    read_options.update(kwargs)

    return dask_cudf.read_csv(performance_path, **read_options)

In [8]:
# Identify list of files to load.
data_dir = '/mnt/isilon1/data/mortgage'
# Narrow the pattern to load only a subset, e.g. 'Performance_2016*.txt'.
perf_pattern = os.path.join(data_dir, 'perf', 'Performance_*')
# glob returns matches in arbitrary order; sort so the file list (and with it
# the order in which files are handed to the loader) is the same on every run.
perf_file = sorted(glob.glob(perf_pattern))
len(perf_file)

112

In [9]:
%%time
# Build the DataFrame object over all discovered files. The read into GPU
# memory is triggered by the later persist() cell.
perf_ddf = gpu_load_performance_csv(perf_file)

CPU times: user 1.13 s, sys: 694 ms, total: 1.82 s
Wall time: 2.33 s


In [10]:
# Display the DataFrame structure: column names, dtypes and partition count.
perf_ddf

Unnamed: 0_level_0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,current_loan_delinquency_status,mod_flag,zero_balance_code,zero_balance_effective_date,last_paid_installment_date,foreclosed_after,disposition_date,foreclosure_costs,prop_preservation_and_repair_costs,asset_recovery_costs,misc_holding_expenses,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
npartitions=823,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
,int64,datetime64[ms],int32,float64,float64,float64,float64,float64,datetime64[ms],float64,int32,int32,int32,datetime64[ms],datetime64[ms],datetime64[ms],datetime64[ms],float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,float64,int32
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [11]:
%%time
# Read from files into GPU memory.
# wait() blocks until the persisted collection has finished, so the %%time
# measurement covers the whole read rather than just task submission.
perf_ddf = perf_ddf.persist()
wait(perf_ddf)

CPU times: user 627 ms, sys: 42.8 ms, total: 670 ms
Wall time: 25.7 s


In [12]:
# Uncomment to inspect the task graph behind perf_ddf:
#perf_ddf.dask

In [13]:
%%time
# Preview the first rows of the loaded data.
perf_ddf.head()

CPU times: user 89.2 ms, sys: 1.79 ms, total: 91 ms
Wall time: 904 ms


Unnamed: 0,loan_id,monthly_reporting_period,servicer,interest_rate,current_actual_upb,loan_age,remaining_months_to_legal_maturity,adj_remaining_months_to_maturity,maturity_date,msa,...,holding_taxes,net_sale_proceeds,credit_enhancement_proceeds,repurchase_make_whole_proceeds,other_foreclosure_proceeds,non_interest_bearing_upb,principal_forgiveness_upb,repurchase_make_whole_proceeds_flag,foreclosure_principal_write_off_amount,servicing_activity_indicator
0,548637365156,2005-01-01,,5.75,130964.67,9.0,351.0,351.0,2034-04-01,35840.0,...,,,,,,,,,,
1,548637365156,2006-01-01,,5.75,129125.98,21.0,339.0,338.0,2034-04-01,35840.0,...,,,,,,,,,,
2,548637365156,2007-01-01,,5.75,127061.77,33.0,327.0,325.0,2034-04-01,35840.0,...,,,,,,,,,,
3,548637365156,2005-02-01,,5.75,130820.14,10.0,350.0,350.0,2034-04-01,35840.0,...,,,,,,,,,,
4,548637365156,2006-02-01,,5.75,128972.64,22.0,338.0,337.0,2034-04-01,35840.0,...,,,,,,,,,,


In [18]:
%%time
# Total number of rows in perf_ddf.
len(perf_ddf)

CPU times: user 272 ms, sys: 7.94 ms, total: 280 ms
Wall time: 892 ms


1890353680

In [15]:
%%time
# Convert the head() sample to pandas and print its per-column dtypes and non-null counts.
perf_ddf.head().to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 31 columns):
loan_id                                   5 non-null int64
monthly_reporting_period                  5 non-null datetime64[ns]
servicer                                  5 non-null int32
interest_rate                             5 non-null float64
current_actual_upb                        5 non-null float64
loan_age                                  5 non-null float64
remaining_months_to_legal_maturity        5 non-null float64
adj_remaining_months_to_maturity          5 non-null float64
maturity_date                             5 non-null datetime64[ns]
msa                                       5 non-null float64
current_loan_delinquency_status           5 non-null int32
mod_flag                                  5 non-null int32
zero_balance_code                         5 non-null int32
zero_balance_effective_date               0 non-null datetime64[ns]
last_paid_installment_date        

In [20]:
%%time
# Maximum interest rate per servicer. The servicer column has dtype int32 in
# perf_ddf, so the group keys are printed as integers rather than names.
perf_ddf.groupby(['servicer'])['interest_rate'].max().compute()

CPU times: user 439 ms, sys: 1.87 ms, total: 441 ms
Wall time: 4.46 s


servicer
-607615835     10.625
-719450719      7.875
 43032076       7.125
 614598578     10.635
-1528626516     9.875
-2008831641     9.750
 355891246      9.875
 1612282686     8.875
-909118907     10.125
 412535281      7.875
 1244349582     6.000
 921762798     11.125
 976527792     10.250
-664828559     10.500
 1151096863    10.125
 1810806798    11.750
-1458911142    10.625
 1759296166     9.625
-830673843      9.375
-1183046192    10.750
-1993143021    10.125
-1583942043     9.000
 1251299360     9.000
 2115037238    10.750
-1587261923    11.125
 1240500859    13.500
-470063752      6.500
-1376962655     9.625
 1408366847    10.375
-552170331     10.625
 1421773951     6.875
 1802337997    10.625
 1569661717     9.250
 745151631      6.125
-762052852      9.750
 795749361     10.500
 10796916      10.500
-2127902343     8.875
-854413153      9.875
-796085340      7.750
 1624108708    10.625
 310110673     10.000
 540689291      8.500
-983513704     11.500
-1974001203    11.750
N