In [1]:
import dask
# Show the active Dask configuration dictionary.
dask.config.config

# persist vs. compute: persist pins results in memory so that later computations can reuse them instead of recomputing.
# Dask uses lazy execution.

{'temporary-directory': None,
 'dataframe': {'shuffle-compression': None},
 'array': {'svg': {'size': 120}}}

In [3]:
from dask_jobqueue import LSFCluster

In [4]:
# Launcher prefix describing the per-node resource set for a worker.
# NOTE(review): this string is not passed to LSFCluster below, and the recorded
# job script does not contain it -- confirm whether it should be wired in.
dask_worker_prefix = "jsrun -n1 -a1 -g0 -c1"

# Describe the LSF job that backs each Dask worker.
# NOTE(review): the recorded job script shows "-W 00:30", so that output comes
# from a run with a different walltime than the value set here.
cluster = LSFCluster(
    # Accounting and time limit.
    project="VEN201",
    walltime="00:10",
    # Worker size. A single process keeps all cores as threads in one worker
    # (the default is about sqrt(cores) processes), which is better for NumPy; see
    # https://docs.dask.org/en/latest/setup/single-machine.html
    cores=8,
    processes=1,
    memory="4 GB",
    # Networking.
    interface="ib0",
    scheduler_options={"dashboard_address": ":3762"},
    # Job-header tweaks: add the "-nnodes 1" directive and leave out header
    # lines matching "-R", "-n " and "-M" (none of them appear in the recorded script).
    job_extra=["-nnodes 1"],
    header_skip=["-R", "-n ", "-M"],
    use_stdin=False,
)

## Let's see what is sent to LSF

In [5]:
# Print the batch script that is generated for each dask-worker job.
print(cluster.job_script())

#!/usr/bin/env bash

#BSUB -J dask-worker
#BSUB -P VEN201
#BSUB -W 00:30
#BSUB -nnodes 1

/ccs/home/vanstee/.conda/envs/powerai-ornl/bin/python -m distributed.cli.dask_worker tcp://10.41.0.32:36525 --nthreads 8 --memory-limit 4.00GB --name name --nanny --death-timeout 60 --interface ib0



In [6]:
from dask.distributed import Client
# Connect a client to the scheduler owned by `cluster`.
client = Client(cluster)

In [6]:
# client.restart()

In [7]:
# Display the client summary (scheduler address, dashboard URL, worker totals).
client

0,1
Client  Scheduler: tcp://10.41.0.32:36525  Dashboard: http://10.41.0.32:3762/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [8]:
# Ask the cluster to scale to 2 workers (presumably one LSF job each, since processes=1).
cluster.scale(2)

In [None]:
# Re-display the client summary to check whether workers have joined.
client

In [12]:
# List the current LSF jobs to confirm that the dask-worker jobs are running.
!bjobs

JOBID   USER       STAT   SLOTS    QUEUE       START_TIME    FINISH_TIME   JOB_NAME                      
376497  vanstee    RUN    43       batch       Sep 29 16:23  Sep 29 16:53  dask-worker                   
376498  vanstee    RUN    43       batch       Sep 29 16:23  Sep 29 16:53  dask-worker                   


# Simple NumPy example ...

In [14]:
import dask.array as da
# 25 M element array (5000 x 5000 float64, ~200 MB), split into 400 chunks of 250 x 250
x = da.random.random([5000,5000], chunks=[250,250])


In [18]:
# Scale the cluster up to 8 workers.
cluster.scale(8)

In [15]:
# Trigger generation of the random chunks and rebind x to the persisted array,
# so later cells reuse the stored chunks instead of regenerating them.
x = x.persist()
# Display the array summary (size, shape, chunking).
x

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 200.00 MB 500.00 kB Shape (5000, 5000) (250, 250) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",5000  5000,

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [16]:
# Lazily define y: x transposed raised elementwise to the power x, minus the mean over all of x.
y = x.T ** x - x.mean()

In [17]:
# persist() returns a new collection backed by the stored results; it does not
# modify y in place. Rebind y (as done for x) so that later cells work from the
# persisted result rather than relying on the notebook's output cache to keep it alive.
y = y.persist()
# Display the array summary.
y

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 200.00 MB 500.00 kB Shape (5000, 5000) (250, 250) Count 400 Tasks 400 Chunks Type float64 numpy.ndarray",5000  5000,

Unnamed: 0,Array,Chunk
Bytes,200.00 MB,500.00 kB
Shape,"(5000, 5000)","(250, 250)"
Count,400 Tasks,400 Chunks
Type,float64,numpy.ndarray


In [38]:
#del(y)
# Pull the full result back to the client process.
# NOTE(review): y is about 200 MB; this notebook's own guidance is to use compute
# only when the return value is small.
# NOTE(review): the recorded output is an AttributeError from an out-of-order run
# (In [38]); re-run on a fresh kernel to confirm the cell works.
y.compute()

AttributeError: 'str' object has no attribute 'shape'

In [None]:
# Persist vs. compute: https://distributed.dask.org/en/latest/memory.html
# Use compute when the return value is small and you want to feed the result into other analyses.
# Use persist (similar to cache in Spark) to trigger computation and pin the results in memory.
# Follow-on actions still build task graphs, but only from this point onward, because they reuse the values held by persist.

## Simple pandas example with our Lending Club data ...

In [16]:
!ls ../Tabular/ldata2016.csv.gz

import dask
import dask.dataframe as dd
ddf = dd.read_csv("./ldata2016.csv", blocksize=15e6) # , compression="gzip")
#
#ddf = ddf.repartition(npartitions=5)
#ldata2016.csv.gz
ddf
print(ddf.columns)

../Tabular/ldata2016.csv.gz
Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=151)


In [10]:
# Peek at the first few rows of the dataframe.
# NOTE(review): the recorded output shows "Unnamed: N" column names with the real
# header as the first data row, i.e. it came from an earlier run (In [10]) where
# the file's banner line was parsed as the header; re-run to refresh it.
ddf.head()

  args2 = [_execute_task(a, cache) for a in args]


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91,Unnamed: 92,Unnamed: 93,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103,Unnamed: 104,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114,Unnamed: 115,Unnamed: 116,Unnamed: 117,Unnamed: 118,Unnamed: 119,Unnamed: 120,Unnamed: 121,Unnamed: 122,Unnamed: 123,Unnamed: 124,Unnamed: 125,Unnamed: 126,Unnamed: 127,Unnamed: 128,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138,Unnamed: 139,Unnamed: 140,Unnamed: 141,Unnamed: 142,Unnamed: 143,Unnamed: 144,Unnamed: 145,Unnamed: 146,Unnamed: 147,Unnamed: 148,Unnamed: 149,Notes offered by Prospectus (https://www.lendingclub.com/info/prospectus.action)
id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_rea
son,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
76003542,,16000,16000,16000,36 months,5.32%,481.84,A,A1,Security specialist,8 years,RENT,105000,Not Verified,Mar-2016,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=76003542,,debt_consolidation,Debt consolidation,206xx,MD,15.02,1,Nov-2000,720,724,0,9,,9,0,6219,27.9%,21,w,0.00,0.00,16098.34,16098.34,16000.00,98.34,0.0,0.0,0.0,May-2016,16107.8,,Nov-2016,679,675,0,,1,Individual,,,,0,0,23525,0,3,2,2,8,17306,69,0,1,3770,50,22300,1,3,1,3,3361,13632,29.7,0,0,124,184,22,8,0,52,,8,9,0,2,3,2,5,7,6,14,3,9,0,0,0,2,95.2,0,0,0,47543,23525,19400,25243,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
76023477,,17000,17000,17000,36 months,9.75%,546.55,B,B3,Kindergarten Teacher,< 1 year,MORTGAGE,40000,Verified,Mar-2016,Current,n,https://lendingclub.com/browse/loanDetail.action?loan_id=76023477,,credit_card,Credit card refinancing,440xx,NC,24.3,0,Aug-2006,705,709,0,,,4,0,14009,85.9%,8,w,1856.98,1856.98,16630.58,16630.58,15143.02,1487.56,0.0,0.0,0.0,Nov-2017,546.55,Dec-2017,Nov-2017,799,795,0,,1,Individual,,,,0,0,31939,0,1,0,0,25,17930,64,0,0,4913,72,16300,1,0,1,0,7985,2291,85.9,0,0,25,115,35,25,0,35,,6,,0,3,3,3,5,1,3,7,3,4,0,0,0,0,100,100,0,0,44131,31939,16300,27831,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
75993535,,15000,15000,15000,60 months,15.31%,359.3,C,C5,Branch Manager,10+ years,MORTGAGE,70000,Source Verified,Mar-2016,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=75993535,,home_improvement,Home improvement,750xx,TX,9.99,0,Jan-2002,690,694,0,32,,6,0,11630,48.5%,22,w,0.00,0.00,17155.0456524628,17155.05,15000.00,2137.08,17.97,0.0,0.0,Apr-2017,13106.47,,Mar-2017,754,750,0,,1,Individual,,,,0,0,27322,0,1,1,1,12,15692,72,1,2,5840,60,24000,3,0,6,3,4554,9808,38.7,0,0,160,152,9,9,3,9,,8,,0,2,4,3,6,7,5,12,4,6,0,0,0,2,95.5,33.3,0,0,45706,27322,16000,21706,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
73458582,,2425,2425,2425,36 months,15.31%,84.44,C,C5,Teacher,5 years,MORTGAGE,25000,Source Verified,Mar-2016,Fully Paid,n,https://lendingclub.com/browse/loanDetail.action?loan_id=73458582,,credit_card,Credit card refinancing,296xx,SC,38.5,0,Sep-2007,705,709,0,,,7,0,10604,84.8%,15,w,0.00,0.00,2653.18,2653.18,2425.00,228.18,0.0,0.0,0.0,Dec-2016,2062.67,,Nov-2017,714,710,0,,1,Individual,,,,0,0,97019,0,4,0,1,24,15963,65,0,0,7288,72,12500,0,0,0,1,13860,12,99.8,0,0,102,97,27,24,2,27,,,,0,1,2,1,3,5,2,8,2,7,0,0,0,0,100,100,0,0,115022,26567,7300,24550,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [19]:
# Build a lazy boolean mask of loans with loan_amnt above 15000. Despite the name,
# this is a bool Series, not a filtered DataFrame.
filtered_df = ddf["loan_amnt"] > 15000
# Display the lazy Series structure (nothing is computed yet).
filtered_df

Dask Series Structure:
npartitions=9
    bool
     ...
    ... 
     ...
     ...
Name: loan_amnt, dtype: bool
Dask Name: gt, 45 tasks

In [20]:
# Execute the task graph and bring the boolean mask back to the client.
# NOTE(review): the recorded run failed with a dtype-mismatch ValueError raised while
# reading the CSV; the message recommends passing explicit dtypes to read_csv.
filtered_df.compute()

ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+----------------------------+---------+----------+
| Column                     | Found   | Expected |
+----------------------------+---------+----------+
| acc_now_delinq             | float64 | int64    |
| acc_open_past_24mths       | float64 | int64    |
| all_util                   | float64 | int64    |
| avg_cur_bal                | float64 | int64    |
| chargeoff_within_12_mths   | float64 | int64    |
| collections_12_mths_ex_med | float64 | int64    |
| delinq_2yrs                | float64 | int64    |
| delinq_amnt                | float64 | int64    |
| fico_range_high            | float64 | int64    |
| fico_range_low             | float64 | int64    |
| funded_amnt                | float64 | int64    |
| funded_amnt_inv            | float64 | int64    |
| id                         | object  | int64    |
| inq_fi                     | float64 | int64    |
| inq_last_12m               | float64 | int64    |
| inq_last_6mths             | float64 | int64    |
| last_fico_range_high       | float64 | int64    |
| last_fico_range_low        | float64 | int64    |
| loan_amnt                  | float64 | int64    |
| max_bal_bc                 | float64 | int64    |
| mo_sin_old_rev_tl_op       | float64 | int64    |
| mo_sin_rcnt_rev_tl_op      | float64 | int64    |
| mo_sin_rcnt_tl             | float64 | int64    |
| mort_acc                   | float64 | int64    |
| num_accts_ever_120_pd      | float64 | int64    |
| num_actv_bc_tl             | float64 | int64    |
| num_actv_rev_tl            | float64 | int64    |
| num_bc_sats                | float64 | int64    |
| num_bc_tl                  | float64 | int64    |
| num_il_tl                  | float64 | int64    |
| num_op_rev_tl              | float64 | int64    |
| num_rev_accts              | float64 | int64    |
| num_rev_tl_bal_gt_0        | float64 | int64    |
| num_sats                   | float64 | int64    |
| num_tl_30dpd               | float64 | int64    |
| num_tl_90g_dpd_24m         | float64 | int64    |
| num_tl_op_past_12m         | float64 | int64    |
| open_acc                   | float64 | int64    |
| open_acc_6m                | float64 | int64    |
| open_act_il                | float64 | int64    |
| open_il_12m                | float64 | int64    |
| open_il_24m                | float64 | int64    |
| open_rv_12m                | float64 | int64    |
| open_rv_24m                | float64 | int64    |
| policy_code                | float64 | int64    |
| pub_rec                    | float64 | int64    |
| pub_rec_bankruptcies       | float64 | int64    |
| revol_bal                  | float64 | int64    |
| tax_liens                  | float64 | int64    |
| tot_coll_amt               | float64 | int64    |
| tot_cur_bal                | float64 | int64    |
| tot_hi_cred_lim            | float64 | int64    |
| total_acc                  | float64 | int64    |
| total_bal_ex_mort          | float64 | int64    |
| total_bal_il               | float64 | int64    |
| total_bc_limit             | float64 | int64    |
| total_cu_tl                | float64 | int64    |
| total_il_high_credit_limit | float64 | int64    |
| total_rev_hi_lim           | float64 | int64    |
+----------------------------+---------+----------+

The following columns also raised exceptions on conversion:

- id
  ValueError("invalid literal for int() with base 10: 'Total amount funded in policy code 1: 2087217200'",)

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={'acc_now_delinq': 'float64',
       'acc_open_past_24mths': 'float64',
       'all_util': 'float64',
       'avg_cur_bal': 'float64',
       'chargeoff_within_12_mths': 'float64',
       'collections_12_mths_ex_med': 'float64',
       'delinq_2yrs': 'float64',
       'delinq_amnt': 'float64',
       'fico_range_high': 'float64',
       'fico_range_low': 'float64',
       'funded_amnt': 'float64',
       'funded_amnt_inv': 'float64',
       'id': 'object',
       'inq_fi': 'float64',
       'inq_last_12m': 'float64',
       'inq_last_6mths': 'float64',
       'last_fico_range_high': 'float64',
       'last_fico_range_low': 'float64',
       'loan_amnt': 'float64',
       'max_bal_bc': 'float64',
       'mo_sin_old_rev_tl_op': 'float64',
       'mo_sin_rcnt_rev_tl_op': 'float64',
       'mo_sin_rcnt_tl': 'float64',
       'mort_acc': 'float64',
       'num_accts_ever_120_pd': 'float64',
       'num_actv_bc_tl': 'float64',
       'num_actv_rev_tl': 'float64',
       'num_bc_sats': 'float64',
       'num_bc_tl': 'float64',
       'num_il_tl': 'float64',
       'num_op_rev_tl': 'float64',
       'num_rev_accts': 'float64',
       'num_rev_tl_bal_gt_0': 'float64',
       'num_sats': 'float64',
       'num_tl_30dpd': 'float64',
       'num_tl_90g_dpd_24m': 'float64',
       'num_tl_op_past_12m': 'float64',
       'open_acc': 'float64',
       'open_acc_6m': 'float64',
       'open_act_il': 'float64',
       'open_il_12m': 'float64',
       'open_il_24m': 'float64',
       'open_rv_12m': 'float64',
       'open_rv_24m': 'float64',
       'policy_code': 'float64',
       'pub_rec': 'float64',
       'pub_rec_bankruptcies': 'float64',
       'revol_bal': 'float64',
       'tax_liens': 'float64',
       'tot_coll_amt': 'float64',
       'tot_cur_bal': 'float64',
       'tot_hi_cred_lim': 'float64',
       'total_acc': 'float64',
       'total_bal_ex_mort': 'float64',
       'total_bal_il': 'float64',
       'total_bc_limit': 'float64',
       'total_cu_tl': 'float64',
       'total_il_high_credit_limit': 'float64',
       'total_rev_hi_lim': 'float64'}

to the call to `read_csv`/`read_table`.

  args2 = [_execute_task(a, cache) for a in args]
