## Get resources

In [None]:
import coiled
import dask.distributed

In [None]:
# Request a 20-worker Coiled cluster built from the "coiled/default" configuration.
cluster = coiled.Cluster(configuration="coiled/default", n_workers=20)

# Attach a Dask client to that cluster; ending the cell with the bare name
# displays the client as the cell's output.
client = dask.distributed.Client(cluster)
client

## Get and pre-process data

In [None]:
import dask.dataframe as dd
from dask_ml.model_selection import train_test_split

# Read every CSV matching the 2019 yellow-taxi glob into one Dask DataFrame,
# split into 16 MiB blocks. The two timestamp columns are parsed as dates,
# small integer codes use pandas' nullable unsigned dtypes, and the
# store-and-forward flag is read as a categorical.
df = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv",
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    dtype={
        "VendorID": "UInt8",
        "passenger_count": "UInt8",
        "RatecodeID": "UInt8",
        "store_and_fwd_flag": "category",
        "PULocationID": "UInt16",
        "DOLocationID": "UInt16",
        "payment_type": "UInt8",
    },
    blocksize="16 MiB",
)

# Feature matrix: five columns, with missing values replaced by 0.
data = df[["passenger_count", "trip_distance", "RatecodeID", "payment_type", "fare_amount"]]
data = data.fillna(0)

# Binary target: True when the tip is more than 25% of the fare.
# Any missing labels are treated as False.
labels = (df.tip_amount / df.fare_amount) > 0.25
labels = labels.fillna(False)

# Seed the split like every other stochastic step in this notebook, so that
# Restart & Run All yields the same train/test partition (and therefore
# comparable scores) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, shuffle=True, random_state=0
)

# Persist the four collections so that the following cells reuse the computed
# partitions rather than rebuilding them from the CSV files each time.
X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)

## Train model

In [None]:
from dask_ml.model_selection import HyperbandSearchCV
from scipy.stats import loguniform, uniform
from sklearn.linear_model import SGDClassifier

# Base estimator: an elastic-net-penalised SGD classifier with a fixed seed.
clf = SGDClassifier(penalty="elasticnet", tol=1e-3, random_state=0)

# Distributions the search draws hyper-parameter candidates from.
params = dict(
    alpha=loguniform(1e-2, 1e0),  # or np.logspace
    l1_ratio=uniform(0, 1),  # or np.linspace
)

# Hyperband search over `params`, seeded and capped at max_iter=81.
search = HyperbandSearchCV(clf, params, max_iter=81, random_state=0)

# Both class labels are passed explicitly via `classes`; the trailing
# semicolon suppresses the cell's output.
search.fit(X_train, y_train, classes=[0, 1]);

## Score

In [None]:
# Score the fitted search on a 10% sample of the held-out split.
# NOTE(review): features and labels are sampled separately with the same
# fraction and seed, presumably so the rows stay paired. Confirm this holds.
X_holdout_sample = X_test.sample(frac=0.1, random_state=123)
y_holdout_sample = y_test.sample(frac=0.1, random_state=123)
search.score(X_holdout_sample, y_holdout_sample)

## What if we just sampled instead?

In [None]:
# Take a 1% sample of the training split (same fraction and seed for features
# and labels), compute each into a local object, and fit the plain `clf` on it.
X_train_small = X_train.sample(frac=0.01, random_state=123).compute()
y_train_small = y_train.sample(frac=0.01, random_state=123).compute()
clf.fit(X_train_small, y_train_small)

In [None]:
# Score `clf` on a 1% sample of the held-out split, computed into local
# objects first (same fraction and seed for features and labels).
X_test_small = X_test.sample(frac=0.01, random_state=123).compute()
y_test_small = y_test.sample(frac=0.01, random_state=123).compute()
clf.score(X_test_small, y_test_small)

Depending on our business needs, we may not have needed to do all of this :)