In [None]:
from dask_cuda import LocalCUDACluster

In [None]:
import cudf

In [None]:
%%timeit -n 1 -r 1
train_features = cudf.read_csv('DB/new_data/training_features.csv')
test_features=cudf.read_csv('DB/new_data/testing_features.csv')
train_labels=cudf.read_csv('DB/new_data/training_labels.csv')
test_labels=cudf.read_csv('DB/new_data/testing_labels.csv')

In [None]:
import numpy as np
from cuml.test.utils import get_handle
from cuml.ensemble import RandomForestRegressor as curfc
from cuml.test.utils import get_handle

# Minimal smoke test of cuML's random-forest regressor on a four-sample toy
# problem: two float32 features per row and one float32 target per row.
X = np.asarray([[0, 10], [0, 20], [0, 30], [0, 40]], dtype=np.float32)
y = np.asarray([0.0, 1.0, 2.0, 3.0], dtype=np.float32)

cuml_model = curfc(max_features=1.0, n_bins=2,
                   split_algo=0, min_samples_leaf=1,
                   min_samples_split=2,
                   n_estimators=40, accuracy_metric='r2')

# Fit and score on the same data; this only checks that the estimator runs,
# it is not a generalisation estimate.
cuml_model.fit(X, y)
cuml_score = cuml_model.score(X, y)

# The model was configured with accuracy_metric='r2', so the value reported by
# score() is an R^2 value, not a mean squared error. Label it accordingly.
print("R2 score of cuml : ", cuml_score)

In [None]:
#https://qiita.com/shin_ishiguro/items/8f39aac45acc8363a42e

import cudf
import cuml
import datetime as dt
import pandas as pd

# Read the same trip-record CSV twice: once into GPU memory (cudf) and once
# into host memory (pandas). The pandas copy is used below to parse timestamps.
file_path = 'yellow_tripdata_2019-01.csv'
cdf = cudf.read_csv(file_path)  # load the CSV into a cudf DataFrame
df = pd.read_csv(file_path)     # load the CSV into a pandas DataFrame

# Remove the raw timestamp columns from the cudf frame, then append datetime
# columns parsed by pandas from the host-side copy of the same columns.
cdf = (
    cdf
    .drop(columns='tpep_pickup_datetime')
    .drop(columns='tpep_dropoff_datetime')
)
cdf['pickup_date'] = pd.to_datetime(df['tpep_pickup_datetime'])
cdf['dropoff_date'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Display (rows, columns) as the cell output.
cdf.shape

In [None]:
# Preview the first ten rows of the cudf frame.
cdf.head(n=10)

In [None]:
# Keep only trips whose pickup falls in the window [2019-01-01, 2019-01-05).
search_date = dt.datetime.strptime('2019-01-01', '%Y-%m-%d')
cdf = cdf.query('pickup_date >= @search_date')
search_date = dt.datetime.strptime('2019-01-05', '%Y-%m-%d')
cdf = cdf.query('pickup_date < @search_date')
# Original author's note: reset_index(inplace=True) reportedly became usable in
# RAPIDS 0.11 and was not usable in 0.10, hence the reassignment form.
cdf = cdf.reset_index(drop=True)

# Calendar features derived from the pickup and drop-off timestamps.
cdf['pickup_date_day'] = cdf.pickup_date.dt.day
cdf['pickup_date_weekday'] = cdf.pickup_date.dt.weekday
cdf['pickup_date_hour'] = cdf.pickup_date.dt.hour
cdf['dropoff_date_day'] = cdf.dropoff_date.dt.day
cdf['dropoff_date_weekday'] = cdf.dropoff_date.dt.weekday
cdf['dropoff_date_hour'] = cdf.dropoff_date.dt.hour

# Like scikit-learn, cuML offers preprocessing such as label-encoding a
# categorical column; here the flag column is replaced by its encoded labels.
le = cuml.preprocessing.LabelEncoder()
cdf['store_and_fwd_flag'] = le.fit_transform(cdf.store_and_fwd_flag)

# Assign the filled column back explicitly. Calling fillna(..., inplace=True)
# on the Series returned by attribute access mutates a temporary object and is
# not guaranteed to update the parent frame.
cdf['congestion_surcharge'] = cdf['congestion_surcharge'].fillna(0)

# Categorical columns that are expanded into one-hot indicator columns.
_columns = ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID',
            'DOLocationID', 'payment_type',
            'pickup_date_day', 'pickup_date_weekday', 'pickup_date_hour',
            'dropoff_date_day', 'dropoff_date_weekday', 'dropoff_date_hour']

# Use the public cudf.get_dummies entry point rather than reaching into the
# internal cudf.core.reshape module.
cdf = cudf.get_dummies(cdf, columns=_columns)

# Drop any of the source categorical columns that are still present after
# encoding (the membership check makes this a no-op when they are already gone).
_leftover = [c for c in _columns if c in cdf.columns]
if _leftover:
    cdf = cdf.drop(columns=_leftover)

# Replace every remaining null with 0; rebinding avoids in-place mutation.
cdf = cdf.fillna(0)

In [None]:
import dask
import dask_cudf
import dask_xgboost

from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

import subprocess

# Discover this host's IP address so the Dask scheduler binds to it.
cmd = "hostname --all-ip-addresses"
# check_output raises CalledProcessError on a non-zero exit status instead of
# silently continuing with whatever happened to be written to stdout.
output = subprocess.check_output(cmd.split())
_addresses = output.decode().split()
if not _addresses:
    # Fail with an explicit message rather than an IndexError on `[0]`.
    raise RuntimeError(
        "'hostname --all-ip-addresses' returned no addresses; "
        "cannot choose an IP for the Dask CUDA cluster"
    )
IPADDR = _addresses[0]  # use the first address reported

# Start a local Dask CUDA cluster bound to that address and connect to it.
cluster = LocalCUDACluster(ip=IPADDR)
client = Client(cluster)
client

In [None]:
# Number of partitions each Dask-cuDF frame is split into.
_npartitions = 8
# Rows picked up before this date go to training; the remainder go to testing.
search_date = dt.datetime.strptime('2019-01-03', '%Y-%m-%d')

# Original author's note: in RAPIDS 0.10, dropping columns on a Dask dataframe
# raises an error because of a bug around `inplace`. All preprocessing is
# therefore finished on the cudf DataFrame, even though this leaves less
# memory available for the preprocessing step.
cdf = cdf.drop(columns='dropoff_date')
cdf_train = cdf.query('pickup_date < @search_date').drop(columns='pickup_date')
cdf_test = cdf.query('pickup_date >= @search_date').drop(columns='pickup_date')

# Distribute both splits as Dask-cuDF frames.
ddf_train = dask_cudf.from_cudf(cdf_train, npartitions=_npartitions)
ddf_test = dask_cudf.from_cudf(cdf_test, npartitions=_npartitions)

# 'tip_amount' is the target column; every other column is used as a feature.
_train_feature_cols = ddf_train.columns.difference(['tip_amount'])
_test_feature_cols = ddf_test.columns.difference(['tip_amount'])

x_train = ddf_train[_train_feature_cols]
y_train = ddf_train[['tip_amount']]
x_test = ddf_test[_test_feature_cols]
y_test = ddf_test[['tip_amount']]

# Show the first training targets as the cell output.
y_train.head(20)

In [None]:
# Training configuration handed to dask_xgboost. Notes translated from the
# original author's comments.
params = {
    # Number of boosting rounds; more rounds fit the training set more closely.
    'num_rounds': 100,
    'max_depth': 8,
    'max_leaves': 2 ** 8,
    # Each GPU is driven by exactly one process, so n_gpus must stay fixed at 1.
    # The multi-GPU process layout is configured on the Dask side, so training
    # still runs across several GPUs.
    'n_gpus': 1,
    'tree_method': 'gpu_hist',
    'objective': 'reg:squarederror',
    'grow_policy': 'lossguide',
}

# The round count is read back out of `params` and passed as num_boost_round.
_boost_rounds = params['num_rounds']
bst = dask_xgboost.train(
    client,
    params,
    x_train,
    y_train,
    num_boost_round=_boost_rounds,
)



In [None]:
# Predict tips for the held-out split with the trained booster.
pred = dask_xgboost.predict(client, bst, x_test)

# Original author's note: wrapping the prediction in dask.dataframe.multi.concat
# converts it from a [dask_cudf.Series] into a dask_cudf.DataFrame.
test = dask.dataframe.multi.concat([pred], axis=1)

# Per-row squared difference between the prediction (column 0) and the target.
_residual = test[0] - y_test['tip_amount']
test['squared_error'] = _residual ** 2

# Root of the mean squared error, materialised with compute().
_mean_squared_error = test.squared_error.mean().compute()
rmse = np.sqrt(_mean_squared_error)
print('rmse value:', rmse)

In [None]:
import pandas as pd

# Read the header-less T-drive text file and name its four columns.
csv = 'T-drive/tdrive.txt'
data = pd.read_csv(csv, header=None)
data.columns = ['id', 'time', 'lon', 'lat']

# Keep only the rows whose id is one of 1..99.
_wanted_ids = range(1, 100)
_id_mask = data['id'].isin(_wanted_ids)
data0 = data[_id_mask]

In [None]:
%%timeit
import numpy as np
from scipy.spatial.distance import directed_hausdorff

# CPU baseline: directed Hausdorff distance for every ordered pair of ids in
# 1..99, computed one pair at a time with SciPy. Only the last result is kept.
for src_id in range(1, 100):
    src_rows = data0[data0['id'] == src_id]
    point0 = np.array(src_rows[['lon', 'lat']])
    for dst_id in range(1, 100):
        dst_rows = data0[data0['id'] == dst_id]
        point1 = np.array(dst_rows[['lon', 'lat']])
        sklearn_distance = directed_hausdorff(point0, point1)

In [None]:
%%timeit
import numpy as np
from concurrent import futures
from scipy.spatial.distance import directed_hausdorff

# Multi-process CPU variant: submit one directed-Hausdorff task per ordered pair
# of ids in 1..99 to a pool of worker processes, then collect every result.
future_list = []
with futures.ProcessPoolExecutor(max_workers=16) as executor:
    for i in range(1, 100):
        point0 = np.array(data0[data0['id'] == i][['lon', 'lat']])
        for j in range(1, 100):
            point1 = np.array(data0[data0['id'] == j][['lon', 'lat']])
            # Pass the callable and both point arrays positionally.
            # directed_hausdorff's parameters are (u, v, seed), so forwarding
            # keywords named p0/p1 makes every task fail with a TypeError, and
            # submit() does not accept its callable as the keyword `fn` on
            # newer Python versions.
            future = executor.submit(directed_hausdorff, point0, point1)
            future_list.append(future)
    # Retrieve every result. Future.result() blocks until the task is done and
    # re-raises any exception from the worker, so failures are not silently
    # discarded (an un-iterated as_completed() generator does nothing).
    hausdorff_results = [future.result() for future in future_list]

In [None]:
%%timeit
import numpy as np
import pandas as pd
import cuspatial
from cudf import Series

# GPU variant: convert the pandas/numpy data to cudf Series and let cuSpatial
# compute the directed Hausdorff distances.
# Per-id row count, taken from the first column of the grouped counts.
# NOTE(review): presumably the rows of data0 must already be grouped by id in
# the same order as these counts -- confirm against the cuSpatial API.
_rows_per_id = data0.groupby('id').count().iloc[:, 0]
cnt = Series(_rows_per_id)
lon = Series(data0['lon'])
lat = Series(data0['lat'])
distance = cuspatial.directed_hausdorff_distance(lon, lat, cnt)