In [2]:
from statistics import mean
from time import perf_counter, time

import cudf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Suffixes of the benchmark CSV files; each one selects a dataset to load.
scale_factors = ["x0_1", "x0_5", "x1", "x5", "x10", "x20"]

# Timing store: one (initially empty) dict per backend, filled by the
# benchmark cells and keyed by scale factor.
results = {backend: {} for backend in ("cpu", "gpu")}

## CPU

In [None]:
# Benchmark pandas on every scale factor: time the CSV load, two summary calls
# and two group-by aggregations, storing the elapsed seconds under
# results["cpu"][scale_factor].
#
# Durations are taken with perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring short intervals, whereas time() follows the system
# wall clock and can jump or be too coarse for the faster operations.
for scale_factor in scale_factors:
    io_timings = {}
    query_timings = {}
    # Insertion order matters: the plotting cells read these dicts positionally.
    results["cpu"][scale_factor] = {"io_opt": io_timings, "queries": query_timings}

    start = perf_counter()
    cpu_df = pd.read_csv(f"../data/raw/online_retail_II_{scale_factor}.csv")
    io_timings["load_csv"] = perf_counter() - start

    # info() prints its report, so this timing includes writing to the output.
    start = perf_counter()
    cpu_df.info()
    io_timings["info"] = perf_counter() - start

    start = perf_counter()
    cpu_df.describe(include="O")
    io_timings["describe"] = perf_counter() - start

    # Q1: total Price per Country.
    start = perf_counter()
    cpu_df.groupby("Country").Price.sum()
    query_timings["q1"] = perf_counter() - start

    # Q2: total Price per Invoice.
    start = perf_counter()
    cpu_df.groupby("Invoice").Price.sum()
    query_timings["q2"] = perf_counter() - start

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      100000 non-null  object 
 1   StockCode    100000 non-null  object 
 2   Description  99130 non-null   object 
 3   Quantity     100000 non-null  int64  
 4   InvoiceDate  100000 non-null  object 
 5   Price        100000 non-null  float64
 6   Customer ID  72861 non-null   float64
 7   Country      100000 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 6.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      500000 non-null  object 
 1   StockCode    500000 non-null  object 
 2   Description  497184 non-null  object 
 3   Quantity     500000 non-null  int64  
 4   InvoiceDate  500000 non-nu

## GPU

In [4]:
# Benchmark cuDF with the same steps as the pandas run (CSV load, two summary
# calls, two group-by aggregations), storing the elapsed seconds under
# results["gpu"][scale_factor].
#
# Durations are taken with perf_counter(): it is a monotonic, high-resolution
# clock meant for measuring short intervals, whereas time() follows the system
# wall clock and can jump or be too coarse for the faster operations.
for scale_factor in scale_factors:
    # "x20" is excluded from the GPU run.
    # NOTE(review): presumably that dataset does not fit in GPU memory — confirm.
    if scale_factor == "x20":
        continue

    io_timings = {}
    query_timings = {}
    # Insertion order matters: the plotting cells read these dicts positionally.
    results["gpu"][scale_factor] = {"io_opt": io_timings, "queries": query_timings}

    start = perf_counter()
    gpu_df = cudf.read_csv(f"../data/raw/online_retail_II_{scale_factor}.csv")
    io_timings["load_csv"] = perf_counter() - start

    # info() prints its report, so this timing includes writing to the output.
    start = perf_counter()
    gpu_df.info()
    io_timings["info"] = perf_counter() - start

    start = perf_counter()
    gpu_df.describe(include="O")
    io_timings["describe"] = perf_counter() - start

    # Q1: total Price per Country.
    start = perf_counter()
    gpu_df.groupby("Country").Price.sum()
    query_timings["q1"] = perf_counter() - start

    # Q2: total Price per Invoice.
    start = perf_counter()
    gpu_df.groupby("Invoice").Price.sum()
    query_timings["q2"] = perf_counter() - start

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Invoice      100000 non-null  object
 1   StockCode    100000 non-null  object
 2   Description  99130 non-null   object
 3   Quantity     100000 non-null  int64
 4   InvoiceDate  100000 non-null  object
 5   Price        100000 non-null  float64
 6   Customer ID  72861 non-null   float64
 7   Country      100000 non-null  object
dtypes: float64(2), int64(1), object(5)
memory usage: 10.9+ MB
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   Invoice      500000 non-null  object
 1   StockCode    500000 non-null  object
 2   Description  497184 non-null  object
 3   Quantity     500000 non-null  int64
 4   InvoiceDate  500000 non-null  object
 5  

TypeError: cuDF does not support iteration of DataFrame via iterrows. Consider using `.to_pandas().iterrows()` if you wish to iterate over each row.

### Join tests

In [5]:
# gpu_df = cudf.read_csv("../data/raw/online_retail_II_x1.csv")
# gpu_df_idx = cudf.read_csv("../data/raw/online_retail_II_indexes.csv")

In [None]:
# gpu_df["StockCode"].values

In [None]:
# start = time()
# ix_col = "StockCode"
# target_cols = ["Invoice", "Price", "Quantity"]
# broadcasted_ix = gpu_df[ix_col].values
# broadcasted_df_ix = gpu_df_idx.set_index(ix_col).reindex(broadcasted_ix)
# join_df = gpu_df.concat([gpu_df, broadcasted_df_ix[target_cols]], axis=1, ignore_index=True)
# end = time()

# print(end-start)

## PLOTS

In [None]:
def plot(sf, cpu_io, gpu_io, cpu_queries, gpu_queries):
    """Show CPU vs GPU timings for one scale factor as two grouped bar charts.

    The left panel compares the I/O-style operations and the right panel the
    queries. Both panels share the same unit (seconds).

    Parameters
    ----------
    sf : str
        Scale-factor name, appended to the figure title.
    cpu_io, gpu_io : sequence of float
        Three timings in seconds, ordered: dataset load, ``.info()``,
        ``.describe()``.
    cpu_queries, gpu_queries : sequence of float
        Two timings in seconds, ordered: Q1, Q2.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    bar_width = 0.35

    def grouped_bars(ax, labels, cpu_values, gpu_values, title):
        # One CPU bar and one GPU bar per label, centred on the tick.
        positions = np.arange(len(labels))
        ax.bar(positions - bar_width / 2, cpu_values, bar_width, label='CPU')
        ax.bar(positions + bar_width / 2, gpu_values, bar_width, label='GPU')
        ax.set_ylabel('seconds')
        ax.set_title(title)
        ax.set_xticks(positions)
        ax.set_xticklabels(labels)
        ax.legend()

    # An explicit figure keeps the panels independent of whatever figure
    # happens to be current in the pyplot state machine.
    fig, (io_ax, query_ax) = plt.subplots(1, 2)

    grouped_bars(
        io_ax,
        ['Load\ndataset', '.info()', '.describe()'],
        cpu_io,
        gpu_io,
        'I/O operations by time',
    )
    grouped_bars(
        query_ax,
        ['Q1', 'Q2'],
        cpu_queries,
        gpu_queries,
        'Queries by time',
    )

    fig.suptitle("Test with SF="+sf)
    # Keep the right panel's y-label from overlapping the left panel.
    fig.tight_layout()
    plt.show()

In [None]:
# One comparison figure per scale factor; "x20" is left out because the GPU
# benchmark cell does not record it.
for scale_factor in scale_factors:
    if scale_factor == "x20":
        continue

    cpu_timings = results["cpu"][scale_factor]
    gpu_timings = results["gpu"][scale_factor]

    # Timings are passed positionally, in the order they were recorded.
    plot(
        scale_factor,
        list(cpu_timings["io_opt"].values()),
        list(gpu_timings["io_opt"].values()),
        list(cpu_timings["queries"].values()),
        list(gpu_timings["queries"].values()),
    )

In [None]:
# Mean query time per scale factor for each backend. "x20" is left out because
# the GPU benchmark cell does not record it.
compared = [sf for sf in scale_factors if sf != "x20"]

cpu = [mean(results["cpu"][sf]["queries"].values()) for sf in compared]
gpu = [mean(results["gpu"][sf]["queries"].values()) for sf in compared]

# Tick labels are derived from the scale-factor names ("x0_1" -> "SF 0.1") so
# they cannot drift out of step with the plotted points.
activity = ["SF " + sf[1:].replace("_", ".") for sf in compared]

fig, ax = plt.subplots()
ax.plot(activity, gpu, label="GPU")
ax.plot(activity, cpu, label="CPU")
ax.set_xlabel("scale factor")
ax.set_ylabel("seconds")
ax.set_title("Mean query time by scale factor")
ax.legend()

plt.show()