In [1]:
import cudf
import numpy as np

In [2]:
# Step 1: Create a cuDF DataFrame

data = np.random.randint(0, 100, size=(10**6, 3)) # 1 million rows, 3 columns

df = cudf.DataFrame(data, columns=["A", "B", "C"])


print("First 5 rows of the DataFrame:")

print(df.head())


# Step 2: Filter rows where column 'A' is greater than 50

filtered_df = df[df['A'] > 50]

print("Filtered DataFrame (A > 50), first 5 rows:")

print(filtered_df.head())


# Step 3: Perform a groupby on column 'A' and aggregate column 'B'

grouped_df = df.groupby('A').agg({'B': 'mean'})

print("GroupBy result (mean of B for each value of A), first 5 rows:")

print(grouped_df.head())

First 5 rows of the DataFrame:
    A   B   C
0  54  37  36
1  10   5  39
2  68  91   1
3  37  80  13
4  42  15  19
Filtered DataFrame (A > 50), first 5 rows:
     A   B   C
0   54  37  36
2   68  91   1
6   98  79  34
9   73  37  85
10  64  44  92
GroupBy result (mean of B for each value of A), first 5 rows:
            B
A            
7   49.527626
15  49.551971
38  49.319799
53  48.937910
94  49.541721


In [3]:
import pandas as pd

import cudf

import numpy as np

import time

# Generate a large dataset

data = np.random.randint(0, 100, size=(10**6, 3))


# Step 1: Perform operations using Pandas

df_pd = pd.DataFrame(data, columns=["A", "B", "C"])

start_time = time.time()


# Filter rows where column 'A' is greater than 50

filtered_pd = df_pd[df_pd['A'] > 50]

end_time = time.time()

print(f"Pandas operation took: {end_time - start_time:.4f} seconds")


# Step 2: Perform operations using cuDF

df_cudf = cudf.DataFrame(data, columns=["A", "B", "C"])

start_time = time.time()


# Filter rows where column 'A' is greater than 50

filtered_cudf = df_cudf[df_cudf['A'] > 50]

end_time = time.time()

print(f"cuDF operation took: {end_time - start_time:.4f} seconds")

Pandas operation took: 0.0385 seconds
cuDF operation took: 0.0050 seconds


In [4]:

import pandas as pd

import cudf

import numpy as np
import time


# Step 1: Create two large DataFrames with a common column

data1 = np.random.randint(0, 100000, size=(10**6, 2)) # 1 million rows, 2 columns

data2 = np.random.randint(0, 100000, size=(10**6, 2))


df_pd1 = pd.DataFrame(data1, columns=["key", "value1"])

df_pd2 = pd.DataFrame(data2, columns=["key", "value2"])


df_cudf1 = cudf.DataFrame(data1, columns=["key", "value1"])

df_cudf2 = cudf.DataFrame(data2, columns=["key", "value2"])


# Step 2: Perform the join operation using Pandas

start_time = time.time()

joined_pd = pd.merge(df_pd1, df_pd2, on='key', how='inner')

end_time = time.time()

print(f"Pandas join took: {end_time - start_time:.4f} seconds")


# Step 3: Perform the join operation using cuDF

start_time = time.time()

joined_cudf = df_cudf1.merge(df_cudf2, on='key', how='inner')

end_time = time.time()

print(f"cuDF join took: {end_time - start_time:.4f} seconds")

Pandas join took: 1.0624 seconds
cuDF join took: 0.0592 seconds


In [5]:
# Step 1: Create a large dataset

data = np.random.randint(0, 100, size=(10**6, 3)) # 1 million rows, 3 columns

df_pd = pd.DataFrame(data, columns=["A", "B", "C"])

df_cudf = cudf.DataFrame(data, columns=["A", "B", "C"])


# Step 2: Perform groupby and aggregation using Pandas

start_time = time.time()

grouped_pd = df_pd.groupby('A').agg({'B': 'mean', 'C': 'sum'})

end_time = time.time()

print(f"Pandas groupby took: {end_time - start_time:.4f} seconds")


# Step 3: Perform groupby and aggregation using cuDF

start_time = time.time()

grouped_cudf = df_cudf.groupby('A').agg({'B': 'mean', 'C': 'sum'})

end_time = time.time()

print(f"cuDF groupby took: {end_time - start_time:.4f} seconds")

Pandas groupby took: 0.0768 seconds
cuDF groupby took: 0.0058 seconds


In [6]:
# Step 1: Create a DataFrame with missing values
data = {'A': [1, 2, None, 4, None], 'B': [5, None, 7, None, 9]}

df = cudf.DataFrame(data)


print("Original DataFrame:")

print(df)


# Step 2: Drop rows with missing values

df_dropped = df.dropna()

print("DataFrame after dropping missing values:")

print(df_dropped)


# Step 3: Fill missing values with a default value

df_filled = df.fillna(0)

print("DataFrame after filling missing values:")

print


Original DataFrame:
      A     B
0     1     5
1     2  <NA>
2  <NA>     7
3     4  <NA>
4  <NA>     9
DataFrame after dropping missing values:
   A  B
0  1  5
DataFrame after filling missing values:


<function print>