# Generate dummy data for tutorial notebook 2

The second tutorial notebook (Intermediate Ray Core and Large Data) involves reading in parquet files that contain randomly generated x,y points.
This notebook will generate those files.

This notebook should be run on a Large hardware tier, with no need to attach any cluster.
By default, it will generate files in the default dataset of the current project; adjust `dataset_path` below accordingly if the files should be written somewhere else.

In [1]:
# If pyarrow is not already at version 7.0.0, you may need to uncomment and run the line below.
#!pip install pyarrow==7.0.0 --user

In [1]:
import os
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as pds

In [2]:
# Display the installed pyarrow version (this notebook was run with 7.0.0).
pa.__version__

'7.0.0'

In [3]:
# Root directory for every generated file: the local Domino dataset whose name
# matches the current project, read from the DOMINO_PROJECT_NAME environment
# variable (raises KeyError if that variable is not set).
dataset_path = f"/domino/datasets/local/{os.environ['DOMINO_PROJECT_NAME']}"
# Alternative: uncomment to write to a fixed, explicitly named dataset instead.
#dataset_path = f"/domino/datasets/local/Points-For-Pi-Approximation"

In [4]:
def generate_dummy_points(n_rows, name, n_parts = None):
    """Write ``n_rows`` random (x, y) points to parquet under ``dataset_path``.

    The ``x`` and ``y`` columns are each filled by an independent call to
    ``np.random.uniform(size=n_rows)`` (NumPy's default range, [0, 1)).

    Parameters
    ----------
    n_rows : int
        Total number of points (rows) to generate.
    name : str
        Base name of the output, joined onto the module-level ``dataset_path``.
    n_parts : int, optional
        If None (the default), a single file ``<name>.parquet`` is written.
        Otherwise a dataset directory ``<name>`` is written, with the rows
        capped per file so that ``n_parts`` files are enough to hold them all.

    Raises
    ------
    ValueError
        If ``n_parts`` is given but is not a positive integer.
    """
    # Validate before generating data: building the table can be very expensive.
    if n_parts is not None and n_parts <= 0:
        raise ValueError(f"n_parts must be a positive integer, got {n_parts!r}")

    dummy_file_root = os.path.join(dataset_path, name)
    # `column` (not `name`) so the loop variable does not shadow the parameter.
    table = pa.Table.from_pydict(
        {column: np.random.uniform(size=n_rows) for column in ['x', 'y']}
    )

    if n_parts is None:
        pq.write_table(table, dummy_file_root + ".parquet")
        return

    # Ceiling division: with floor division a remainder would spill into an
    # extra file (e.g. 10 rows / 3 parts -> 4 files). Clamp to 1 so the row
    # limits stay positive even when n_parts exceeds n_rows.
    n_per = max(1, -(-n_rows // n_parts))
    pds.write_dataset(
        table,
        dummy_file_root,
        format='parquet',
        max_rows_per_file = n_per,
        # One row group per file; the group limit must not exceed the file limit.
        max_rows_per_group = n_per,
    )

In [6]:
# One million points, written as a single parquet file.
generate_dummy_points(n_rows=1_000_000, name="points_1e6")

In [7]:
!du -sh {dataset_path}/points_1e6.parquet

16M	/domino/datasets/local/ray-tutorial/points_1e6.parquet


In [8]:
# Three million points, written as a single parquet file.
generate_dummy_points(n_rows=3_000_000, name="points_3e6")

In [9]:
!du -sh {dataset_path}/points_3e6.parquet

47M	/domino/datasets/local/ray-tutorial/points_3e6.parquet


In [10]:
# Ten million points, written as a single parquet file.
generate_dummy_points(n_rows=10_000_000, name="points_1e7")

In [11]:
!du -sh {dataset_path}/points_1e7.parquet

154M	/domino/datasets/local/ray-tutorial/points_1e7.parquet


In [12]:
# Thirty million points, written as a single parquet file.
generate_dummy_points(n_rows=30_000_000, name="points_3e7")

In [13]:
!du -sh {dataset_path}/points_3e7.parquet

459M	/domino/datasets/local/ray-tutorial/points_3e7.parquet


In [14]:
# Thirty million points, written as a dataset directory with at most
# 3,000,000 rows per file.
generate_dummy_points(n_rows=30_000_000, name="points_3e7_split10", n_parts=10)

In [15]:
!du -sh {dataset_path}/points_3e7_split10

478M	/domino/datasets/local/ray-tutorial/points_3e7_split10


In [16]:
# One hundred million points, written as a single parquet file.
generate_dummy_points(n_rows=100_000_000, name="points_1e8")

In [17]:
!du -sh {dataset_path}/points_1e8.parquet

1.5G	/domino/datasets/local/ray-tutorial/points_1e8.parquet


In [18]:
# One hundred million points, written as a dataset directory with at most
# 10,000,000 rows per file.
generate_dummy_points(n_rows=100_000_000, name="points_1e8_split10", n_parts=10)

In [19]:
!du -sh {dataset_path}/points_1e8_split10

1.6G	/domino/datasets/local/ray-tutorial/points_1e8_split10


In [5]:
# Three hundred million points, written as a dataset directory with at most
# 10,000,000 rows per file.
generate_dummy_points(n_rows=300_000_000, name="points_3e8_split30", n_parts=30)

In [6]:
!du -sh {dataset_path}/points_3e8_split30

4.7G	/domino/datasets/local/ray-tutorial/points_3e8_split30
