Skip to content

Commit

Permalink
perf: minor improvement to dx.data.iris() time to display (#525)
Browse files Browse the repository at this point in the history
Removes 50-100ms by creating the timestamp column directly rather than
using an additional update call on an index column
  • Loading branch information
dsmmcken committed Jun 12, 2024
1 parent df6c9b9 commit 932a550
Showing 1 changed file with 16 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,14 @@ def iris(ticking: bool = True) -> Table:
Notes:
- The ticking feature starts from 1936-01-01T08:00:00UTC and increases
by 1 second for each observation.
- The dataset contains a default of 300 number of samples but can be
set to any size, with 4 original features (sepal length, sepal width,
petal length, and petal width), along with a timestamp, id and species.
- The original Iris species labels are included (setosa, versicolor, and virginica).
- The initial dataset contains 150 samples and includes the 4 original
features (sepal length, sepal width, petal length, and petal width),
along with a timestamp, id and species name.
- The original Iris species names are included (setosa, versicolor, and virginica).
Args:
ticking:
If true, the table will tick using a replayer starting
with a third of the table already ticked. If false the
whole table will be returned as a static table.
If true, the table will tick new data every second.
Returns:
A Deephaven Table
Expand All @@ -74,6 +72,7 @@ def iris(ticking: bool = True) -> Table:
species_list: list[str] = ["setosa", "versicolor", "virginica"]
# Give this dataset a timestamp column based on original year from this data
base_time = to_j_instant("1936-01-01T08:00:00 ET")
pd_base_time = _cast_timestamp(to_pd_timestamp(base_time))

# Load the iris dataset and cast the species column to string
# group it and get the mean and std of each species
Expand All @@ -82,8 +81,9 @@ def iris(ticking: bool = True) -> Table:
species_descriptions = grouped_df.describe()

df_len = len(df)
# add index column using pandas, which is faster than an update() call
df.insert(0, "index", np.ndarray(range(df_len)))

# Add a timestamp column to the DataFrame
df["timestamp"] = pd_base_time + pd.to_timedelta(df.index * SECOND)

# Get a random gaussian value based on the mean and std of the existing
# data, where col is the column name ('sepal_length', etc) and index is the
Expand All @@ -99,15 +99,14 @@ def get_index(species: str) -> int:
return species_list.index(species) + 1

# convert the pandas DataFrame to a Deephaven Table
source_table = to_table(df)
source_table = to_table(df).move_columns_up("timestamp")

if ticking:
ticking_table = (
time_table("PT1S")
.update(
time_table("PT1S").update(
[
# need an index created before the merge, to use it after
"index = ii + df_len",
# make timestamp start after the source table timestamp
"timestamp = base_time + (long)((ii + df_len) * SECOND)",
# pick a random species from the list, using the index as a seed
"species = (String)species_list[(int)new Random(ii).nextInt(3)]",
"sepal_length = get_random_value(`sepal_length`, ii, species)",
Expand All @@ -117,17 +116,12 @@ def get_index(species: str) -> int:
"species_id = get_index(species)",
]
)
# we have our own timestamp column, so drop the one generated by time_table
.drop_columns("Timestamp")
)
t = merge([source_table, ticking_table])
else:
t = source_table
return merge([source_table, ticking_table])

return (
t.update("timestamp = base_time + (long)(index * SECOND)")
.move_columns_up("timestamp")
.drop_columns("index")
)
return source_table


def stocks(ticking: bool = True, hours_of_data: int = 1) -> Table:
Expand Down

0 comments on commit 932a550

Please sign in to comment.