Skip to content

Commit

Permalink
perf: minor improvement to dx.data.iris() time to display (#525)
Browse files Browse the repository at this point in the history
Removes 50-100ms by creating the timestamp column directly rather than
using an additional update call on an index column
  • Loading branch information
dsmmcken committed Jun 12, 2024
1 parent df6c9b9 commit 932a550
Showing 1 changed file with 16 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,14 @@ def iris(ticking: bool = True) -> Table:
Notes:
- The ticking feature starts from 1936-01-01T08:00:00UTC and increases
by 1 second for each observation.
- The dataset contains a default of 300 number of samples but can be
set to any size, with 4 original features (sepal length, sepal width,
petal length, and petal width), along with a timestamp, id and species.
- The original Iris species labels are included (setosa, versicolor, and virginica).
- The initial dataset contains 150 samples and includes the 4 original
features (sepal length, sepal width, petal length, and petal width),
along with a timestamp, id and species name.
- The original Iris species names are included (setosa, versicolor, and virginica).
Args:
ticking:
If true, the table will tick using a replayer starting
with a third of the table already ticked. If false the
whole table will be returned as a static table.
If true, the table will tick new data every second.
Returns:
A Deephaven Table
Expand All @@ -74,6 +72,7 @@ def iris(ticking: bool = True) -> Table:
species_list: list[str] = ["setosa", "versicolor", "virginica"]
# Give this dataset a timestamp column based on original year from this data
base_time = to_j_instant("1936-01-01T08:00:00 ET")
pd_base_time = _cast_timestamp(to_pd_timestamp(base_time))

# Load the iris dataset and cast the species column to string
# group it and get the mean and std of each species
Expand All @@ -82,8 +81,9 @@ def iris(ticking: bool = True) -> Table:
species_descriptions = grouped_df.describe()

df_len = len(df)
# add index column using pandas, which is faster than an update() call
df.insert(0, "index", np.ndarray(range(df_len)))

# Add a timestamp column to the DataFrame
df["timestamp"] = pd_base_time + pd.to_timedelta(df.index * SECOND)

# Get a random gaussian value based on the mean and std of the existing
# data, where col is the column name ('sepal_length', etc) and index is the
Expand All @@ -99,15 +99,14 @@ def get_index(species: str) -> int:
return species_list.index(species) + 1

# convert the pandas DataFrame to a Deephaven Table
source_table = to_table(df)
source_table = to_table(df).move_columns_up("timestamp")

if ticking:
ticking_table = (
time_table("PT1S")
.update(
time_table("PT1S").update(
[
# need an index created before the merge, to use it after
"index = ii + df_len",
# make timestamp start after the source table timestamp
"timestamp = base_time + (long)((ii + df_len) * SECOND)",
# pick a random species from the list, using the index as a seed
"species = (String)species_list[(int)new Random(ii).nextInt(3)]",
"sepal_length = get_random_value(`sepal_length`, ii, species)",
Expand All @@ -117,17 +116,12 @@ def get_index(species: str) -> int:
"species_id = get_index(species)",
]
)
# we have our own timestamp column, so drop the one generated by time_table
.drop_columns("Timestamp")
)
t = merge([source_table, ticking_table])
else:
t = source_table
return merge([source_table, ticking_table])

return (
t.update("timestamp = base_time + (long)(index * SECOND)")
.move_columns_up("timestamp")
.drop_columns("index")
)
return source_table


def stocks(ticking: bool = True, hours_of_data: int = 1) -> Table:
Expand Down

0 comments on commit 932a550

Please sign in to comment.