In [None]:
from google.cloud import bigquery
import os
import json
import pandas as pd
import numpy as np
from pathlib import Path
import datetime

from importlib import reload

import src.table_stats
reload(src.table_stats)

from src.table_stats import print_stats

# BigQuery client created with default settings (no project or credentials
# are passed explicitly).
client = bigquery.Client()

# Notebook display settings: truncate wide cell contents at 60 characters,
# show up to 200 rows, and route DataFrame.plot() through plotly.
pd.options.display.max_colwidth = 60
pd.options.display.max_rows = 200
pd.set_option("plotting.backend", "plotly")

# Folder for exported data: "<WORKDIR>/data". When the WORKDIR environment
# variable is not set, fall back to the current directory instead of failing
# with a TypeError from Path(None).
DATA_FOLDER = Path(os.getenv("WORKDIR", ".")).joinpath("data")

In [None]:
# Query configuration.

# Run to analyse: only rows whose `run` equals RUN_ID and whose `time` is
# later than REQUEST_TIMESTAMP are fetched by the query cell.
RUN_ID = "test-gce-bigscale-0"
REQUEST_TIMESTAMP = "2025-10-15T15:03:04+00:00"

# Table to read, addressed as `PROJECT_ID.DATASET_ID.TABLE_ID`.
PROJECT_ID = "symphony-dev-2"
DATASET_ID = "log_dataset_default"
TABLE_ID = "logs-2"


In [None]:
# Fetch every log row of the configured run that is newer than the request
# timestamp. The values are interpolated straight from the config constants.
QUERY = f"""
SELECT * from `{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}`
WHERE (
    run = "{RUN_ID}" AND
    time > "{REQUEST_TIMESTAMP}"
)
ORDER BY time DESC
-- LIMIT 1000 
-- Optionally limit the query when dealing with too big datasets...
"""

query_job = client.query(QUERY)
rows = query_job.result()
df = rows.to_dataframe()

# "detail" holds a JSON string (or None); decode it into Python objects.
df["detail"] = df["detail"].apply(
    lambda raw: None if raw is None else json.loads(raw)
)

# Put the rows in ascending time order; "time" ends up as the first column.
df = (
    df.set_index("time")
    .sort_index()
    .reset_index()
)

# Optionally flag, per grr direction, the events whose detail differs from the
# previous event of the same kind. Rows of any other event are left as NaN.
for _direction in ("out", "in"):
    _grr_detail = df.loc[df["event"] == f"cli:grr_{_direction}", "detail"]
    df[f"grr_shift_{_direction}"] = _grr_detail != _grr_detail.shift()

print_stats(df)


In [None]:
# Reference instant for the elapsed-time axis, as a timezone-aware UTC datetime.
# NOTE(review): this literal duplicates the REQUEST_TIMESTAMP of the query
# configuration; keep both identical, otherwise the elapsed-seconds axis is
# shifted relative to the queried window.
REQUEST_TIMESTAMP = "2025-10-15T15:03:04+00:00"
# datetime.timezone.utc is used rather than the datetime.UTC alias, which only
# exists on Python 3.11 and later.
REQUEST_TIMESTAMP = datetime.datetime.fromisoformat(REQUEST_TIMESTAMP).astimezone(
    datetime.timezone.utc
)

# Work on an independent copy of the "gce:insert" events only.
dfc = df.loc[df["event"] == "gce:insert"].copy()

def extract_node_name(row):
    """Store the name found in the event's request under the row's "node" label.

    Args:
        row: pandas Series for one log record. Its "detail" entry is expected
            to be a dict (or None) that may contain a "request" dict.

    Returns:
        The same row, with ``row["node"]`` set to ``detail["request"]["name"]``,
        or to None when detail, "request" or "name" is missing.
    """
    request = (row["detail"] or {}).get("request") or {}
    # Item assignment creates the "node" entry when the row has none. Attribute
    # assignment (row.node = ...) would only set a Python attribute in that
    # case and the value would be lost.
    row["node"] = request.get("name")
    return row

def extract_machine_type(detail):
    """Return the short machine type name from an event's detail.

    Args:
        detail: Parsed detail dict (or None). The machine type is read from
            ``detail["request"]["machineType"]``.

    Returns:
        The last "/"-separated segment of the machine type string, or None
        when detail, "request" or "machineType" is missing.
    """
    request = (detail or {}).get("request") or {}
    machine_type = request.get("machineType")
    if machine_type is None:
        return None
    return machine_type.split("/")[-1]

# Attach the node name and the short machine type to every insert event.
dfc = dfc.apply(extract_node_name, axis=1)
dfc["machine_type"] = dfc["detail"].apply(extract_machine_type)

# The core count is the integer after the last "-" of the machine type name
# (e.g. "n2-standard-4" -> 4). extract_machine_type() can return None, and a
# name may have no numeric suffix; both become NaN instead of raising.
dfc["cores"] = pd.to_numeric(
    dfc["machine_type"].str.rsplit("-", n=1).str[-1],
    errors="coerce",
)

# One row per node, keeping the last recorded insert time and core count.
dfc = dfc.pivot_table(
    index="node",
    values=["time", "cores"],
    aggfunc="last",
)

# Snapshot of the per-node table, kept for the CSV export further down.
raw_df = dfc.copy(deep=True)

# Re-index by seconds elapsed since REQUEST_TIMESTAMP, in chronological order.
dfc = dfc.set_index("time").sort_index()
dfc.index = (dfc.index - REQUEST_TIMESTAMP).total_seconds()

# Running totals over time. Nodes with an unknown core count add 0 so the
# cumulative series stays defined.
dfc["total_cores"] = dfc["cores"].fillna(0).cumsum()
dfc = dfc.drop(columns=["cores"])
dfc["machines"] = range(1, len(dfc.index) + 1)


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Plot data: the cumulative table, prefixed with an artificial (0, 0) row so
# both curves start at the origin, with the column names used in the figure.
# The index already holds seconds elapsed since REQUEST_TIMESTAMP, so it is
# only given a name here.
origin_row = pd.DataFrame([[0, 0]], index=[0], columns=dfc.columns)

parsed_data = (
    pd.concat([origin_row, dfc])
    .rename(
        columns={
            "machines": "GCENumberOfNodes",
            "total_cores": "GCENumberOfCores",
        }
    )
    .rename_axis("TimeAfterSymphonyRequest")
)

# Single plot with two y-axes: VM count on the primary axis, core count on the
# secondary one, both against the elapsed-time index.
fig = make_subplots(specs=[[{"secondary_y": True}]])

elapsed_seconds = parsed_data.index

vm_trace = go.Scatter(
    x=elapsed_seconds,
    y=parsed_data["GCENumberOfNodes"],
    mode="lines",
    name="GCE - Number of VMs",
)
core_trace = go.Scatter(
    x=elapsed_seconds,
    y=parsed_data["GCENumberOfCores"],
    mode="lines",
    name="GCE - Number of cores",
)
fig.add_trace(vm_trace)
fig.add_trace(core_trace, secondary_y=True)

fig.update_layout(
    title="Scaling performance of IBM Spectrum Symphony connector for GCE",
    plot_bgcolor="white",
    legend={"x": 0.005, "y": 0.95, "bordercolor": "black", "borderwidth": 1},
    # Alternative for an absolute-timestamp x-axis:
    # xaxis_range=[
    #     REQUEST_TIMESTAMP,
    #     REQUEST_TIMESTAMP + datetime.timedelta(minutes=10)
    # ],
    xaxis_range=[0, 150],
)

# Labelled ticks every 30 seconds (30, 60, 90, 120).
tick_seconds = list(range(30, 150, 30))
fig.update_xaxes(
    mirror=True,
    ticks="outside",
    showline=True,
    linecolor="black",
    gridcolor="lightgrey",
    title_text="Time after Symphony HostFactory GCE plugin request",
    tickvals=tick_seconds,
    ticktext=[f"{seconds} seconds" for seconds in tick_seconds],
    # tickangle=45
)

boxed_axis = {"mirror": True, "ticks": "outside"}

# No axis selector here, so this styling is applied to every y-axis...
fig.update_yaxes(
    **boxed_axis,
    showline=True,
    linecolor="black",
    gridcolor="lightgrey",
    tickcolor="lightgrey",
    zerolinecolor="lightgrey",
    title_text="Number of VMs",
)

# ...and the secondary axis is then overridden: no line, no grid, own title.
fig.update_yaxes(
    **boxed_axis,
    showline=False,
    showgrid=False,
    linecolor=None,
    gridcolor=None,
    title_text="Number of Cores",
    secondary_y=True,
)

fig.update_traces(connectgaps=True)

fig.show()



In [None]:
# Export folder for this run.
# NOTE(review): this absolute path replaces the WORKDIR-based DATA_FOLDER
# defined in the setup cell — confirm that is intended.
DATA_FOLDER = Path(f"/home/user/data-{RUN_ID}")
# Create the folder up front; every write below fails if it does not exist.
DATA_FOLDER.mkdir(parents=True, exist_ok=True)

# Static renderings of the figure, one file per format.
for image_format in ("png", "svg"):
    fig.write_image(
        file=DATA_FOLDER.joinpath(f"{RUN_ID}.{image_format}"),
        format=image_format,
        width=800,
        height=500,
    )

# Save the full query result (the raw log rows).
df.to_parquet(DATA_FOLDER.joinpath(f"{RUN_ID}-raw.parquet"))

# Save the per-node table (last insert time and core count per node).
raw_df.to_csv(DATA_FOLDER.joinpath(f"{RUN_ID}-indexed.csv"))

# Save the data exactly as plotted.
parsed_data.to_csv(DATA_FOLDER.joinpath(f"{RUN_ID}-parsed.csv"))

# Save the interactive plot itself. The encoding is set explicitly so the
# output does not depend on the platform's default locale.
with open(DATA_FOLDER.joinpath(f"{RUN_ID}.html"), "w", encoding="utf-8") as fh:
    fig.write_html(fh)