In [104]:
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt

from sdk_performance import (
    get_connection, get_sdk_size, last_minor_releases, get_latest_release,
    get_previous, VersionLevel, get_versions
)

In [105]:
connection = get_connection()
sdk_vals = get_sdk_size(connection)
connection.close()
sdk_vals

Unnamed: 0,id,started,version,measurement,value
0,1887,2022-05-27 04:20:00,6.1.1,full,9922493.0
1,1887,2022-05-27 04:20:00,6.1.1,min,4314127.0
2,1888,2022-05-15 23:20:00,6.1.0,full,9844986.0
3,1888,2022-05-15 23:20:00,6.1.0,min,4280428.0
4,1889,2022-05-06 19:20:00,6.0.3,full,9767479.0
...,...,...,...,...,...
163,1968,2020-03-16 08:20:00,1.0.2,min,1584533.0
164,1969,2020-03-07 01:20:00,1.0.1,full,3566919.0
165,1969,2020-03-07 01:20:00,1.0.1,min,1550834.0
166,1970,2020-02-20 21:20:00,1.0.0,full,3489412.0


In [106]:
sdk_vals = sdk_vals.drop(columns='id')
sdk_vals = sdk_vals.set_index(keys='started')
sdk_vals

Unnamed: 0_level_0,version,measurement,value
started,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-27 04:20:00,6.1.1,full,9922493.0
2022-05-27 04:20:00,6.1.1,min,4314127.0
2022-05-15 23:20:00,6.1.0,full,9844986.0
2022-05-15 23:20:00,6.1.0,min,4280428.0
2022-05-06 19:20:00,6.0.3,full,9767479.0
...,...,...,...
2020-03-16 08:20:00,1.0.2,min,1584533.0
2020-03-07 01:20:00,1.0.1,full,3566919.0
2020-03-07 01:20:00,1.0.1,min,1550834.0
2020-02-20 21:20:00,1.0.0,full,3489412.0


In [107]:
# x = pd.DataFrame(columns=["started","version", "measurement", "value"])
# x.set_index(keys="started")
# d = datetime.datetime.now()
x = pd.DataFrame({"started":[datetime.datetime.now()], "version":["123"], 'measurement':["full"], "value": [3.43]}, ).set_index("started")
x.dtypes

version         object
measurement     object
value          float64
dtype: object

In [108]:
x.loc[datetime.datetime.now()]={"version":"123", 'measurement':"min", "value": 1.12}
x

Unnamed: 0_level_0,version,measurement,value
started,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-06-29 18:46:44.871405,123,full,3.43
2022-06-29 18:46:44.884343,123,min,1.12


In [109]:
sdk_vals.loc[(sdk_vals["measurement"] == "full") & (sdk_vals["version"] == "6.1.1")]

Unnamed: 0_level_0,version,measurement,value
started,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-27 04:20:00,6.1.1,full,9922493.0


In [110]:
sdk_vals_no_idx = sdk_vals.reset_index()
base = alt.Chart(sdk_vals_no_idx).encode(
    x="started",
    y="value",
    tooltip="version",
    text="version",
    shape="measurement",
    color="measurement"
)

last_release = sdk_vals_no_idx[sdk_vals_no_idx["version"] == "6.1.1"]
minor_releases = last_minor_releases(sdk_vals_no_idx)

last = alt.Chart(sdk_vals_no_idx[sdk_vals_no_idx["version"].isin(minor_releases)]).encode(
    x="started",
    y="value",
    tooltip="version",
    text="version",
).mark_point(size=55, opacity=1.0).encode(
    shape="measurement",
    color="measurement"

)

line = (alt.Chart(pd.DataFrame({'y': [8000000]})).
        mark_rule(size=1, strokeDash=[4, 4], color="red", opacity=0.5).
        encode(y='y'))

warn_text = alt.Chart(pd.DataFrame({'y': [8000000]})).mark_text(text="too much", align="center", opacity=1, baseline="line-bottom", dx=-300, fontSize=12).encode(y="y")

last_text = last.mark_text(
    align='center',
    baseline='line-bottom',
    dy=-10
).encode(
    text='version'

)
alt.layer(
    base.mark_point(size=10, opacity=0.5),
    base.mark_line(size=1, opacity=0.3),
    line,
    warn_text,
    last,
    last_text,
).properties(
    width=850,
    height=400
).interactive()

In [111]:
versions = get_versions(sdk_vals_no_idx["version"])
r_latest = get_latest_release(versions)
previous_major = get_previous(r_latest, versions, VersionLevel.Major)
previous_minor = get_previous(r_latest, versions, VersionLevel.Minor)
previous_patch = get_previous(r_latest, versions, VersionLevel.Patch)
(r_latest, previous_major, previous_minor, previous_patch)

(Version(major=6, minor=1, patch=1),
 Version(major=5, minor=5, patch=3),
 Version(major=6, minor=0, patch=3),
 Version(major=6, minor=1, patch=0))

In [112]:

def get_val(df, version, measurement):
    return df.loc[(df["measurement"] == measurement) & (df["version"] == version),]


get_val(sdk_vals_no_idx, str(previous_major), "min").reset_index()["value"][0]

4111935.0

In [113]:

df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],
                  index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'),
                  columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'], ['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))
df.style


Model:,Decision Tree,Decision Tree,Regression,Regression,Random,Random
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Tumour (Positive),38.0,2.0,18.0,22.0,21,
Non-Tumour (Negative),19.0,439.0,6.0,452.0,226,232.0


In [114]:
df

Model:,Decision Tree,Decision Tree,Regression,Regression,Random,Random
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour,Tumour,Non-Tumour
Actual Label:,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Tumour (Positive),38.0,2.0,18.0,22.0,21,
Non-Tumour (Negative),19.0,439.0,6.0,452.0,226,232.0


In [115]:
pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'], ['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:'])


MultiIndex([('Decision Tree',     'Tumour'),
            ('Decision Tree', 'Non-Tumour'),
            (   'Regression',     'Tumour'),
            (   'Regression', 'Non-Tumour'),
            (       'Random',     'Tumour'),
            (       'Random', 'Non-Tumour')],
           names=['Model:', 'Predicted:'])

In [116]:
 pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],
              columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'], ['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))


Model:,Decision Tree,Decision Tree,Regression,Regression,Random,Random
Predicted:,Tumour,Non-Tumour,Tumour,Non-Tumour,Tumour,Non-Tumour
0,38.0,2.0,18.0,22.0,21,
1,19.0,439.0,6.0,452.0,226,232.0


In [117]:
df = pd.DataFrame(
    {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]}
)
df


Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,20,50
2,6,30,-30
3,7,40,-50


In [118]:
df2 = df.copy()
df2.loc[df.AAA >= 5, "BBB"] = 33
df2


Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,33,50
2,6,33,-30
3,7,33,-50


In [119]:
df

Unnamed: 0,AAA,BBB,CCC
0,4,10,100
1,5,20,50
2,6,30,-30
3,7,40,-50


In [120]:
df_mask = pd.DataFrame(
    {"AAA": [True] * 4, "BBB": [False] * 4, "CCC": [True, False] * 2}
)
df_mask


Unnamed: 0,AAA,BBB,CCC
0,True,False,True
1,True,False,False
2,True,False,True
3,True,False,False


In [121]:
df3 = df.where(df_mask, np.nan)
df3

Unnamed: 0,AAA,BBB,CCC
0,4,,100.0
1,5,,
2,6,,-30.0
3,7,,


In [122]:
df3["CCC type"] = np.where(np.isnan(df3["CCC"]), "Not a number", "A number")
df3

Unnamed: 0,AAA,BBB,CCC,CCC type
0,4,,100.0,A number
1,5,,,Not a number
2,6,,-30.0,A number
3,7,,,Not a number


In [123]:
data = pd.DataFrame({'a': list('CCCDBDEEAAEED'),
                     'b': [2, 7, 4, 1, 2, 6, 8, 4, 7, 8, 8, 8, 9]})
data


Unnamed: 0,a,b
0,C,2
1,C,7
2,C,4
3,D,1
4,B,2
5,D,6
6,E,8
7,E,4
8,A,7
9,A,8


In [124]:
# chart = alt.Chart(data)
alt.Chart(data).mark_point().encode(x='a')

In [125]:
alt.Chart(data).mark_point().encode(x='b', y='a')

In [126]:

from vega_datasets import data

source = data.wheat()

bars = alt.Chart(source).mark_bar().encode(
    x='wheat:Q',
    y="year:O"
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='wheat:Q'
)

(bars + text).properties(height=900)

In [127]:
d  = datetime.datetime.now()
started = d
version = "1.1.1"
val = 3.423
measurement = "full"
df2 = pd.DataFrame({"started": [started], "version": [version], "measurement": [measurement], "value": [val]}).set_index(["measurement","started"])


In [128]:
df2


Unnamed: 0_level_0,Unnamed: 1_level_0,version,value
measurement,started,Unnamed: 2_level_1,Unnamed: 3_level_1
full,2022-06-29 18:46:45.648961,1.1.1,3.423


In [129]:
df2.loc[("min", started)] = {"version":"1.2.3", "value": 3.11111}
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,version,value,2022-06-29 18:46:45.648961
measurement,started,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
full,2022-06-29 18:46:45.648961,1.1.1,3.423,
min,NaT,,,


In [130]:
df2


Unnamed: 0_level_0,Unnamed: 1_level_0,version,value,2022-06-29 18:46:45.648961
measurement,started,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
full,2022-06-29 18:46:45.648961,1.1.1,3.423,
min,NaT,,,


In [131]:
df2.dtypes

version                        object
value                         float64
2022-06-29 18:46:45.648961     object
dtype: object

In [132]:
df2.index.dtypes


measurement            object
started        datetime64[ns]
dtype: object

In [133]:
from display_data import get_sdk_size

df2 = get_sdk_size()
df2


Unnamed: 0,started,version,measurement,value
0,2020-09-01 12:17:00,0.0.1,full,3688846
1,2020-09-01 12:17:00,0.0.1,min,2049359
2,2020-09-09 16:17:00,0.1.0,full,3778952
3,2020-09-09 16:17:00,0.1.0,min,2099418
4,2020-09-15 08:17:00,0.1.1,full,3881374
...,...,...,...,...
187,2023-07-18 11:17:00,22.0.1,min,4864122
188,2023-08-02 19:17:00,22.1.0,full,8854021
189,2023-08-02 19:17:00,22.1.0,min,4918901
190,2023-08-18 04:17:00,22.1.1,full,8855303


In [134]:
df2.index


RangeIndex(start=0, stop=192, step=1)

In [135]:
df2


Unnamed: 0,started,version,measurement,value
0,2020-09-01 12:17:00,0.0.1,full,3688846
1,2020-09-01 12:17:00,0.0.1,min,2049359
2,2020-09-09 16:17:00,0.1.0,full,3778952
3,2020-09-09 16:17:00,0.1.0,min,2099418
4,2020-09-15 08:17:00,0.1.1,full,3881374
...,...,...,...,...
187,2023-07-18 11:17:00,22.0.1,min,4864122
188,2023-08-02 19:17:00,22.1.0,full,8854021
189,2023-08-02 19:17:00,22.1.0,min,4918901
190,2023-08-18 04:17:00,22.1.1,full,8855303


In [137]:

base = alt.Chart(df2).encode(
    x="started",
    y="value",
    tooltip="version",
    text="version",
    shape="measurement",
    color="measurement"
)


minor_releases = last_minor_releases(df2)

last = alt.Chart(df2[df2["version"].isin(minor_releases)]).encode(
    x="started",
    y="value",
    tooltip="version",
    text="version",
).mark_point(size=55, opacity=1.0).encode(
    shape="measurement",
    color="measurement"

)

line = (alt.Chart(pd.DataFrame({'y': [8000000]})).
        mark_rule(size=1, strokeDash=[4, 4], color="red", opacity=0.5).
        encode(y='y'))

warn_text = alt.Chart(pd.DataFrame({'y': [8000000]})).mark_text(text="too much", align="center", opacity=1, baseline="line-bottom", dx=-350, fontSize=14).encode(y="y")

last_text = last.mark_text(
    align='center',
    baseline='line-bottom',
    dy=-10
).encode(
    text='version'

)
alt.layer(
    base.mark_point(size=10, opacity=0.5),
    base.mark_line(size=1, opacity=0.3),
    line,
    warn_text,
    last,
    last_text,
).properties(
    width=850,
    height=400
).interactive()
