In [None]:
import os
import glob
import numpy as np
import pandas as pd
import plotly.express as px

In [None]:
# Load the score workbook from the working directory.
orig_df = pd.read_excel(
    "SSTD_score.xlsx",
    index_col=0,  # the first spreadsheet column becomes the row index
)

In [None]:
# Per-stream min-max scaling of the count / score / severity columns.
def _scale_to_unit_range(values):
    """Min-max scale the values of one stream group.

    The small constant added to the denominator keeps it non-zero when all
    values in the group are identical.
    """
    lowest = values.min()
    spread = values.max() - lowest
    return (values - lowest) / (spread + 1e-6)


scaled_cols = ['new_count', 'new_score', 'new_severity',
               'all_count', 'all_score', 'all_severity']
passthrough_cols = ['stream_id', 'fid', 'fname', 'app', 'rank',
                    'all_severity', 'all_count', 'all_score']

df = orig_df.copy()
ndf = (
    df[scaled_cols + ['stream_id']]
    .groupby('stream_id')
    .transform(_scale_to_unit_range)
)
# Only the all_* columns receive a norm_ prefix; the scaled new_* columns
# keep their original names.
ndf = ndf.rename(columns={name: 'norm_' + name
                          for name in ('all_severity', 'all_score', 'all_count')})
# Unscaled identifying / all_* columns side by side with the scaled columns.
df = pd.concat([df[passthrough_cols], ndf], axis=1)

# Number of non-null rank entries per (stream_id, fname), largest first
# within each stream; after count() the 'rank' column holds that number.
rank_counts = df[['stream_id', 'fname', 'rank']].groupby(['stream_id', 'fname']).count()
count_rank = rank_counts.sort_values(['stream_id', 'rank'], ascending=[True, False]).reset_index()

In [None]:
# Scatter of anomalies over (fid, rank), one animation frame per stream_id.
# Marker colour encodes the 'norm_all_<metric>' column.
metric = 'severity'
color_col = 'norm_all_' + metric
axis_pad = 10  # margin added on both sides of the x and y ranges

fig = px.scatter(
    df,
    x='fid',
    y='rank',
    range_x=[df['fid'].min() - axis_pad, df['fid'].max() + axis_pad],
    range_y=[df['rank'].min() - axis_pad, df['rank'].max() + axis_pad],
    # passed as a float array rather than by column name
    color=df[color_col].to_numpy(dtype=float),
    # fixed colour scale spanning the whole frame, shared by every animation frame
    range_color=[df[color_col].min(), df[color_col].max()],
    animation_frame='stream_id',
    hover_name='fname',
    hover_data=['fid', 'rank', 'all_severity', 'all_score', 'all_count'],
    title='Metric: ' + metric,
)
fig.show()

In [None]:
# Keep the 1000 highest-severity rows of every stream.
# Rows are ordered by stream and then by descending all_severity, so
# groupby(...).head(1000) returns each stream's top rows already in that order.
# .copy() yields an independent frame, which avoids a possible
# SettingWithCopyWarning when the new column is added below.
df_top = (
    df.sort_values(['stream_id', 'all_severity'], ascending=[True, False])
      .groupby('stream_id')
      .head(1000)
      .copy()
)
# 0-based position of each row within its stream (0 = highest all_severity).
# cumcount numbers the rows of each group directly, so the result always has
# one value per row; a list built from fid.count() would be too short whenever
# a fid is missing, because count() skips NaN.
df_top['fid_index'] = df_top.groupby('stream_id').cumcount()

In [None]:
# Per-stream view (one animation frame per stream_id) of the top-severity rows:
# x is the row's position within its stream, y the normalized severity,
# and colour the fid.
top_scatter_kwargs = dict(
    x='fid_index',
    y='norm_all_severity',
    color='fid',
    animation_frame='stream_id',
    hover_name='fname',
    hover_data=['fid', 'rank', 'all_severity', 'all_score', 'all_count'],
    title='Metric: ' + metric,
)
fig = px.scatter(df_top, **top_scatter_kwargs)
fig.show()

In [None]:
# Rank counts per function name, one animation frame per stream_id.
#   x axis: fname (tick labels are hidden)
#   y axis: rank_count, i.e. how many rank entries were counted for that fname
count_rank_ = count_rank.rename(columns={'rank': 'rank_count'})
rank_count_col = count_rank_['rank_count']

fig = px.scatter(
    count_rank_,
    x='fname',
    y='rank_count',
    # pad the y range by 10 on both sides
    range_y=[rank_count_col.min() - 10, rank_count_col.max() + 10],
    animation_frame='stream_id',
)
fig.update_xaxes(showticklabels=False)
fig.show()

In [None]:
# Global overview: for each stream and each rank, how many anomalies were reported.
cur = df
# count() tallies the non-null entries of every remaining column per
# (stream_id, rank) group, so the 'fid' column of the result is a row count,
# not a function id.
acc_df = cur.groupby(['stream_id', 'rank']).count()
acc_df = acc_df.reset_index()
# Stream ids are converted to strings before plotting them on the x axis.
acc_df['stream_id'] = acc_df['stream_id'].astype(str)
fig = px.scatter(
    acc_df,
    x='stream_id',
    y='rank',
    color='fid',
    # Relabel the colour bar / hover entry: the column is still named 'fid'
    # but holds the per-group count computed above.
    labels={'fid': 'anomaly count'},
    title='Anomalies reported per stream and rank',
)
fig.show()