## Setting up visualization portion of project

In [1]:
# visualization tools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool

In [2]:
# data tools
import datetime as dt
import numpy as np
from random import sample
from random import choice
from scipy import stats

In [3]:
# access to .csv job files
from os import listdir

In [4]:
# gather the path of every 'comet*' job file in the train and test folders
train_files = [ f"../jobs/train/{name}" for name in listdir('../jobs/train/') if name.startswith('comet') ]
test_files = [ f"../jobs/test/{name}" for name in listdir('../jobs/test/') if name.startswith('comet') ]

# one combined list: training files first, then test files
all_files = train_files + test_files

In [5]:
len(all_files)

10623

### Parse for trends
**Notes:**
    * Certain categories have been pre-selected for further analysis:
        * intel_hsw: Intel Haswell Processor (HSW) (per core)
        * intel_rapl: Running average power limit
        * ib: Infiniband usage
        * ib_sw: InfiniBand usage
        * ib_ext: Infiniband usage
        * llite: Lustre filesystem usage (per mount)
        * lnet: Lustre network usage
        * mdc: Lustre network usage
        * osc: Lustre filesystem usage
        * block: block device statistics (per device)
        * cpu: scheduler accounting (per CPU)
        * mem: memory usage (per socket)
        * net: network device usage (per device)
        * nfs: NFS system usage
        * proc: Process specific data (MaxRSS, executable name etc.)
        * sysv_shm: SysV shared memory segment usage
        * tmpfs: ram-backed filesystem usage (per mount)
        * vm: virtual memory statistics
    * Extreme cases will also be evaluated

In [6]:
# all_jobs collects the per-job DataFrames; norm_jobs collects what clean() returns for them
all_jobs, norm_jobs = [], []

In [7]:
# Master list of statistic categories: short key -> display name.
# The display names are compared against DataFrame column labels by the helper
# functions below, so every string (including any odd spelling) must stay exactly as is.
cols = {
    "amd64_pmc": "AMD Opteron performance counters (per core)",
    "intel_hsw": "Intel Haswell Processor (HSW) (per core)",
    "intel_hsw_ht": "Intel Haswell Processor - Hyper-threaded (per logical core)",
    "intel_nhm": "Intel Nehalem Processor (NHM) (per core)",
    "intel_uncore": "Westmere Uncore (WTM) (per socket)",
    "intel_snb": "Intel Sandy Brige (SNB) or Ivy Bridge (IVB) Processor (per core)",
    "intel_rapl": "Running average power limit",
    "intel_hsw_cbo": "Caching Agent (CBo) for SNB (HSW) (per socket)",
    "intel_hsw_pcu": "Power Control Unit for SNB (HSW) (per socket)",
    "intel_hsw_imc": "Integrated Memory Controller for SNB (HSW) (per socket)",
    "intel_hsw_qpi": "QPI Link Layer for SNB (HSW) (per socket)",
    "intel_hsw_hau": "Home Agent Unit for SNB (HSW) (per socket)",
    "intel_hsw_r2pci": "Ring to PCIe Agent for SNB (HSW) (per socket)",
    "ib": "Infiniband usage (default)",
    "ib_sw": "InfiniBand usage (sw)",
    "ib_ext": "Infiniband usage (ext)",
    "llite": "Lustre filesystem usage (per mount)",
    "lnet": "Lustre network usage (lnet)",
    "mdc": "Lustre network usage (mdc)",
    "mic": "MIC scheduler account (per hardware thread)",
    "osc": "Lustre filesystem usage (osc)",
    "block": "Block device statistics (per device)",
    "cpu": "Scheduler accounting (per CPU)",
    "mem": "Memory usage (per socket)",
    "net": "Network device usage (per device)",
    "nfs": "NFS system usage",
    "numa": "NUMA statistics (per socket)",
    "proc": "Process specific data (MaxRSS, executable name etc.)",
    "ps": "Process statistics",
    "sysv_shm": "SysV shared memory segment usage",
    "tmpfs": "Ram-backed filesystem usage (per mount)",
    #"vfs": "Dentry_file_inode cache usage",
    "vm": "Virtual memory statistics",
}

In [8]:
# Check if this program is missing any columns from the master list that are in the data
def missing_cols ( ):
    """Return the column labels present in the loaded jobs but absent from the master list.

    Walks every DataFrame in ``all_jobs`` and collects, in first-seen order and
    without repeats, each column label that is not one of the values of ``cols``.
    """
    known = cols.values()
    extras = []

    for job in all_jobs:
        for label in job.columns.values:
            if label in known or label in extras:
                continue
            extras.append(label)

    return extras

In [9]:
# Check if there are any columns in the master list that are not in the data
def unused_cols ( ):
    """Return the master-list display names that appear in none of the loaded jobs.

    A name from ``cols.values()`` is reported when no DataFrame in ``all_jobs``
    has a column with that label; the result keeps the order of ``cols``.
    """
    return [
        name
        for name in cols.values()
        if not any( name in job for job in all_jobs )
    ]

In [21]:
# Check if there are any columns with values that don't change throughout jobs
def static_cols ( ):
    """Return the master-list names whose column holds a single distinct value in some job.

    A name is reported as soon as at least one DataFrame in ``all_jobs`` contains
    that column with exactly one unique value; it does not have to be constant in
    every job. Names are returned once each, in the order of ``cols``.
    """
    constant = []

    for name in cols.values():
        if name in constant:
            continue
        if any( name in job and job[name].nunique() == 1 for job in all_jobs ):
            constant.append(name)

    return constant

In [11]:
# normalize all data values in DataFrame
def clean ( df, columns=None ):
    """Return a copy of `df` with each tracked numeric column scaled by its largest magnitude.

    The previous version referenced an undefined name (`title`) and a function
    that was never imported (`normalize`); the bare `except` hid both errors, so
    no column was ever normalized. This version performs the same "divide each
    column by its max absolute value" scaling with pandas only.

    Parameters
    ----------
    df : pandas.DataFrame
        Job data. It is not modified, so a caller can keep the raw frame and
        the normalized frame side by side.
    columns : iterable of str, optional
        Column labels to normalize. Defaults to the display names in ``cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame. Listed columns that are missing or non-numeric are left
        as they are, as are columns whose largest magnitude is 0 or NaN.
    """
    wanted = cols.values() if columns is None else columns
    scaled = df.copy()

    for title in wanted:
        if title not in scaled or not pd.api.types.is_numeric_dtype( scaled[title] ):
            continue

        peak = scaled[title].abs().max()
        # an all-zero or all-NaN column has nothing to scale by; leave it untouched
        if pd.isna( peak ) or peak == 0:
            continue

        scaled[title] = scaled[title] / peak

    return scaled

In [12]:
# extreme values: the jobs with the most rows and with the most columns seen so far
most_cycles = pd.DataFrame()
most_stats = pd.DataFrame()

for path in all_files:
    # `columns=` replaces the positional axis argument (`drop("Cycle", 1)`),
    # which newer pandas releases no longer accept
    df = pd.read_csv( path ).drop( columns="Cycle" )

    # make sure job ran for at least one hour
    # (more than 6 rows; presumably one row per 10-minute cycle - TODO confirm)
    if df.shape[0] > 6:
        all_jobs.append( df )
        norm_jobs.append( clean(df) )

        # find job with most cycles of collected stats
        if df.shape[0] > most_cycles.shape[0]:
            most_cycles = df
        # find job with most types of stats
        if df.shape[1] > most_stats.shape[1]:
            most_stats = df

In [13]:
missing_cols()

['Dentry/file/inode cache usage', 'intel_rapl']

In [14]:
unused_cols()

['AMD Opteron performance counters (per core)',
 'Intel Haswell Processor - Hyper-threaded (per logical core)',
 'Intel Nehalem Processor (NHM) (per core)',
 'Westmere Uncore (WTM) (per socket)',
 'Intel Sandy Brige (SNB) or Ivy Bridge (IVB) Processor (per core)',
 'MIC scheduler account (per hardware thread)']

In [22]:
static_cols()

['Power Control Unit for SNB (HSW) (per socket)',
 'QPI Link Layer for SNB (HSW) (per socket)',
 'Infiniband usage (default)',
 'Lustre filesystem usage (per mount)',
 'Block device statistics (per device)',
 'NFS system usage',
 'Process specific data (MaxRSS, executable name etc.)',
 'SysV shared memory segment usage',
 'Ram-backed filesystem usage (per mount)']

In [16]:
len(all_jobs)

6104

In [17]:
most_cycles.shape

(526, 23)

In [18]:
most_stats.shape

(7, 26)

### Minor Data Cleaning

In [None]:
def ordinalize ( x ):
    """Return the English ordinal string for `x`, e.g. 1 -> "1st", 12 -> "12th".

    `x` is converted with ``int()`` and its sign is dropped, so "3", 3.0 and -3
    all give "3rd".

    Fixes two defects of the earlier version: numbers ending in 0 (0, 10, 20,
    100, ...) came back as a bare "th" without the number, and 11, 12 and 13
    were rendered as "11st", "12nd" and "13rd".
    """
    x = abs(int(x))

    # 11, 12 and 13 (and 111, 212, ...) take "th" despite ending in 1, 2 or 3
    if 11 <= x % 100 <= 13:
        suffix = "th"
    else:
        suffix = { 1: "st", 2: "nd", 3: "rd" }.get(x % 10, "th")

    return f"{x}{suffix}"

In [None]:
def cycl_mean_all ( ):
    """Mean number of rows per DataFrame in ``all_jobs``, truncated to an int."""
    row_counts = [ job.shape[0] for job in all_jobs ]
    return int( np.mean( row_counts ) )
    
def cycl_mean_spec ( col ):
    """Mean number of rows, truncated to an int, over the jobs in ``all_jobs`` that contain `col`."""
    row_counts = [ job.shape[0] for job in all_jobs if col in job ]
    return int( np.mean( row_counts ) )

In [None]:
# find the most frequent number of cycles for all jobs
def mode_all ( jobs=None ):
    """Return the most frequent row count among `jobs` as an int.

    Parameters
    ----------
    jobs : list of pandas.DataFrame, optional
        Frames to inspect. Defaults to ``all_jobs``.

    Raises
    ------
    ValueError
        If there are no jobs to inspect.
    """
    if jobs is None:
        jobs = all_jobs

    lengths = [ df.shape[0] for df in jobs ]
    if not lengths:
        raise ValueError( "mode_all: no jobs to take the mode of" )

    # stats.mode gives a length-1 array on older SciPy and a scalar on newer
    # releases; np.ravel accepts both, so the indexing works either way
    return int( np.ravel( stats.mode( lengths )[0] )[0] )

# find the most frequent number of cycles for a specific set
def mode_spec ( col, jobs=None ):
    """Return the most frequent row count among the jobs that contain column `col`.

    Parameters
    ----------
    col : str
        Column label a job must contain to be counted.
    jobs : list of pandas.DataFrame, optional
        Frames to inspect. Defaults to ``all_jobs``.

    Raises
    ------
    ValueError
        If no job contains `col`.
    """
    if jobs is None:
        jobs = all_jobs

    lengths = [ df.shape[0] for df in jobs if col in df ]
    if not lengths:
        raise ValueError( f"mode_spec: no job contains column {col!r}" )

    # stats.mode gives a length-1 array on older SciPy and a scalar on newer
    # releases; np.ravel accepts both, so the indexing works either way
    return int( np.ravel( stats.mode( lengths )[0] )[0] )

In [None]:
def ymax ( col, jobs=all_jobs ):
    """Return the largest value of column `col` across `jobs`.

    Jobs without the column are skipped. The running maximum starts at 0, so 0
    is returned when no job has the column or no value exceeds 0.
    """
    peak = 0

    for job in jobs:
        if col not in job:
            continue
        top = job[col].max()
        if top > peak:
            peak = top

    return peak

### matplotlib with seaborn

In [None]:
# colour palettes picked at random for the plots below; each name appears once
# so that choice(themes) weights them equally ('Blues' used to be listed twice)
themes = ['inferno', 'ocean', 'tab20c', 'winter', 'summer', 'Wistia', 'hot', 'bone', 'pink',
          'BuGn', 'Blues', 'Purples', 'GnBu', 'YlGn', 'plasma', 'magma', 'viridis', 'BuPu']

In [None]:
def plot_all ( col, mode=None ):
    """Plot column `col` for every job in ``all_jobs`` that has exactly `mode` rows.

    Parameters
    ----------
    col : str
        Column label to draw.
    mode : int, optional
        Row count a job must have to be drawn. Defaults to ``mode_all()``,
        looked up when the function is called.
    """
    # A `mode=mode_all()` default is evaluated once, when the cell defining the
    # function runs, and goes stale if all_jobs changes afterwards.
    if mode is None:
        mode = mode_all()

    modeList = [ df for df in all_jobs if df.shape[0] == mode ]

    for job in modeList:
        try:
            plt.plot( job[col] )
        except (KeyError, TypeError, ValueError):
            # job lacks this column or holds values that cannot be drawn; skip it
            continue

    plt.grid( True )
    plt.gcf().set_size_inches(15, 11)
    sns.despine()
    
def plot_n ( col, n, mode=None ):
    """Plot column `col` for a random sample of up to `n` jobs that have exactly `mode` rows.

    Parameters
    ----------
    col : str
        Column label to draw.
    n : int
        Number of jobs to sample. If fewer jobs qualify, all of them are drawn.
    mode : int, optional
        Row count a job must have to be eligible. Defaults to ``mode_all()``,
        looked up when the function is called.
    """
    # A `mode=mode_all()` default is evaluated once, when the cell defining the
    # function runs, and goes stale if all_jobs changes afterwards.
    if mode is None:
        mode = mode_all()

    modeList = [ df for df in all_jobs if df.shape[0] == mode ]
    # never ask for more jobs than exist; random.sample raises ValueError otherwise
    picked = sample( modeList, min( n, len(modeList) ) )

    for job in picked:
        try:
            plt.plot( job[col] )
        except (KeyError, TypeError, ValueError):
            # job lacks this column or holds values that cannot be drawn; skip it
            continue

    plt.grid( True )
    plt.gcf().set_size_inches(15, 11)
    sns.despine()

In [None]:
mode = mode_all()
xrng = np.arange(1, mode)

# list the largest observed value next to the name of each statistic
for trend in cols.values():
    print( ymax(trend), trend )

In [None]:
# choose ranges
n = 25
mode = mode_all()
# NOTE(review): ticks run from 1 to mode-1 while a default row index starts at 0;
# confirm whether the first row is meant to stay unlabelled
xrng = np.arange(1, mode)

for trend in cols.values():   # len(cols)
    # restyle first so the figure created below picks up the chosen palette
    sns.set("notebook", palette=choice(themes), font_scale=1.5, rc={"lines.linewidth": 2.5})

    # give every statistic its own figure; without this the lines of earlier
    # trends stay on the current axes and end up in every later PNG
    plt.figure()
    plot_n( trend, n)

    # label and format plot
    plt.suptitle(    trend, fontsize=25, fontweight='bold' )
    plt.title(       f"Sample of {n} jobs with {mode} cycles of data")
    plt.xticks(      xrng, [ ordinalize(i) for i in xrng ], fontsize='large' )
    plt.xlabel(      "Cycle Data Collected", labelpad=15, fontweight='bold', fontsize='x-large' )
    #plt.ylabel(      f"SUs", labelpad=15, fontweight='bold', fontsize='x-large' )

    # save; the savefig option for a tight bounding box is `bbox_inches`, not `bbox`
    plt.savefig( f"./graphs/{trend}_sample.png", bbox_inches='tight')

    # display the finished figure, then release it so open figures do not pile up
    plt.show()
    plt.close()

In [None]:
# `tips` was never defined in this notebook; load seaborn's example dataset of
# that name so the cell runs on a fresh kernel
tips = sns.load_dataset("tips")

ax = sns.regplot(x="size", y="total_bill", data=tips,
                 x_estimator=np.mean)

In [None]:
# the pasted doctest prompts (">>>" and "...") were a SyntaxError and are removed
ans = sns.load_dataset("anscombe")
ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "II"],
                 scatter_kws={"s": 80},
                 order=2, ci=None, truncate=True)

In [None]:
sns.set()

# Load the iris dataset by name (the cell previously passed `trend[0]`, one
# character of a loop variable left over from an earlier cell)
iris = sns.load_dataset("iris")

# Plot sepal width as a function of sepal length, with one fit per species.
# `height` is the current name of lmplot's former `size` argument.
g = sns.lmplot(x="sepal_length", y="sepal_width", hue="species",
               truncate=True, height=5, data=iris)

# Use more informative axis labels than are provided by default
g.set_axis_labels("Sepal length (mm)", "Sepal width (mm)")

In [None]:
sns.set(style="whitegrid")

# Load the example diamonds dataset
diamonds = sns.load_dataset("diamonds")

# order in which the clarity grades are passed as hue_order
clarity_ranking = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

# Scatter plot of price against carat: point colour follows clarity and
# point size follows depth
f, ax = plt.subplots(figsize=(6.5, 6.5))
sns.despine(f, left=True, bottom=True)
sns.scatterplot(
    data=diamonds,
    x="carat",
    y="price",
    hue="clarity",
    hue_order=clarity_ranking,
    size="depth",
    sizes=(1, 8),
    palette="ch:r=-.2,d=.3_r",
    linewidth=0,
    ax=ax,
)

# Bokeh Visualizations

In [None]:
output_file('bokeh_example.html')

# Named `bokeh_sample` so it does not rebind `sample`, the function imported
# from `random` that plot_n() calls. The whole frame is used: `most_cycles[1]`
# was a lookup of a column labelled 1, not a row selection.
bokeh_sample = most_cycles #.sample(50)
source = ColumnDataSource(bokeh_sample)
p = figure()
#p.circle(x='TOTAL_TONS', y='AC_ATTACKING',
#         source=source,
#         size=10, color='green')
#p.title.text = 'Attacking Aircraft and Munitions Dropped'
#p.xaxis.axis_label = 'Tons of Munitions Dropped'
#p.yaxis.axis_label = 'Number of Attacking Aircraft'
#hover = HoverTool()
#hover.tooltips=[
#    ('Attack Date', '@MSNDATE'),
#    ('Attacking Aircraft', '@AC_ATTACKING'),
#    ('Tons of Munitions', '@TOTAL_TONS'),
#    ('Type of Aircraft', '@AIRCRAFT_NAME')
#]
#
#p.add_tools(hover)
#
#show(p)