In [None]:
# File dependencies
from tacc_stats.pickler.job_stats import Job
import cPickle as pickle
import argparse
import time
from os import listdir
from IPython.display import clear_output

In [None]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [None]:
# Directory of all pickled jobs via comet
source_dir = '/oasis/projects/nsf/sys200/tcooper/xsede_stats/comet_pickles/'

# List of date directories in source_dir.
# listdir() returns entries in arbitrary order; sorting keeps the order in
# which folders are visited (and therefore which jobs are kept and how they
# are split later on) the same from one run to the next.
dates_list = sorted(listdir(source_dir))

# Access and open pickled job files
**Process:**
    - Iterate through the non-empty date folders available in source_dir
    - A file is saved in valid_jobs if:
        * The pickled file is a Job object
        * The job recorded at least 6 cycles (1 hour)
        * The total number of jobs saved at the end of the previous date folder is less than 10,000
            _This is purely to keep the computations manageable according to compute time requested_
    - Exceptions are skipped

In [None]:
# Total number of files across every date folder (denominator for progress output)
n = sum(len(listdir(source_dir + folder)) for folder in dates_list)

In [None]:
# Collect every pickled job that recorded enough cycles to be worth analysing.
valid_jobs = []
t0 = time.time()

# number of files visited so far across ALL date folders, so the progress
# readout is measured against the global total `n`
processed = 0

for date in dates_list:
    # the cap is only checked between folders: once enough jobs have been
    # collected, no further date folder is opened
    if len(valid_jobs) >= 10000:
        break

    # an empty folder simply yields no iterations below
    files = sorted(listdir(source_dir+date))

    for job in files:
        processed += 1
        clear_output(wait=True)
        print("Processing file {} ({}%)".format(processed, np.round(100.0*processed/n, 2)))

        # open job file if possible
        # NOTE(review): unpickling executes arbitrary code; only point
        # source_dir at pickle files from a trusted source.
        try:
            # `with` closes the file even when unpickling fails
            with open(source_dir+date+'/'+job, 'rb') as pickle_file:
                jobid = pickle.load(pickle_file)

            # only save jobs that ran longer than 1 hour
            if len(jobid.times) > 5:
                valid_jobs.append(jobid)
        except Exception:
            # unreadable/corrupt pickle, or an object without `times`: skip it.
            # Catching Exception (not a bare except) keeps Ctrl-C / kernel
            # interrupt working during this long loop.
            continue

In [None]:
# Check number of jobs accessed
# (i.e. how many unpickled jobs passed the cycle-count filter and were kept)
len(valid_jobs)

In [None]:
# Use the first valid job, and the first host it lists, as a running example
ex_job = valid_jobs[0].id
ex_job_host = next(iter(valid_jobs[0].hosts))
# names of the statistic types recorded in the example job's schemas
ex_stats = list(valid_jobs[0].schemas)
ex_job_host, ex_job

In [None]:
print "There were", len(ex_stats), "statistics collected for this example job:"
print
print [stat for stat in ex_stats]

# Format Columns & Clean Data

### Function: Given DataFrame, rename all columns with full label
<b>Notes:</b>
    - Certain column descriptions are repeated exactly in the available documentation. As a result, when these columns are relabelled according to their description, a column-specific identifier is appended in parentheses to keep it unique and prevent altering the meaning unintentionally.
    - Some of the Intel categories are listed in the available documentation as "-snb(hsw)-"; however, the code is actually tagged with "-hsw-". Note that these categories are in fact included in this program, even though they may appear to have been skipped.
    - At least one stat type, 'intel_rapl', was present in the data but does not appear to have a corresponding entry in the available documentation. It has been interpreted to mean "Running Average Power Limit" and is included in the analysis that follows.

In [None]:
# Master list of all possible statistics collected
# (laid out one family per row; the order of the entries is unchanged)
stat_types = [
    # processor counters
    "amd64_pmc", "intel_hsw", "intel_hsw_ht", "intel_nhm",
    "intel_uncore", "intel_snb", "intel_rapl",
    # intel_hsw_* units
    "intel_hsw_cbo", "intel_hsw_pcu", "intel_hsw_imc",
    "intel_hsw_qpi", "intel_hsw_hau", "intel_hsw_r2pci",
    # ib* entries
    "ib", "ib_sw", "ib_ext",
    # remaining entries
    "llite", "lnet", "mdc", "mic", "osc",
    "block", "cpu", "mem", "net", "nfs", "numa",
    "proc", "ps", "sysv_shm", "tmpfs", "vfs", "vm",
]

# Total number of categories
len(stat_types)

In [None]:
# Short stat-type key -> full descriptive column label.
# Built once here instead of on every call to formatTitles().
COLUMN_TITLES = {
    "index": "Job Name",
    "amd64_pmc": "AMD Opteron performance counters (per core)",
    "intel_hsw": "Intel Haswell Processor (HSW) (per core)",
    "intel_hsw_ht": "Intel Haswell Processor - Hyper-threaded (per logical core)",
    "intel_nhm": "Intel Nehalem Processor (NHM) (per core)",
    "intel_uncore": "Westmere Uncore (WTM) (per socket)",
    "intel_snb": "Intel Sandy Bridge (SNB) or Ivy Bridge (IVB) Processor (per core)",
    "intel_rapl": "Running average power limit",
    "intel_hsw_cbo": "Caching Agent (CBo) for SNB (HSW) (per socket)",
    "intel_hsw_pcu": "Power Control Unit for SNB (HSW) (per socket)",
    "intel_hsw_imc": "Integrated Memory Controller for SNB (HSW) (per socket)",
    "intel_hsw_qpi": "QPI Link Layer for SNB (HSW) (per socket)",
    "intel_hsw_hau": "Home Agent Unit for SNB (HSW) (per socket)",
    "intel_hsw_r2pci": "Ring to PCIe Agent for SNB (HSW) (per socket)",
    "ib": "Infiniband usage (default)",
    "ib_sw": "InfiniBand usage (sw)",
    "ib_ext": "Infiniband usage (ext)",
    "llite": "Lustre filesystem usage (per mount)",
    "lnet": "Lustre network usage (lnet)",
    "mdc": "Lustre network usage (mdc)",
    "mic": "MIC scheduler account (per hardware thread)",
    "osc": "Lustre filesystem usage (osc)",
    "block": "Block device statistics (per device)",
    "cpu": "Scheduler accounting (per CPU)",
    "mem": "Memory usage (per socket)",
    "net": "Network device usage (per device)",
    "nfs": "NFS system usage",
    "numa": "NUMA statistics (per socket)",
    "proc": "Process specific data (MaxRSS, executable name etc.)",
    "ps": "Process statistics",
    "sysv_shm": "SysV shared memory segment usage",
    "tmpfs": "Ram-backed filesystem usage (per mount)",
    "vfs": "Dentry/file/inode cache usage",
    "vm": "Virtual memory statistics",
}


def formatTitles ( df ):
    """Return a copy of `df` whose columns carry their full descriptive labels.

    Columns named after a stat type listed in COLUMN_TITLES (plus the
    special "index" column) are renamed to the corresponding label.
    Columns without an entry keep their name, and entries with no matching
    column are ignored. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are short stat-type keys.

    Returns
    -------
    pandas.DataFrame
        New frame with relabelled columns.
    """
    return df.rename( columns=COLUMN_TITLES )

### Loops in loops in loops (Cleaning data)
**Notes:**
    - If a value is missing from the data, it will be replaced with '0' for the purpose of this project
    - If a type of statistic was not collected on the job, that column is dropped from the DataFrame
    - Two files are created during each iteration:
         1) A .csv of the descriptive statistics for that host,job pair
         2) A full .csv of the host,job data from the formatted DataFrame
    - Naming convention: Files are labelled as '{host}_{jobid}' to support random lookup
         * A job run on multiple host nodes is processed and saved with each individual host,job pair *

In [None]:
# Walk every (job, host) pair and write two CSV files for it:
#   ./jobs/descriptive_stats/{host}_{jobid}.csv  - summary statistics per stat type
#   ./jobs/train|test/{host}_{jobid}.csv         - one row per cycle, one column per stat type
total = 0

for jobid in valid_jobs:
    total += 1

    # check that stat types all exist in master list
    # append if not found
    for stat in jobid.schemas.keys():
        if stat not in stat_types:
            stat_types.append(stat)

    # set some jobs aside for testing the model: the first three quarters of
    # valid_jobs (by position) go to train/, the remainder to test/
    if total * 4 <= len(valid_jobs) * 3:
        split_dir = "train"
    else:
        split_dir = "test"

    # iterate through each host object job was run on
    for host_name, host in jobid.hosts.items():
        csv_name = "{host}_{jobid}.csv".format( jobid=jobid.id, host=host_name )

        # convert timestamps to time-of-day objects.
        # The loop variable is deliberately NOT called `time`: a Python 2 list
        # comprehension leaks its variable and would overwrite the imported
        # `time` module used by the loading cell.
        # NOTE(review): only the time of day is kept, so cycles from different
        # days that share a clock time map to the same row - confirm intended.
        times = [ pd.to_datetime(round(stamp), unit='s').time() for stamp in host.times ]

        # per-host container (reset for every host so that values from a
        # previously processed host can never leak into this one):
        # stat type -> one list per device holding the mean of each row
        type_avgs = {}
        for type_name, type_data in host.stats.items():
            type_avgs[type_name] = [
                [ np.mean(row).round(decimals=3) for row in dev_rows ]
                for dev_rows in type_data.values()
            ]

        # skeleton for job DataFrame: missing values default to 0
        all_stats = { stamp:{ stat:0 for stat in type_avgs } for stamp in times }

        # skeleton for descriptive stats DataFrame
        dev_stats = { type_stat:{} for type_stat in jobid.schemas.keys() }

        for type_name, dev_series in type_avgs.items():
            # a stat type may be present on the host but absent from schemas
            summary = dev_stats.setdefault(type_name, {})

            # summarise over every device and cycle; a type with no data is
            # left empty (np.max/np.min raise on empty input) so its column
            # is dropped by dropna() below
            values = [ value for series in dev_series for value in series ]
            if values:
                summary["Overall Average"] = np.mean(values)
                summary["Standard Deviation"] = np.std(values)
                summary["High"] = np.max(values)
                summary["Low"] = np.min(values)

            # assign each (cycle, stat type) cell of the job DataFrame.
            # zip() stops at the shorter sequence, so a series shorter or
            # longer than `times` needs no exception handling.
            # NOTE(review): with several devices, later devices overwrite
            # earlier ones for the same cell - confirm this is intended.
            for series in dev_series:
                for stamp, value in zip(times, series):
                    all_stats[stamp][type_name] = value

        # clean up descriptive stats and save to csv.
        # The frame is indexed by the statistic name already; it has no
        # 'Unnamed: 0' column (that name only appears when the CSV is read
        # back), so no set_index() call is made here.
        desc_df = formatTitles( pd.DataFrame( dev_stats ) ).dropna(axis=1)
        desc_df.to_csv(path_or_buf="./jobs/descriptive_stats/" + csv_name)

        # create df, transposed so index=time, col=category of stat,
        # and relabel columns to their formal titles
        df = formatTitles( pd.DataFrame(all_stats).T )
        df.index.name = "Cycle"

        # save job to csv in the train or test folder chosen above
        df.to_csv(path_or_buf="./jobs/" + split_dir + "/" + csv_name)

In [None]:
# check that no job was missed
# `total` is incremented once per job by the processing loop, so it equals
# len(valid_jobs) only when that loop ran through every job
if total == len(valid_jobs):
    print "Success!"

In [None]:
# Read back the descriptive-statistics CSV written for the example host,job pair
ex_desc_path = "./jobs/descriptive_stats/{host_A}_{job_A}.csv".format(
    host_A=ex_job_host, job_A=ex_job)
ex_desc_df = pd.DataFrame( pd.read_csv(ex_desc_path) )
ex_desc_df

In [None]:
# Read back the full per-cycle CSV written for the example host,job pair
ex_path = "./jobs/train/{host_A}_{job_A}.csv".format(
    host_A=ex_job_host, job_A=ex_job)
ex_df = pd.DataFrame( pd.read_csv(ex_path) )
ex_df

In [None]:
# column labels of the example per-cycle frame read back from CSV
ex_df.columns

In [None]:
# number of stat types in the master list after processing
# (the processing loop appends any type it finds that was not already listed)
len(stat_types)