# Performance Data Lookup

In [1]:
# System dependencies
from os import listdir
import time as clock
from datetime import timedelta
from IPython.display import clear_output

In [2]:
import pickle
import gzip
import re

In [3]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [4]:
# Directory to save to
save_dir = '../data/continued/'

# Directory of prev job scans
id_dir = '../data/labels/IDs/'

In [5]:
# Directory of recent saved comet jobs
source_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/'

In [6]:
contents = listdir(source_dir)
contents

['archive_of_archive',
 'gordon_hostfile_logs',
 'gordon_pickles',
 'comet_accounting',
 'gordon_accounting',
 'comet_pickles',
 'archive',
 '.htaccess',
 'comet_hostfile_logs']

In [7]:
possible = [ source_dir+file_name for file_name in listdir(source_dir) ]

for item in possible:
    try:
        listdir(item)
    except:
        print(item)

/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_hostfile_logs
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/.htaccess
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_hostfile_logs


In [8]:
locs = { 'aofa': source_dir+'archive_of_archive',
         'job_info': source_dir+'comet_accounting',
         'arc': source_dir+'archive'
         #'host_info': source_dir+'comet_hostfile_logs',
         #'old_pickles': source_dir+'comet_pickles'
       }

In [9]:
for key,loc in locs.items():
    print(loc)

/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/archive


In [10]:
arc_data = [ locs['arc']+'/'+host_dir+'/'+stamp 
            for host_dir in listdir(locs['arc'])
            for stamp in listdir(locs['arc']+'/'+host_dir)  ]

aofa_data = [ locs['aofa']+'/'+host_dir+'/'+stamp 
            for host_dir in listdir(locs['aofa'])
            for stamp in listdir(locs['aofa']+'/'+host_dir)  ]

In [11]:
acct_info_locs = [ locs['job_info']+'/'+stamp for stamp in listdir(locs['job_info']) ]
dates = [ loc[-14:-4] for loc in acct_info_locs ]

In [12]:
len(arc_data)

298673

In [13]:
len(aofa_data)

1809956

In [14]:
len(acct_info_locs)

1194

#### Pre-process available data

In [None]:
### Process all ###
arc_hosts = [ host_to_info_dict( host_file ) for host_file in arc_data ]
#aofa_hosts = [ host_to_info_dict( host_file ) for host_file in aofa_data ]
#acct_dates = job_to_info_dict( acct_info_locs )

In [None]:
len(arc_hosts)

## Target(s)

In [16]:
chosen = [('comet-27-[23,27,29-33,35,41-42]', '2020-01-03T20:34:47', '2020-01-05T08:15:18', '30598034'),
         ('comet-15-[01-05,08-10,62,64]', '2020-01-03T07:26:50', '2020-01-05T07:28:31', '30644440'),
         ('comet-19-[06,11-14,17-19,22,24]', '2020-01-03T20:34:47', '2020-01-05T08:11:53', '30598030'),
         ('comet-15-[10,14,66,69,71],comet-18-09,comet-20-[46,48,50,56,59,61]', '2019-08-23T01:06:50', '2019-08-23T06:00:27', '25918189'),
         ('comet-01-[26,32-33],comet-05-72,comet-07-[47-48],comet-23-[08,18-19,33,39,69]', '2019-08-23T02:34:38', '2019-08-23T08:07:04', '25920563')
         ]

In [17]:
#diff = get_stamp(chosen[0][2]) - get_stamp(chosen[0][1])

In [18]:
#diff / 60000

### Prep Cleaning

In [16]:
def get_time( spec=None ):
    if type(spec) is str:
        spec = float( spec )
    
    return clock.strftime("%Y-%d-%mT%H:%M:%S", clock.localtime( spec ))

In [17]:
def get_stamp( spec ):
    try:
        sf = "%Y-%d-%mT%H:%M:%S"
        return int(clock.mktime( clock.strptime( spec, sf ) ))
    except:
        try:
            sf = "'%Y-%d-%mT%H:%M:%S'"
            return int(clock.mktime( clock.strptime( spec, sf ) ))
        except:
            return 0

In [18]:
def check_static( alist ):
    return alist[1:] == alist[:-1]

In [19]:
def check_header( line ):
    if line.find(" ") < 0:
        try:
            return line[0] == '%'
        except:
            return False
        
    else:
        chunks = line.split(" ")
        try:
            return (chunks[0][0] == '%') or ( chunks[2].find("comet") >= 0 )
        except:
            return False

In [20]:
#def group_from_txt(  ):
#    

In [21]:
def check_job( chunk ):
    return chunk.find("-") == -1

In [22]:
def open_txt( txt_file ):
    
    with open( txt_file, "rt" ) as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [23]:
def unzip_txt( gzipped ):
    
    with gzip.open( gzipped, 'rt') as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [24]:
def group_from_txt( txt_file ):
    lines = open_txt( txt_file )
    group = []
    
    for line in lines:
        chunks = line.split(" ")
        nodelist_r = chunks[0]
        nodelist = format_nodelist( chunks[0] )
        start = get_stamp( chunks[1] )
        end = get_stamp( chunks[2] )
        
        item = ( nodelist, start, end )
        group.append(item)
        
    return group

In [25]:
def quick_save( obj, label=get_time() ):
    
    try:
        out_file = open( label, 'wb')
        pickle.dump( obj, out_file)
        
        # double check save
        check_cpicore_set = pickle.load(open(cpiset_out, 'rb'))
        check_cpicore_set = None
        
    except:
        "There was a problem pickling the object - Save manually."

Data Munging

In [26]:
def info_dict( rules, info ):
    rules_list = rules.split("|")
    
    if len(rules_list) != len(info):
        return {}
    
    else:
        return { rules_list[i]:info[i] for i in range(len(rules_list)) }

In [27]:
def host_to_info_dict( zip_txt ):
    contents = unzip_txt( zip_txt )
    host_name = contents[1].partition(" ")[2][:11]
    out_dict = { host_name: {} }
    host_info = {}
    info_dict = { "Data":{},
                    "Job":"N/A",
                    "Schemas":{},
                    "Specs":[]
                }
    
    for line in contents:
            
        if line[0] == "$":
            info_dict["Specs"].append( format_spec( line ) )
            
        elif line[0] == "!":
            info_dict["Schemas"].update( format_schema( line ) )
        
        else:
            
            if (len(line) > 0) and (len(line) < 3 or check_header( line )):
                header_dict = format_header( line )
                
                if header_dict:
                    t = header_dict["Timestamp"]
                    host_info[ t ] = {}
                    
                    if check_job( header_dict["Jobid"] ):
                        info_dict["Job"] = { "Jobid": header_dict["Jobid"] } 
                    
            else:
                incoming = format_data( line )
                info_dict["Data"].update(incoming)
                
                host_info[t].update( info_dict )
                
    out_dict[host_name].update( host_info )
    
    return out_dict

In [28]:
def job_to_info_dict( txt_file_list ):
    nodes_by_date = {}
    unsaved = []

    for date in txt_file_list:
        try:
            # skip alt files
            #check_stamp = int( date[-14] )
            
            # read in file contents
            contents = open_txt( date )
            
            # formatting
            label = date[-14:-4]
            rules = contents[0]
            jobs = contents[1:]
            
            # template to save
            nodes_by_date[ label ] = {}
            nodes_by_date[ label ]["multiple"] = {}
            nodes_by_date[ label ]["rules"] = rules
            
            # run through lines in file
            for job in jobs:
                line = job.split("|")
                node = line[-1]
                info = info_dict( rules, line )
                
                # save multiple node jobs to specified loc
                if len(node) > 12:
                    nodes = format_nodelist( info )
                    for node in nodes:
                        nodes_by_date[ label ][ "multiple" ][ node ] = info
                
                else:
                    nodes_by_date[ label ][ node[:11] ] = info
        except:
            unsaved.append(date)
            
    
    return nodes_by_date, unsaved

Formatting

In [29]:
def format_header( line ):
    chunks = line.split(" ")
    
    try:
        if chunks[0][0] == '%':
            return {}
        else:
            return { "Timestamp": get_time( chunks[0] ), 
                     "Jobid": chunks[1],
                     "Host": chunks[2][:11] }
        
    except:
        return {}

In [30]:
def format_nodelist( nodelist ):
    purged = nodelist.replace('[','').replace(']','').replace(',','-').replace('-','').split("comet")[1:]
    nodes = []
    
    for item in purged:
        base = item[:2]
        prev = 2
        
        for i in range( 4,len(item)+1,2 ):
            node = 'comet' + '-' + base + '-' + item[ prev:i ]
            nodes.append(node)
            prev = i
    return nodes

In [31]:
def format_spec( line ):
    return line[1:-1]

In [32]:
def format_data( line ):
    chunks = line.split(" ")
    
    stat = chunks[0]
    dev = chunks[1]
    data = chunks[2:-1]
    
    return { (stat,dev): data }

In [33]:
def format_schema( line ):
    chunks = line.partition(" ")
    stat = chunks[0][1:]
    
    temp_sch = chunks[2:][0][:-1].replace(",E","").replace(",C","").split(" ")
    fin_sch = []
    
    for item in temp_sch:
        
        if item.find("=") > -1:
            new = item.replace(",","(") + ")"
            fin_sch.append( new )
        
        else:
            fin_sch.append( item )
    
    return { stat:fin_sch }

Data analysis

In [34]:
def timely_dict( host_data, host_name ):
    stamps = list(host_data[ host_name ].keys())
    schemas = host_data[ host_name ][ stamps[0] ]["Schemas"]
    timely_data = []
    
    for stamp in stamps:
        for key,data in host_data[ host_name ][ stamp ]["Data"].items():
            
            stat = key[0]
            dev = key[1]
            
            for i in range(len(data)):
                metric = schemas[stat][i]
            
            info = (stat, metric, dev, int(data[i]), stamp)
            timely_data.append( info )
    
    return timely_data

In [35]:
# PARAMETERS:
# 's/e' single search from start/end (manual)
#       ie) "Start, End: 2020-01-03T20:34:47, 2020-01-05T08:15:18"
# 's' single search from nodelist%start%end (manual)
#       ie) "NL, Start, End: comet-05-12 2020-03-03T20:34:47 2020-03-05T08:15:18"
#       ie) "NL, Start, End: comet-05-[12,16] 2020-03-03T20:34:47 2020-03-05T08:15:18"
# 'l' repeated search from nodelist%start%end strings or (nodelist,start,end) tuples (from list)
#       ie) myJobList = [ "comet-05-12 2020-03-03T20:34:47 2020-03-05T08:15:18",
#                          (comet-05-12, 2020-03-03T20:34:47, 2020-03-05T08:15:18)   ]
#           search( mode='l', myJobList )
# 'f' repeated search from nodelist%start%end (from file)
#       ie) "Text file: your_search_file.txt"  (Note: Mismatched file contents ignored)
def search( mode=['s/e', 's', 'l','f'], job_list=False ):
    
    if mode == 's/e':
        t_0,t_n = input("Start, End:").replace(",", "").split(" ")
        start = get_stamp( t_0 )
        end=get_stamp( t_n )
        return (start,end)
    
    if mode == 's':
        nl,t_0,t_n = input("NL, Start, End:").replace(",", "").split(" ")
        nodelist = format_nodelist( nl )
        start = get_stamp( t_0 )
        end=get_stamp( t_n )
        return (nodelist,start,end)

    if mode == 'l' and job_list:
        try:
            out_list = []
            for item in job_list:
                if len(item.split(" ")) > 1:
                    item = item.split(" ")
                else:
                    nodelist = format_nodelist( item[0] )
                    start = get_stamp( item[1] )
                    end = get_stamp( item[2] )
                    out_list.append( (nodelist,start,end) )
            return out_list
        except:
            "Unable to process variable passed to function. All items in list should be in"
    
    if mode == 'f':
        search_list = group_from_text( input("Text file:") )
        return search_list
    
    else:
        return 1

In [36]:
def sort_search_tup( in_tup ):
    nl_i = in_tup.index("comet")
    nl = in_tup[ nl_i ]
    t_n = ''
    
    if nl_i == 0:
        t_0 = in_tup[1]
    else:
        t_0 = in_tup[0]
    
    try:
        for i in range(len(in_tup)):
            if in_tup[ i ] < t_0:
                t_0 = in_tup[ i ]
            elif in_tup[ i ] > t_0 and (t_n == '' or t_n < in_tup[ i ]):
                t_n = in_tup[ i ]                          
    except:
        next
    
    return (nl, t_0, t_n)

In [37]:
def format_search_tup( line ):
    
    if len(line) > 1:
        nl,t_0,t_n = sort_search_tup( line )
        
        nodelist = format_nodelist( nl )
        start = get_stamp( t_0 )
        end = get_stamp( t_n )
    
    elif len( line.split(" ") ) > 1:
        next

# TEST

In [41]:
#print(search(mode='l',chosen))

# Accessing Info

In [42]:
keys = []

for item in chosen:
    jobid = item[3]
    nodelist = format_nodelist( item[0] )
    start = str(get_stamp( item[1] ))
    end = str(get_stamp( item[2] ))
    
    for node in nodelist:
        keys.append( ( node, start, end, jobid) )
    
    #print("Jobid:\t", jobid)
    #print("Hosts:\t", nodelist)
    #print("Start(ep):\t", start)
    #print()

In [43]:
keys

[('comet-27-23', '1583123687', '1588346118', '30598034'),
 ('comet-27-27', '1583123687', '1588346118', '30598034'),
 ('comet-27-29', '1583123687', '1588346118', '30598034'),
 ('comet-27-33', '1583123687', '1588346118', '30598034'),
 ('comet-27-35', '1583123687', '1588346118', '30598034'),
 ('comet-27-41', '1583123687', '1588346118', '30598034'),
 ('comet-27-42', '1583123687', '1588346118', '30598034'),
 ('comet-15-01', '1583076410', '1588343311', '30644440'),
 ('comet-15-05', '1583076410', '1588343311', '30644440'),
 ('comet-15-08', '1583076410', '1588343311', '30644440'),
 ('comet-15-10', '1583076410', '1588343311', '30644440'),
 ('comet-15-62', '1583076410', '1588343311', '30644440'),
 ('comet-15-64', '1583076410', '1588343311', '30644440'),
 ('comet-19-06', '1583123687', '1588345913', '30598030'),
 ('comet-19-11', '1583123687', '1588345913', '30598030'),
 ('comet-19-14', '1583123687', '1588345913', '30598030'),
 ('comet-19-17', '1583123687', '1588345913', '30598030'),
 ('comet-19-19

In [44]:
chosen_locs = []

for loc in aofa_data:
    for key in keys:
        host = key[0]
        t_n = key[2]
        
        if (host in loc) and t_n is not '0':
            if t_n in loc or t_n[:-2] in loc:
                chosen_locs.append(loc)

for loc in arc_data:
    for key in keys:
        host = key[0]
        t_n = key[2]
        
        if (host in loc) and t_n is not '0':
            if t_n in loc or t_n[:-2] in loc:
                chosen_locs.append(loc)

In [45]:
len(chosen_locs)

25494

### Formatting Info

In [46]:
chosen_locs

['/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1559382640.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1562233841.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1528079608.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1573760448.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1482953869.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1498256864.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1490653658.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1527215611.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-15-14.sdsc.edu/1564442950.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive/comet-1

In [47]:
chosen

[('comet-27-[23,27,29-33,35,41-42]',
  '2020-01-03T20:34:47',
  '2020-01-05T08:15:18',
  '30598034'),
 ('comet-15-[01-05,08-10,62,64]',
  '2020-01-03T07:26:50',
  '2020-01-05T07:28:31',
  '30644440'),
 ('comet-19-[06,11-14,17-19,22,24]',
  '2020-01-03T20:34:47',
  '2020-01-05T08:11:53',
  '30598030'),
 ('comet-15-[10,14,66,69,71],comet-18-09,comet-20-[46,48,50,56,59,61]',
  '2019-08-23T01:06:50',
  '2019-08-23T06:00:27',
  '25918189'),
 ('comet-01-[26,32-33],comet-05-72,comet-07-[47-48],comet-23-[08,18-19,33,39,69]',
  '2019-08-23T02:34:38',
  '2019-08-23T08:07:04',
  '25920563')]