#### Dependencies

In [3]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [4]:
import pickle
import gzip
import re

In [5]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [6]:
# Directory to save to (prefix of the per-host CSV files written at the end of the notebook)
save_dir = '../data/continued/'

# Directory of prev job scans
# NOTE(review): id_dir is not referenced again in the visible notebook.
id_dir = '../data/labels/IDs/'

In [7]:
# Directory of recent saved comet jobs.
# Paths are built by plain string concatenation, so the trailing slash matters.
source_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/'

In [8]:
# List the entries directly under source_dir and display them.
contents = listdir(source_dir)
contents

['archive_of_archive',
 'gordon_hostfile_logs',
 'gordon_pickles',
 'comet_accounting',
 'gordon_accounting',
 'comet_pickles',
 'archive',
 '.htaccess',
 'comet_hostfile_logs']

In [9]:
# Full path of every entry directly under source_dir.
possible = [ source_dir+file_name for file_name in listdir(source_dir) ]

# Print the entries that cannot be listed as a directory (plain files,
# unreadable directories, ...).  Only OSError is caught so that a typo or a
# keyboard interrupt is not silently reported as an unlistable path.
for item in possible:
    try:
        listdir(item)
    except OSError:
        print(item)

/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_hostfile_logs
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/.htaccess
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_hostfile_logs


In [10]:
# Short label -> full path of each source_dir sub-directory in use.
# Currently disabled entries:
#   'host_info'   -> source_dir+'comet_hostfile_logs'
#   'old_pickles' -> source_dir+'comet_pickles'
locs = { label: source_dir+folder
         for label, folder in ( ('aofa', 'archive_of_archive'),
                                ('job_info', 'comet_accounting'),
                                ('arc', 'archive') ) }

In [11]:
# Print every resolved location (the labels themselves are not printed).
for key,loc in locs.items():
    print(loc)

/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/archive


## Prep Cleaning

In [30]:
def get_time( spec=None ):
    """
    Format a point in time as "YYYY/MM/DD HH:MM:SS" in the local timezone.

    spec -- seconds since the epoch, handed to time.localtime(); None (the
            default) means "now".
    """
    stamp_format = "%Y/%m/%d %H:%M:%S"
    local_struct = clock.localtime( spec )
    return clock.strftime( stamp_format, local_struct )

In [54]:
def check_header( line ):
    """
    Return True when `line` starts with "%" or when its third
    space-separated field contains "comet"; False otherwise.

    A line whose first field is empty (blank line, leading space) or that has
    fewer than three fields and no leading "%" yields False.
    """
    fields = line.split(" ")
    first = fields[0]

    # An empty first field can never satisfy either test.
    if not first:
        return False

    if first[0] == '%':
        return True

    if len(fields) < 3:
        return False

    return "comet" in fields[2]

In [61]:
def check_job( chunk ):
    """Return True when `chunk` contains no "-" character, False otherwise."""
    return "-" not in chunk

In [13]:
def open_txt( txt_file ):
    """Read the text file at `txt_file` and return its lines (line endings kept) as a list."""
    # Leaving the with-block closes the handle.
    with open( txt_file, "rt" ) as handle:
        return handle.readlines()

In [14]:
def unzip_txt( gzipped ):
    """Decompress the gzip file at `gzipped` in text mode and return its lines (line endings kept) as a list."""
    # Leaving the with-block closes the handle.
    with gzip.open( gzipped, 'rt') as handle:
        return handle.readlines()

In [15]:
def quick_save( obj, label=None ):
    """
    Pickle `obj` to the file `label`, then read it back to confirm the save.

    obj   -- any picklable object.
    label -- path of the output file.  When omitted, a timestamped file name
             in the current directory is generated at call time.

    Returns None.  A failure is reported with a printed message instead of
    being raised, so a failed save does not abort the calling cell.
    """
    if label is None:
        # Built per call (a `label=get_time()` default is evaluated only once,
        # when the function is defined) and free of "/" and ":" so that it is
        # usable as a file name.
        label = clock.strftime( "%Y-%m-%d_%H-%M-%S" ) + ".pkl"

    try:
        with open( label, 'wb' ) as out_file:
            pickle.dump( obj, out_file )

        # double check save
        with open( label, 'rb' ) as in_file:
            pickle.load( in_file )

    except Exception as err:
        print( "There was a problem pickling the object - Save manually. ({})".format( err ) )

In [16]:
def info_dict( rules, info ):
    """
    Pair the "|"-separated field names in `rules` with the values in `info`.

    Returns a dict mapping name -> value by position, or {} when the number
    of names differs from the number of values.  Names are used exactly as
    split (no whitespace or newline stripping).
    """
    field_names = rules.split("|")

    if len(field_names) == len(info):
        return dict( zip( field_names, info ) )

    return {}

In [64]:
def host_to_info_dict( zip_txt ):
    """
    Parse one gzipped stats file into a nested dictionary (work in progress).

    zip_txt -- path handed to unzip_txt().

    Lines are dispatched on their first character: "$" lines go through
    format_spec(), "!" lines through format_schema(); every other line is
    treated either as a header (check_header / format_header) or as a data
    line (format_data).

    NOTE(review): as written this function cannot run to completion -- see
    the NOTE(review) comments in the body.
    """
    contents = unzip_txt( zip_txt )
    out_dict = { "Host": {} }
    host_info = { "Timestamp":{} }
    # NOTE(review): this local shadows the module-level info_dict() function
    # for the rest of this body.
    info_dict = { "Data":{},
                    "Job":"N/A",
                    "Schemas":{},
                    "Specs":[]
                }
    
    for line in contents:
            
        if line[0] == "$":
            # NOTE(review): host_info only has the key "Timestamp", so this
            # lookup raises KeyError; the "Specs" list lives in info_dict.
            host_info["Specs"].append( format_spec( line ) )
            
        elif line[0] == "!":
            # NOTE(review): same problem -- "Schemas" is a key of info_dict,
            # not of host_info.
            host_info["Schemas"].update( format_schema( line ) )
        
        else:
            # Reset on every line that is neither a spec nor a schema line,
            # so values taken from an earlier header line do not survive.
            host_name = ""
            timestamp = ""
            
            # Lines shorter than 3 characters are also sent to format_header.
            if len(line) < 3 or check_header( line ):
                header_dict = format_header( line )
                
                if header_dict:
                    # NOTE(review): format_header() returns the keys
                    # "Timestamp", "Jobid" and "Host"; there is no "Job" key.
                    if check_job( header_dict["Job"] ):
                        info_dict["Job"] = { "Jobid": header_dict["Jobid"] }
                        
                    host_name = header_dict["Host"]
                    timestamp = header_dict["Timestamp"]
                    
            else:
                # Each data line overwrites the same keys of info_dict["Data"].
                info_dict["Data"].update( format_data( line ) )
    
    # NOTE(review): timestamp / host_name hold whatever the last loop
    # iteration left (and are unbound if no line reached the else branch).
    # host_info has no entry under that timestamp and out_dict only has the
    # key "Host", so both lookups raise KeyError.
    host_info[ timestamp ].update( info_dict )
    out_dict[ host_name ].update( host_info )
                
    return out_dict

In [58]:
def format_header( line ):
    """
    Split a header line of the form "<epoch seconds> <job id> <host name>".

    Returns a dict with the keys "Timestamp" (epoch formatted by get_time),
    "Jobid" and "Host" (first 11 characters of the third field).  Returns {}
    for lines starting with "%" and for lines that do not have that shape
    (blank, fewer than three fields, non-numeric first field).
    """
    chunks = line.split(" ")

    # Blank lines, "%" marker lines and short lines carry no header fields.
    if not chunks[0] or chunks[0][0] == '%' or len(chunks) < 3:
        return {}

    try:
        # The epoch arrives as text; time.localtime() (inside get_time)
        # only accepts a number.
        timestamp = get_time( float( chunks[0] ) )
    except (ValueError, OverflowError, OSError):
        return {}

    return { "Timestamp": timestamp,
             "Jobid": chunks[1],
             "Host": chunks[2][:11] }

In [17]:
def format_spec( line ):
    """Return `line` without its first and last character (for the sample file: the "$" marker and the newline)."""
    without_marker = line[1:]
    return without_marker[:-1]

In [18]:
def format_schema( line ):
    """
    Split a schema line at its first space.

    Returns a one-entry dict: the key is the first word without its leading
    character (the "!" marker), the value is a 1-tuple holding the rest of
    the line unchanged (trailing newline included).

    NOTE(review): the 1-tuple comes from slicing the partition result with
    [2:]; a plain string or a list of fields may have been intended -- confirm.
    """
    head, _, tail = line.partition(" ")
    return { head[1:]: (tail,) }

In [19]:
def format_data( line ):
    """
    Split a data line of the form "<stat> <device> <value> <value> ...".

    Returns {"Stat": ..., "Device": ..., "Data": [...]} (all strings), or {}
    when the line is blank or has fewer than three space-separated fields.
    """
    chunks = line.split(" ")

    # The original guard was left unfinished ("if chunks[0]" with no
    # condition body), which made the whole cell a syntax error.  Lines that
    # cannot hold a stat, a device and data are rejected instead.
    if len(chunks) < 3 or not chunks[0].strip():
        return {}

    stat = chunks[0]
    dev = chunks[1]
    # NOTE(review): the last field is dropped, as in the original slice.  If
    # the last field is a real value (with the newline attached) this should
    # become chunks[2:] with the newline stripped -- confirm against the
    # file format.
    data = chunks[2:-1]

    return { "Stat": stat, "Device": dev, "Data": data }

### Parse file in archive

In [20]:
# Full path of every entry under the 'arc' location.
arc_data = [ locs['arc']+'/'+stamp for stamp in listdir(locs['arc']) ]

In [21]:
# Name of the first file listed under the first 'arc' entry, then display its full path.
temp = listdir(arc_data[0])[0]
arc_data[0]+'/'+temp

'/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587951665.gz'

In [32]:
# Display the decompressed lines of one archive file (hard-coded copy of the path displayed by the previous cell).
unzip_txt('/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587951665.gz')

['$tacc_stats 2.3.4\n',
 '$hostname comet-10-14.sdsc.edu\n',
 '$uname Linux x86_64 3.10.0-957.12.2.el7.x86_64 #1 SMP Tue May 14 21:24:32 UTC 2019\n',
 '$uptime 4082659\n',
 '!block rd_ios,E rd_merges,E rd_sectors,E,U=512B rd_ticks,E,U=ms wr_ios,E wr_merges,E wr_sectors,E,U=512B wr_ticks,E,U=ms in_flight io_ticks,E,U=ms time_in_queue,E,U=ms\n',
 '!cpu user,E,U=cs nice,E,U=cs system,E,U=cs idle,E,U=cs iowait,E,U=cs irq,E,U=cs softirq,E,U=cs\n',
 '!ib excessive_buffer_overrun_errors,E,W=32 link_downed,E,W=32 link_error_recovery,E,W=32 local_link_integrity_errors,E,W=32 port_rcv_constraint_errors,E,W=32 port_rcv_data,E,U=4B,W=32 port_rcv_errors,E,W=32 port_rcv_packets,E,W=32 port_rcv_remote_physical_errors,E,W=32 port_rcv_switch_relay_errors,E,W=32 port_xmit_constraint_errors,E,W=32 port_xmit_data,E,U=4B,W=32 port_xmit_discards,E,W=32 port_xmit_packets,E,W=32 port_xmit_wait,E,U=ms,W=32 symbol_error,E,W=32 VL15_dropped,E,W=32\n',
 '!ib_ext port_select,C counter_select,C port_xmit_data,E,U=4

In [None]:
# NOTE(review): the right-hand side is missing, so this cell is a syntax error as saved.
info = 

In [24]:
# Display the value currently bound to `info`.
info

{'Process': ['tacc_stats 2.3.4',
  'hostname comet-10-14.sdsc.edu',
  'uname Linux x86_64 3.10.0-957.12.2.el7.x86_64 #1 SMP Tue May 14 21:24:32 UTC 2019',
  'uptime 4082659'],
 'Schemas': [{'block': ('rd_ios,E rd_merges,E rd_sectors,E,U=512B rd_ticks,E,U=ms wr_ios,E wr_merges,E wr_sectors,E,U=512B wr_ticks,E,U=ms in_flight io_ticks,E,U=ms time_in_queue,E,U=ms\n',)},
  {'cpu': ('user,E,U=cs nice,E,U=cs system,E,U=cs idle,E,U=cs iowait,E,U=cs irq,E,U=cs softirq,E,U=cs\n',)},
  {'ib': ('excessive_buffer_overrun_errors,E,W=32 link_downed,E,W=32 link_error_recovery,E,W=32 local_link_integrity_errors,E,W=32 port_rcv_constraint_errors,E,W=32 port_rcv_data,E,U=4B,W=32 port_rcv_errors,E,W=32 port_rcv_packets,E,W=32 port_rcv_remote_physical_errors,E,W=32 port_rcv_switch_relay_errors,E,W=32 port_xmit_constraint_errors,E,W=32 port_xmit_data,E,U=4B,W=32 port_xmit_discards,E,W=32 port_xmit_packets,E,W=32 port_xmit_wait,E,U=ms,W=32 symbol_error,E,W=32 VL15_dropped,E,W=32\n',)},
  {'ib_ext': ('port_se

### Parse file in comet_accounting

In [33]:
# Full path of every entry under the 'job_info' location.
job_data = [ locs['job_info']+'/'+stamp for stamp in listdir(locs['job_info']) ]

In [34]:
# Pick the first listed accounting file and display its path.
temp = job_data[0]
temp

'/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting/2019-09-19.txt'

In [35]:
# Read the sample accounting file into a list of lines.
job_sample = open_txt( temp )

In [36]:
# The first line holds the "|"-separated field names; display them split.
rules = job_sample[0]
rules.split("|")

['JobID',
 'User',
 'Account',
 'Start',
 'End',
 'Submit',
 'Partition',
 'Timelimit',
 'JobName',
 'State',
 'NNodes',
 'ReqCPUS',
 'NodeList\n']

In [37]:
# Display the fields of the first record after the field-name line.
job_sample[1].split("|")

['26617639',
 'cipres',
 'sds121',
 '2019-09-12T05:12:16',
 '2019-09-19T05:12:30',
 '2019-09-12T05:12:10',
 'shared',
 '7-00:00:00',
 'NGBW-JOB-MRBAYES_XSEDE-1AA1022FA22D445DAB262C7D61CB6616',
 'TIMEOUT',
 '1',
 '16',
 'comet-04-40\n']

In [38]:
# Display every line after the field-name line.
job_sample[1:]

['26617639|cipres|sds121|2019-09-12T05:12:16|2019-09-19T05:12:30|2019-09-12T05:12:10|shared|7-00:00:00|NGBW-JOB-MRBAYES_XSEDE-1AA1022FA22D445DAB262C7D61CB6616|TIMEOUT|1|16|comet-04-40\n',
 '26618425|cipres|sds121|2019-09-12T06:27:21|2019-09-19T06:27:28|2019-09-12T06:27:21|compute|7-00:00:00|NGBW-JOB-JMODELTEST2_XSEDE-A7B81541E1DF42F5A332ED19BE47DAE2|TIMEOUT|1|24|comet-17-48\n',
 '26618465|cipres|sds121|2019-09-12T06:28:54|2019-09-19T06:28:59|2019-09-12T06:28:51|compute|7-00:00:00|NGBW-JOB-JMODELTEST2_XSEDE-32AF6F93F6AB4ADAA6FC62855CC9B261|TIMEOUT|1|24|comet-27-64\n',
 '26619940|cipres|sds121|2019-09-12T09:06:06|2019-09-19T09:06:36|2019-09-12T09:06:03|compute|7-00:00:00|NGBW-JOB-GPHOCS_XSEDE-23CCC6C010CD4526B4514144918E7145|TIMEOUT|1|24|comet-04-69\n',
 '26694681|jwestern|cit121|2019-09-18T23:29:00|2019-09-19T23:28:26|2019-09-14T16:42:50|compute|23:59:00|rp12e07g2C0.17|TIMEOUT|16|384|comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n',
 '26697694|cipres|sds121|2019-09-14T21:18:44|2019-09-

In [39]:
# nodes_by_date[label] maps a node name to the parsed job record, plus two
# bookkeeping keys: "multiple" (records keyed by the raw multi-node list) and
# "rules" (the field-name line of that file).
nodes_by_date = {}
# Paths of the files that could not be read or parsed completely.
unsaved = []

for date in job_data:
    try:
        
        # skip alt files
        #check_stamp = int( date[-14] )
        
        # read in file contents
        contents = open_txt( date )
    
        # formatting: the 10 characters in front of the 4-character
        # extension, e.g. ".../2019-09-19.txt" -> "2019-09-19"
        label = date[-14:-4]
        rules = contents[0]
        jobs = contents[1:]
        
        # template to save
        nodes_by_date[ label ] = {}
        nodes_by_date[ label ]["multiple"] = {}
        nodes_by_date[ label ]["rules"] = rules
        
        # run through lines in file
        for job in jobs:
            line = job.split("|")
            # last field; it still carries the line's trailing newline
            node = line[-1]
            info = info_dict( rules, line )
            
            # save multiple node jobs to specified loc
            # (a single node name plus newline is 12 characters long)
            if len(node) > 12:                
                nodes_by_date[ label ][ "multiple" ][ node ] = info
            
            else:
                # NOTE(review): a later job on the same node replaces the
                # earlier record for that date -- confirm this is intended.
                nodes_by_date[ label ][ node[:11] ] = info
                
    # Exception (not a bare except) so that interrupting the kernel still
    # stops the loop instead of being recorded as an unsaved file.
    except Exception:
        unsaved.append(date)

In [40]:
# Display everything collected for one date.
nodes_by_date['2019-09-19']

{'multiple': {'comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n': {'JobID': '26694681',
   'User': 'jwestern',
   'Account': 'cit121',
   'Start': '2019-09-18T23:29:00',
   'End': '2019-09-19T23:28:26',
   'Submit': '2019-09-14T16:42:50',
   'Partition': 'compute',
   'Timelimit': '23:59:00',
   'JobName': 'rp12e07g2C0.17',
   'State': 'TIMEOUT',
   'NNodes': '16',
   'ReqCPUS': '384',
   'NodeList\n': 'comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n'},
  'comet-14-[10,17,21,23,26,32,36,40],comet-16-[08,11-16,18,26-32,49,69],comet-27-[19-20,31-32,39,42,44,48-49,56-57,63,66-68]\n': {'JobID': '26745059',
   'User': 'liwenfei',
   'Account': 'cla174',
   'Start': '2019-09-17T18:48:24',
   'End': '2019-09-19T10:56:19',
   'Submit': '2019-09-16T22:22:22',
   'Partition': 'compute',
   'Timelimit': '2-00:00:00',
   'JobName': 'batch_script',
   'State': 'COMPLETED',
   'NNodes': '40',
   'ReqCPUS': '960',
   'NodeList\n': 'comet-14-[10,17,21,23,26,32,36,40],comet-16-[08,11-16,18,26-32,49,69

### Parse file in archive of archive

In [None]:
# Full path of every entry under the 'aofa' location.
aofa_data = [ locs['aofa']+'/'+stamp for stamp in listdir(locs['aofa']) ]

In [None]:
# Name of the first file listed under the first 'aofa' entry, then display its full path.
temp = listdir(aofa_data[0])[0]
aofa_data[0]+'/'+temp

In [None]:
# Decompress that file into a list of lines.
aofa_sample = unzip_txt( aofa_data[0]+'/'+temp )

In [None]:
# Display the decompressed lines.
aofa_sample

# PREVIOUS PROCEDURE

In [None]:
# NOTE(review): `files` is not defined anywhere in the visible notebook.
t0 = clock.time()
n = len(files)
valid = []
invalid = []
scanned = 0

# Dump the text of the first 10 files as a quick look at their content.
for file_name in files[:10]:
    scanned += 1
    perc_scanned = np.round( scanned / n * 100, 2)
        
    clear_output(wait=True)
    print( "Processing file {} of {} files \t ({}% of total files)".format( scanned, n, perc_scanned ) )
        
    try:
        # The with-block closes the handle even when reading fails part-way.
        with open( source_dir+file_name, 'r' ) as check_file:
            for line in check_file:
                print(line)
    except Exception:
        # Best effort: unreadable files are skipped.
        continue
#        job_obj = pickle.load( pickle_file )
#        
#        # Save data
#        valid.append(job_obj)
#        summary[date]["Saved"] += 1
#        
#        pickle_file.close()
#            
#    except:
#        invalid.append( file_name )
#            
#    print
#    print "Run time: {}s".format( np.round( clock.time() - t0, 1 ) )

In [None]:
# True when the number of invalid entries equals the number of files.
len(invalid) == len(files)


# Resume

In [None]:
# List of date directories in source_dir
dates_list = [ date for date in listdir(source_dir) if len(listdir(source_dir+date)) > 0 ]
all_files = [ source_dir+date+'/'+file_name for date in listdir(source_dir) for file_name in listdir(source_dir+date) ]

#### Confirm Already Scanned Jobs

In [None]:
# Characters 12-18 of every file name already saved under ../data/raw.
collected = [ file_name[12:19] for file_name in listdir('../data/raw') ]
# Set copy for constant-time membership tests; `collected` stays a list so
# that its length still counts every saved file.
collected_ids = set( collected )
# NOTE(review): the [12:19] and [72:] offsets presumably isolate the same job
# id on both sides -- confirm them against the actual path lengths.
remaining = [ file_name for file_name in all_files if file_name[72:] not in collected_ids ]

In [None]:
# Summary of how many files are left to scan (print() calls: the statement
# form is a syntax error under Python 3).
print( "Total files:\t", len(all_files) )
print( "Collected:\t", len(collected) )
print( "--------------------------" )
print( "Remaining:\t", len(remaining) )

## Prep Data Cleaning

In [None]:
def check_val( val ):
    """Return `val` converted to float, or 0 when it cannot be converted."""
    try:
        return float(val)
    except (TypeError, ValueError, OverflowError):
        return 0
        
def convert_dt( val ):
    """Format the epoch seconds `val` as a "YYYY-MM-DD HH:MM:SS" string in UTC."""
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware call produces the same UTC wall-clock fields.
    moment = dt.datetime.fromtimestamp( val, dt.timezone.utc )
    return moment.strftime( "%Y-%m-%d %H:%M:%S" )

def get_schemas( job ):
    """
    Map every stat in `job.schemas` to the list of its schema keys.

    Lists (not dict key views) are returned so that callers can index them
    by position.
    NOTE(review): positional use assumes the key order of each schema matches
    the column order of the data -- confirm.
    """
    return { stat: list( schema.keys() ) for stat,schema in job.schemas.items() }

def get_indices( job, host ):
    """
    Build the (stat, core, schema) tuples for every stat present in both
    `job.schemas` and `host.stats`.

    Order: stats as they appear in job.schemas, then the keys of
    host.stats[stat], then the keys of job.schemas[stat].
    """
    shared_stats = [ name for name in job.schemas.keys() if name in host.stats.keys() ]
    schema_names = { name: fields.keys() for name, fields in job.schemas.items() }
    core_names = { name: per_core.keys() for name, per_core in host.stats.items() }

    return [ (name, core, field)
             for name in shared_stats
             for core in core_names[name]
             for field in schema_names[name] ]

def get_times( job, host ):
    """
    Return the formatted timestamps for one job/host pair: the job start
    time, every entry of `host.times`, then the job end time, each passed
    through convert_dt().
    """
    raw_times = [ job.start_time ] + list( host.times ) + [ job.end_time ]
    return list( map( convert_dt, raw_times ) )

def clean_list( data_list ):
    """Return a new list with check_val() applied to every element of `data_list`."""
    return list( map( check_val, data_list ) )
    
def get_data( host, row_labels ):
    """
    Collect cleaned values from `host.stats` into a dict.

    The dict starts with one empty list per entry of `row_labels`; each
    (stat, core) pair of host.stats then gets its own key holding the cleaned
    last row of the transposed matrix.

    NOTE(review): every row of the transposed matrix is written to the same
    (stat, core) key, so only the last one survives, and these 2-tuples do
    not match 3-tuple row labels -- confirm the intended key.
    """
    data = dict( (label, []) for label in row_labels )

    for stat, node in host.stats.items():
        for core, matrix in node.items():
            for series in matrix.T:
                data[ (stat, core) ] = clean_list( series )

    return data

def fill_df( template_df, data_dict):
    """
    Write the values of `data_dict` into the matching rows of `template_df`.

    template_df -- DataFrame to fill (modified in place).
    data_dict   -- maps a row label of template_df to the values for that
                   row; values are aligned on the column labels, as
                   pandas.Series.update does.

    Returns template_df.
    NOTE(review): a plain list of values gets the index 0..n-1 and therefore
    only matches columns with those labels -- confirm what callers pass.
    """
    for row,data in data_dict.items():
        # Update a copy of the row and assign it back explicitly: updating
        # the object returned by .loc[row] is not guaranteed to reach the
        # frame.
        row_values = template_df.loc[row, :].copy()
        row_values.update( pd.Series(data) )
        template_df.loc[row, :] = row_values

    # The original returned `df`, a name that is not defined in this function.
    return template_df

In [None]:
def cpicore ( job_df, n_cores=24 ):
    """
    Compute a running cycles-per-instruction figure from the 'intel_hsw' rows.

    job_df  -- DataFrame whose first index level contains 'intel_hsw' and
               whose remaining levels are (device, schema); columns are the
               timestamps.
    n_cores -- divisor applied to the per-device sum (default 24, the value
               that used to be hard-coded).

    For every growing window of columns (first 2, first 3, ...), each row is
    averaged over the window, the ratio CLOCKS_UNHALTED_CORE /
    INSTRUCTIONS_RETIRED is formed per device, and the ratios are summed and
    divided by n_cores.  Returns one value per window, i.e.
    len(columns) - 1 values.
    """
    data = job_df.loc['intel_hsw']
    times = job_df.columns.tolist()
    cpicore_list = []
    
    for i in range(1, len(times)):
        window = data[times[:i+1]]
        row_means = { row : np.mean(col.values) for row,col in window.iterrows() }
        # Unique device names in first-seen order, so the sum is accumulated
        # in the same order as before.
        devices = list( dict.fromkeys( key[0] for key in row_means ) )
        
        sum_avgs = 0
        for device in devices:
            sum_avgs += row_means[ (device, 'CLOCKS_UNHALTED_CORE') ] / row_means[ (device, 'INSTRUCTIONS_RETIRED') ]
            
        cpicore_list.append(sum_avgs/n_cores)
    
    return cpicore_list

In [None]:
#def find_next( current, unsorted ):
#    target = current + 00:10:00
#    found = unsorted[0]
#    proximity = target - found
#    
#    if len(unsorted) > 1:
#        for i in range(len(unsorted)):
#            if target - unsorted[i] < proximity:
#                found = unsorted[i]
#                proximity = target - found
#    return found
#
#def fill_sorted( start, unsorted ):
#    sorted_list = []
#    
#    for i in range(len(unsorted)):
#        current = sorted_list[i]
#        next_time = find_next( current, unsorted )
#        sorted_list[i+1] = next_time
#        
#def sort_times( job ):
#    start = job.start
#    mid = job.times
#    end = job.end
#    
#    if start == end:
#        return [start]
#    elif len(mid) < 1:
#        return [start, end]
#    elif len(mid) < 2:
#        return [start, mid[0], end]
#    else:
#        return fill_sorted( start, mid.append(end) )

# Access and open pickled job files
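**Process:**

- Iterate through the files found in the non-empty date folders of `source_dir`.
- A file's contents are appended to `valid` if:
    * The file unpickles without error (it is expected to hold a Job object).
- Files that raise an exception are recorded in `invalid` and skipped.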

In [None]:
# Files to process in this run, and how many there are.
cut = remaining
n = len(cut)

In [None]:
# Per-date counters: "Total" files seen for the date, "Saved" filled in later.
summary = {}

for file_name in cut:
    # characters 61-70 of the path are used as the date key
    date = file_name[61:71]

    if date not in summary:
        summary[date] = { "Total": 0, "Saved": 0 }

    summary[date]["Total"] += 1

In [None]:
# Display the per-date counters.
summary

#### Catch invalid files

In [None]:
t0 = clock.time()
valid = []      # unpickled job objects
invalid = []    # paths that failed to open or unpickle
scanned = 0

for file_name in cut:
    date = file_name[61:71]
    scanned += 1
    perc_scanned = np.round( scanned / n * 100, 2)
        
    clear_output(wait=True)
    print( "Processing file {} of {} files \t ({}% of total files)".format( scanned, n, perc_scanned ) )
        
    try:
        # NOTE: pickle.load can run arbitrary code -- only use it on files
        # from a trusted source.  The with-block closes the handle even when
        # unpickling fails.
        with open( file_name, 'rb' ) as pickle_file:
            job_obj = pickle.load( pickle_file )
            
    except Exception:
        invalid.append( file_name )
        
    else:
        # Save data
        valid.append(job_obj)
        summary[date]["Saved"] += 1
            
    print()
    print( "Run time: {}s".format( np.round( clock.time() - t0, 1 ) ) )

In [None]:
# Persist the list of unreadable files; the with-block flushes and closes
# the output file.
invalid_out = '../src/data/summary_stats/raw_metrics/invalid.pkl'
with open(invalid_out, 'wb') as out_file:
    pickle.dump(invalid, out_file)
#invalid = pickle.load(open(invalid_out, 'rb'))

In [None]:
# Persist the unpickled job objects.  The original cell opened
# `all_rows_out` and dumped `all_rows`, neither of which is defined in this
# notebook; `valid` / `valid_out` match the reload line below.
valid_out = '../src/data/summary_stats/raw_metrics/all_rows.pkl'
with open(valid_out, 'wb') as out_file:
    pickle.dump(valid, out_file)
#valid = pickle.load(open(valid_out, 'rb'))

In [None]:
# Number of files that could not be loaded.
len(invalid)

In [None]:
# Number of job objects loaded.
len(valid)

In [None]:
print( "Total job objects collected:\t", len(valid) )
print()
print( "\tBreakdown of files" )
print( "=========================" )

# .items() is required: iterating the dict itself yields only the date keys,
# which cannot be unpacked into (date, info).
for date,info in summary.items():
    
    print( "Date:\t", date )
    print( info["Saved"], "files saved out of", info["Total"] )
    print()

# Begin Processing

In [None]:
# Jobs to process, and the total number of (job, host) pairs as a float so
# that later percentage arithmetic uses true division.
job_objects = valid
m = float( sum( len(job.hosts.keys()) for job in job_objects ) )

In [None]:
# NOTE(review): `cut` is rebound from the file list to a number here and is
# not read again in the visible notebook.  Floor division keeps it an integer.
cut = len(job_objects)//2
first = 0
stop = len(job_objects)
rem = stop - first

print( "Total Jobs (this date):\t\t", len(job_objects) )
print( "Total Host,Job Pairs:\t\t", int(m) )
print("------------------------------------")
print( "Remaining jobs to scan:\t\t", int(rem) )

In [None]:
job_dfs = {}
t0 = clock.time()
total = 0
current = 0

for job_idx in range( first, stop ):
    job = job_objects[ job_idx ]
    schemas = get_schemas( job )
    total += 1
    
    # support for tracking progress in below print statements
    clear_output(wait=True)
    
    # iterate through each host object job was run on
    # (.items(): dict.iteritems() does not exist under Python 3)
    for host_name, host in job.hosts.items():
        print("Processing hosts for job {} of {} \t ({}% of total)".format(job_idx+1, stop, np.round( (current+first)/m*100, 2)))
        current += 1
        
        # build MultiIndex for df 
        idx_labels = get_indices( job, host )
        indices = pd.MultiIndex.from_tuples( idx_labels, names=['Stat', 'Device', 'Schema'] )
                    
        # process timestamps
        times = get_times( job, host )
    
        # collect job data
        # NOTE(review): `data` is not used below.
        data = get_data( host, idx_labels )
        
        # create df with MultiIndex, ordered times
        df = pd.DataFrame( index=indices, columns=times ).sort_index()
        
        # fill df
        for stat,devices in host.stats.items():
            # list() so the names can be indexed by position even when
            # get_schemas hands back a dict key view
            metric_names = list( schemas[stat] )
            
            for device,data_matrix in devices.items():
                for t_idx in range( len(data_matrix) ):
                    # NOTE(review): times[0] is the job start time, so row 0
                    # of data_matrix is labelled with the start time --
                    # confirm that the matrix rows line up with `times`.
                    timestamp = times[t_idx]
                    
                    for metric_idx in range( len(data_matrix[ t_idx ]) ):
                        metric = metric_names[metric_idx]
                        row_label = (stat,device,metric)
                        datum = data_matrix[t_idx][metric_idx]
                        
                        # single .loc[row, column] assignment: the chained
                        # form df.loc[row][col] = ... may write to a copy
                        df.loc[row_label, timestamp] = check_val(datum)
        
        # save job info from DataFrame to csv file
        df.to_csv( path_or_buf=save_dir+"{}_{}.csv".format( host_name, job.id ) )

In [None]:
# check that no job was missed
if total == ( stop-first ):
    print "Success!"
else:
    print stop - first - total, "jobs missing"