# Performance Data Lookup

In [33]:
# System dependencies
from os import listdir
import time as clock
from datetime import timedelta
from IPython.display import clear_output

In [34]:
import pickle
import gzip
import re

In [35]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [36]:
import prep_IO

In [37]:
# Directory to save to (relative path, resolved against the working directory)
save_dir = '../data/continued/'

# Directory of previous job scans (relative path, resolved against the working directory)
id_dir = '../data/labels/IDs/'

In [38]:
# Directory of recent saved comet jobs.
# The trailing '/' matters: later cells build paths by plain string
# concatenation (source_dir + name).
source_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/'

In [39]:
# List the entries directly under source_dir and display them
contents = listdir(source_dir)
contents

['archive_of_archive',
 'gordon_hostfile_logs',
 'gordon_pickles',
 'comet_accounting',
 'gordon_accounting',
 'comet_pickles',
 'archive',
 '.htaccess',
 'comet_hostfile_logs']

In [40]:
# Full path of every entry directly under source_dir
# (source_dir already ends with '/', so plain concatenation is enough)
possible = [ source_dir+file_name for file_name in listdir(source_dir) ]

# Print the entries that cannot be listed as directories (plain files,
# unreadable directories, ...). Only OSError is caught, so an interrupt or a
# genuine programming error is no longer silently swallowed.
for item in possible:
    try:
        listdir(item)
    except OSError:
        print(item)

/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_hostfile_logs
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/.htaccess
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_hostfile_logs


In [41]:
# Short alias -> directory that the following cells scan.
# Entries currently disabled (kept for reference):
#   host_info   -> source_dir+'comet_hostfile_logs'
#   old_pickles -> source_dir+'comet_pickles'
locs = dict(
    aofa=source_dir+'archive_of_archive',
    job_info=source_dir+'comet_accounting',
    arc=source_dir+'archive',
)

In [42]:
# Show every configured directory, one per line (the alias is not needed here)
for loc in locs.values():
    print(loc)

/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/archive


In [43]:
def _stamp_paths( root ):
    """Return 'root/<host_dir>/<stamp>' for every entry two levels below root."""
    paths = []
    for host_dir in listdir(root):
        host_path = root+'/'+host_dir
        for stamp in listdir(host_path):
            paths.append( host_path+'/'+stamp )
    return paths

# Every stamp file found under each host directory of the two archives
arc_data = _stamp_paths( locs['arc'] )

aofa_data = _stamp_paths( locs['aofa'] )

In [44]:
# Full path of every file in the job accounting directory
acct_info_locs = [ '/'.join(( locs['job_info'], stamp )) for stamp in listdir(locs['job_info']) ]

# The 10 characters that precede the last 4 of each path
# (presumably a date stamp followed by a 4-character extension — verify)
dates = [ acct_path[-14:-4] for acct_path in acct_info_locs ]

In [45]:
len(arc_data)

299014

In [46]:
len(aofa_data)

1809956

In [47]:
len(acct_info_locs)

1228

### Temporary munging

In [52]:
def info_dict( rules, info ):
    """Pair the '|'-separated header fields with the fields of one job line.

    Parameters
    ----------
    rules : str
        Header line; its '|'-separated fields become the dictionary keys.
    info : str
        One data line; its '|'-separated fields become the values.

    Returns
    -------
    dict
        Header field -> value. An empty dict is returned when the two lines
        do not contain the same number of fields.

    Notes
    -----
    A trailing newline is removed from both lines before splitting, so the
    last key/value come out as e.g. 'NodeList' / 'comet-10-36' rather than
    'NodeList\\n' / 'comet-10-36\\n'.
    """
    # Strip the line terminator up front. The previous version tried to patch
    # the last field afterwards with `info_list = saved.append(cut)`, which
    # set info_list to None because list.append returns None.
    rules_list = rules.rstrip("\n").split("|")
    info_list = info.rstrip("\n").split("|")

    if len(rules_list) != len(info_list):
        return {}

    return dict(zip(rules_list, info_list))

def job_to_info_dict( txt_file_list, target=False ):
    """Index the job lines of several accounting files by date label and node.

    Parameters
    ----------
    txt_file_list : list of str
        Paths of the files to read. Characters [-14:-4] of each path are used
        as the label for that file.
    target : optional
        Currently unused (it belonged to a disabled "skip alt files" check);
        kept so existing calls keep working.

    Returns
    -------
    (dict, list)
        nodes_by_date : label -> dict holding
            "rules"    : the first line of the file,
            "multiple" : node -> info for jobs whose node field is longer
                         than 12 characters,
            <node>     : info for all other jobs, keyed by the first 11
                         characters of the node field.
        unsaved : paths whose processing raised an exception. Entries stored
            for such a file before the failure are left in nodes_by_date.

    Notes
    -----
    Relies on `open_txt`, `format_nodelist` and `info_dict` being available in
    the notebook namespace; the first two are not defined in the cells shown
    here (presumably they mirror the prep_IO helpers — verify).
    """
    nodes_by_date = {}
    unsaved = []

    for date in txt_file_list:
        try:
            # read in file contents
            contents = open_txt( date )

            # formatting
            label = date[-14:-4]
            rules = contents[0]
            jobs = contents[1:]

            # template to save
            nodes_by_date[ label ] = {}
            nodes_by_date[ label ]["multiple"] = {}
            nodes_by_date[ label ]["rules"] = rules

            # run through lines in file
            for job in jobs:
                node = job.split("|")[-1]
                # info_dict splits the line itself, so it must receive the raw
                # string; passing the already-split list made it fail on
                # `.split` for every single job.
                info = info_dict( rules, job )

                # save multiple node jobs to specified loc
                if len(node) > 12:
                    for multi_node in format_nodelist( info ):
                        nodes_by_date[ label ][ "multiple" ][ multi_node ] = info

                else:
                    nodes_by_date[ label ][ node[:11] ] = info

        # Best effort: remember the file and move on, but let
        # KeyboardInterrupt / SystemExit propagate.
        except Exception:
            unsaved.append(date)

    return nodes_by_date, unsaved

def failed_search( collected_dict_list ):
    """Walk a list of job-info dicts and read the fields a search would need.

    Parameters
    ----------
    collected_dict_list : list of dict
        Each dict must contain the keys 'NodeList', 'Start', 'End' and 'JobID'.

    Returns
    -------
    None
        The function is an unfinished stub: the values are extracted but not
        used yet.

    Raises
    ------
    KeyError
        If a dict lacks one of the four keys.
    """
    # Iterate the list itself; the previous `range( collected_dict_list )`
    # raised TypeError because range() needs an integer, not a list.
    for temp in collected_dict_list:
        nodelist = temp['NodeList']
        s = temp['Start']
        e = temp['End']
        jobid = temp['JobID']

#### Pre-process available data

In [17]:
### Process all ###
#arc_hosts = [ prep_IO.host_to_info_dict( host_file ) for host_file in arc_data[:100] ]
#aofa_hosts = [ host_to_info_dict( host_file ) for host_file in aofa_data ]
#acct_dates = [ prep_IO.job_to_info_dict( acct_info_locs )

# Compile failed jobs

In [49]:
# Collect one parsed record per failed job from the first ten accounting files
chunky_list = []

for i in range(10):
    acct_path = acct_info_locs[i]
    failed = prep_IO.collect_failed( acct_path )
    # first element returned by open_txt is used as the field header
    rules = prep_IO.open_txt( acct_path )[0]

    chunky_list.extend( prep_IO.info_dict( rules, failed[j] ) for j in range(len( failed )) )

chunky_list

[{'JobID': '26753815',
  'User': 'niloo',
  'Account': 'mqt102',
  'Start': '2019-09-17T08:56:01',
  'End': '2019-09-19T00:38:37',
  'Submit': '2019-09-17T08:55:58',
  'Partition': 'shared',
  'Timelimit': '2-00:00:00',
  'JobName': 'g_TSA-thiolate-pcm-5watersv2.com',
  'State': 'FAILED',
  'NNodes': '1',
  'ReqCPUS': '8',
  'NodeList\n': 'comet-10-36\n'},
 {'JobID': '26753954',
  'User': 'niloo',
  'Account': 'mqt102',
  'Start': '2019-09-17T09:02:11',
  'End': '2019-09-19T05:17:58',
  'Submit': '2019-09-17T09:02:08',
  'Partition': 'shared',
  'Timelimit': '2-00:00:00',
  'JobName': 'g_TSA-hyb5.com',
  'State': 'FAILED',
  'NNodes': '1',
  'ReqCPUS': '8',
  'NodeList\n': 'comet-04-60\n'},
 {'JobID': '26755181',
  'User': 'imsaid2',
  'Account': 'uic346',
  'Start': '2019-09-17T10:25:48',
  'End': '2019-09-19T05:15:10',
  'Submit': '2019-09-17T10:25:22',
  'Partition': 'compute',
  'Timelimit': '3-00:00:00',
  'JobName': 'job5-86',
  'State': 'FAILED',
  'NNodes': '1',
  'ReqCPUS': '2

In [50]:
len(chunky_list)

22608

In [51]:
# Take the node-list value of the first record; note that the key itself
# ends with a newline character.
sample = chunky_list[0]['NodeList\n']
# Check whether the value contains a newline as well
'\n' in sample

True

In [32]:
sample[:-1]

'comet-10-36'

### Previous host processing

In [24]:
# Unique (host, job id) pairs, collected in two different ways:
#   a -> job id read from position 5 of a "Timely Data" tuple
#   b -> job id read from the "Job" entry referenced by position 4 of the tuple
# NOTE: this cell needs `arc_hosts`, whose construction is commented out in
# the "Process all" cell; on a fresh kernel it stops with a NameError.
job_host_pairs_a = []
job_host_pairs_b = []

for host_file in arc_hosts:

    for host_i_name in list(host_file.keys()):
        timely_list = host_file[ host_i_name ][ "Timely Data" ]

        for timely_tup in timely_list:
            try_job_info = host_file[ host_i_name ][ timely_tup[4] ][ "Job" ]

            # check each tup in timely_data list for a jobid
            if len( timely_tup ) > 5:
                # Read the id from the tuple that was just length-checked; the
                # previous version indexed timely_list with the host counter j
                # and therefore picked an unrelated tuple (or raised IndexError).
                save_tup = ( host_i_name,timely_tup[5] )
                if save_tup not in job_host_pairs_a:
                    job_host_pairs_a.append( save_tup )

            if type(try_job_info) is dict and "Jobid" in try_job_info:
                save_tup = ( host_i_name,try_job_info[ "Jobid" ] )
                if save_tup not in job_host_pairs_b:
                    job_host_pairs_b.append( save_tup )

NameError: name 'arc_hosts' is not defined

In [None]:
len(job_host_pairs_a)

In [None]:
len(job_host_pairs_b)

In [None]:
timely_list

### View

In [None]:
arc_hosts[0]['comet-10-14']["Timely Data"]