#### Dependencies

In [1]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [2]:
import pickle
import gzip
import re

In [3]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [4]:
# Directory to save to
save_dir = '../data/continued/'

# Directory of prev job scans
id_dir = '../data/labels/IDs/'

In [5]:
# Directory of recent saved comet jobs
source_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/'

In [6]:
contents = listdir(source_dir)
contents

['archive_of_archive',
 'gordon_hostfile_logs',
 'gordon_pickles',
 'comet_accounting',
 'gordon_accounting',
 'comet_pickles',
 'archive',
 '.htaccess',
 'comet_hostfile_logs']

In [7]:
possible = [ source_dir+file_name for file_name in listdir(source_dir) ]

for item in possible:
    try:
        listdir(item)
    except:
        print(item)

/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_hostfile_logs
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/.htaccess
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_hostfile_logs


In [8]:
locs = { 'aofa': source_dir+'archive_of_archive',
         'job_info': source_dir+'comet_accounting',
         'arc': source_dir+'archive'
         #'host_info': source_dir+'comet_hostfile_logs',
         #'old_pickles': source_dir+'comet_pickles'
       }

In [9]:
for key,loc in locs.items():
    print(loc)

/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/archive


## Prep Cleaning

In [10]:
def get_time( spec=None ):
    if type(spec) is str:
        spec = float( spec )
    
    return clock.strftime("%Y/%m/%d %H:%M:%S", clock.localtime( spec ))

In [11]:
def check_header( line ):
    if line.find(" ") < 0:
        try:
            return line[0] == '%'
        except:
            return False
        
    else:
        chunks = line.split(" ")
        try:
            return (chunks[0][0] == '%') or ( chunks[2].find("comet") >= 0 )
        except:
            return False

In [12]:
def check_job( chunk ):
    return chunk.find("-") == -1

In [13]:
def open_txt( txt_file ):
    
    with open( txt_file, "rt" ) as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [14]:
def unzip_txt( gzipped ):
    
    with gzip.open( gzipped, 'rt') as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [15]:
def quick_save( obj, label=get_time() ):
    
    try:
        out_file = open( label, 'wb')
        pickle.dump( obj, out_file)
        
        # double check save
        check_cpicore_set = pickle.load(open(cpiset_out, 'rb'))
        check_cpicore_set = None
        
    except:
        "There was a problem pickling the object - Save manually."

In [16]:
def info_dict( rules, info ):
    rules_list = rules.split("|")
    
    if len(rules_list) != len(info):
        return {}
    
    else:
        return { rules_list[i]:info[i] for i in range(len(rules_list)) }

In [17]:
def host_to_info_dict( zip_txt ):
    contents = unzip_txt( zip_txt )
    host_name = contents[1].partition(" ")[2][:11]
    out_dict = { host_name: {} }
    host_info = {}
    info_dict = { "Data":{},
                    "Job":"N/A",
                    "Schemas":{},
                    "Specs":[]
                }
    
    for line in contents:
            
        if line[0] == "$":
            info_dict["Specs"].append( format_spec( line ) )
            
        elif line[0] == "!":
            info_dict["Schemas"].update( format_schema( line ) )
        
        else:
            
            if (len(line) > 0) and (len(line) < 3 or check_header( line )):
                header_dict = format_header( line )
                
                if header_dict:
                    t = header_dict["Timestamp"]
                    host_info[ t ] = {}
                    
                    if check_job( header_dict["Jobid"] ):
                        info_dict["Job"] = { "Jobid": header_dict["Jobid"] } 
                    
            else:
                incoming = format_data( line )
                info_dict["Data"].update(incoming)
                
                host_info[t].update( info_dict )
                
    out_dict[host_name].update( host_info )
    
    return out_dict

In [18]:
def job_to_info_dict( txt_file_list ):
    nodes_by_date = {}
    unsaved = []

    for date in txt_file_list:
        try:
            # skip alt files
            #check_stamp = int( date[-14] )
            
            # read in file contents
            contents = open_txt( date )
            
            # formatting
            label = date[-14:-4]
            rules = contents[0]
            jobs = contents[1:]
            
            # template to save
            nodes_by_date[ label ] = {}
            nodes_by_date[ label ]["multiple"] = {}
            nodes_by_date[ label ]["rules"] = rules
            
            # run through lines in file
            for job in jobs:
                line = job.split("|")
                node = line[-1]
                info = info_dict( rules, line )
                
                # save multiple node jobs to specified loc
                if len(node) > 12:                
                    nodes_by_date[ label ][ "multiple" ][ node ] = info
                
                else:
                    nodes_by_date[ label ][ node[:11] ] = info
        except:
            unsaved.append(date)
            
    
    return nodes_by_date, unsaved

In [19]:
def format_header( line ):
    chunks = line.split(" ")
    
    try:
        if chunks[0][0] == '%':
            return {}
        else:
            return { "Timestamp": get_time( chunks[0] ), 
                     "Jobid": chunks[1],
                     "Host": chunks[2][:11] }
        
    except:
        return {}

In [20]:
def format_spec( line ):
    return line[1:-1]

In [21]:
def format_schema( line ):
    chunks = line.partition(" ")
    
    stat = chunks[0][1:]
    schemas = chunks[2:]
    
    return { stat:schemas }

In [22]:
def format_data( line ):
    chunks = line.split(" ")
    
    stat = chunks[0]
    dev = chunks[1]
    data = chunks[2:-1]
    
    return { (stat,dev): data }

### Parse file in archive

In [23]:
arc_data = [ locs['arc']+'/'+host_dir+'/'+stamp 
            for host_dir in listdir(locs['arc'])
            for stamp in listdir(locs['arc']+'/'+host_dir)  ]

In [24]:
len(arc_data)

251599

In [25]:
arc_data

['/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587951665.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1593826873.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1588902066.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1590198068.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1591666870.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587865265.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587606065.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1590716468.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1595468475.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1585273261.gz',
 '/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1588

In [26]:
group = [arc_data[0], arc_data[798], arc_data[23], arc_data[399], arc_data[25178] ]
for temp_loc in group:
    print(temp_loc)
    print()

/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587951665.gz

/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-11-10.sdsc.edu/1589247663.gz

/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1584582060.gz

/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-15-11.sdsc.edu/1594258904.gz

/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-04-62.sdsc.edu/1590975625.gz



In [27]:
test_group = {}

for temp_loc in group:
    test_group.update( host_to_info_dict( temp_loc ) )

In [28]:
hosts = list(test_group.keys())
hosts

['comet-10-14', 'comet-11-10', 'comet-15-11', 'comet-04-62']

In [29]:
h1 = hosts[0]
h1_times = list(test_group[ h1 ].keys())
t1 = h1_times[0]
h1_t1 = test_group[ h1 ][ t1 ]
h1_t1

{'Data': {('block', 'md0'): ['321',
   '0',
   '15300',
   '0',
   '614',
   '0',
   '44',
   '0',
   '0',
   '0'],
  ('block', 'md1'): ['168687',
   '0',
   '5032482',
   '0',
   '2240906',
   '0',
   '155520960',
   '0',
   '0',
   '0'],
  ('block', 'sda'): ['337102',
   '2692',
   '30887991',
   '270488',
   '8734605',
   '449498',
   '1126639671',
   '25380728',
   '0',
   '3126016'],
  ('block', 'sdb'): ['337501',
   '2370',
   '31313587',
   '256549',
   '8686096',
   '419516',
   '1134250279',
   '28363861',
   '0',
   '3180292'],
  ('cpu', '22'): ['24400672', '190', '1728962', '53644456', '42069', '0'],
  ('cpu', '20'): ['37820030', '36', '1622677', '40270686', '102463', '0'],
  ('cpu', '21'): ['30259755', '30', '1545549', '47980386', '36400', '0'],
  ('cpu', '11'): ['46397598', '4', '1668264', '31697887', '58628', '0'],
  ('cpu', '10'): ['39917159', '0', '1760080', '38126264', '12142', '0'],
  ('cpu', '13'): ['44792651', '81', '1688054', '33313807', '24450', '0'],
  ('cpu', '1

In [30]:
test_cases = []

for host,t_dict in test_group.items():
    for t in t_dict.keys():
        if type(test_group[ host ][ t ]['Job']) is dict:
            jobid = test_group[ host ][ t ]['Job']['Jobid']
            test_cases.append( (host,jobid,t) )

In [68]:
test_jobs = {}

for case in test_cases:
    host = case[0]
    jobid = case[1]
    date = case[2]
    
    if (host,jobid) not in test_jobs.keys():
        test_jobs[ (host,jobid) ] = date

In [69]:
test_cases

[('comet-10-14', '32182941', '2020/03/19 00:18:10'),
 ('comet-10-14', '32182941', '2020/03/19 00:28:11'),
 ('comet-10-14', '32182941', '2020/03/19 00:38:11'),
 ('comet-10-14', '32182941', '2020/03/19 00:48:11'),
 ('comet-10-14', '32182941', '2020/03/19 00:58:11'),
 ('comet-10-14', '32182941', '2020/03/19 01:08:11'),
 ('comet-10-14', '32182941', '2020/03/19 01:18:11'),
 ('comet-10-14', '32182941', '2020/03/19 01:28:12'),
 ('comet-10-14', '32182941', '2020/03/19 01:38:12'),
 ('comet-10-14', '32182941', '2020/03/19 01:48:12'),
 ('comet-10-14', '32182941', '2020/03/19 01:58:12'),
 ('comet-10-14', '32182941', '2020/03/19 02:08:12'),
 ('comet-10-14', '32182941', '2020/03/19 02:18:12'),
 ('comet-10-14', '32182941', '2020/03/19 02:28:13'),
 ('comet-10-14', '32182941', '2020/03/19 02:32:17'),
 ('comet-10-14', '32182941', '2020/03/19 03:32:18'),
 ('comet-10-14', '32182941', '2020/03/19 03:42:35'),
 ('comet-10-14', '32182941', '2020/03/19 04:42:35'),
 ('comet-10-14', '32182941', '2020/03/19 05:42

In [70]:
test_jobs

{('comet-10-14', '32182941'): '2020/03/19 00:18:10',
 ('comet-11-10', '33189989'): '2020/05/11 18:41:03',
 ('comet-11-10', '33283561'): '2020/05/12 06:30:04',
 ('comet-11-10', '33283089'): '2020/05/12 06:32:14',
 ('comet-15-11', '34583358'): '2020/07/08 18:41:44',
 ('comet-04-62', '33706563'): '2020/05/31 18:40:25',
 ('comet-04-62', '33817890'): '2020/06/01 14:22:04',
 ('comet-04-62', '33817891'): '2020/06/01 14:22:11',
 ('comet-04-62', '33817977'): '2020/06/01 14:38:09',
 ('comet-04-62', '33825317'): '2020/06/01 15:26:56',
 ('comet-04-62', '33825097'): '2020/06/01 15:30:13'}

### Parse file in comet_accounting

In [35]:
acct_info_locs = [ locs['job_info']+'/'+stamp for stamp in listdir(locs['job_info']) ]
acct_dict = [ loc[-14:-4] for loc in acct_info_locs ]

In [36]:
acct_dict

['2019-09-19',
 '2017-08-29',
 '2018-08-02',
 '2019-04-10',
 '2020-04-14',
 '2018-07-28',
 '2018-11-26',
 '2017-05-20',
 '2019-02-09',
 '2019-05-02',
 '2020-05-06',
 '2018-01-31',
 '2017-06-11',
 '2018-12-17',
 '2018-04-19',
 '2018-03-12',
 '2019-07-21',
 '2018-09-10',
 '2020-05-01',
 '2019-05-05',
 '2018-12-10',
 '2017-06-16',
 '2017-10-18',
 '2019-07-26',
 '2020-07-22',
 '2018-03-15',
 '2018-09-17',
 '2018-08-05',
 '2020-04-13',
 '2019-04-17',
 '2020-03-18',
 '2020-06-30',
 '2018-02-07',
 '2017-05-27',
 '2018-11-21',
 '2017-06-18',
 '2017-01-13',
 '2017-03-30',
 '2019-07-28',
 '2018-04-10',
 '2017-10-16',
 '2019-08-02',
 '2018-09-19',
 '2019-09-10',
 '2017-08-20',
 '2018-07-21',
 '2020-03-16',
 '2019-04-19',
 '2018-05-02',
 '2017-11-04',
 '2019-01-31',
 '2018-02-09',
 '2017-02-22',
 '2017-05-29',
 '2019-09-17',
 '2017-08-27',
 '2020-03-11',
 '2018-07-26',
 '2019-03-15',
 '2017-11-03',
 '2018-05-05',
 '2017-02-25',
 '2018-11-28',
 '2020-05-08',
 '2019-02-07',
 '2018-12-19',
 '2017-01-

In [76]:
for job_tup,t in test_jobs.items():
    if t[:10] in acct_dict:
        print("Found:", job_tup, t)
    else:
        print("Not Found:", job_tup, t)

Not Found: ('comet-10-14', '32182941') 2020/03/19 00:18:10
Not Found: ('comet-11-10', '33189989') 2020/05/11 18:41:03
Not Found: ('comet-11-10', '33283561') 2020/05/12 06:30:04
Not Found: ('comet-11-10', '33283089') 2020/05/12 06:32:14
Not Found: ('comet-15-11', '34583358') 2020/07/08 18:41:44
Not Found: ('comet-04-62', '33706563') 2020/05/31 18:40:25
Not Found: ('comet-04-62', '33817890') 2020/06/01 14:22:04
Not Found: ('comet-04-62', '33817891') 2020/06/01 14:22:11
Not Found: ('comet-04-62', '33817977') 2020/06/01 14:38:09
Not Found: ('comet-04-62', '33825317') 2020/06/01 15:26:56
Not Found: ('comet-04-62', '33825097') 2020/06/01 15:30:13


In [38]:
temp = acct_info_locs[0]
temp

'/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting/2019-09-19.txt'

In [39]:
job_sample = open_txt( temp )

In [40]:
rules = job_sample[0]
rules.split("|")

['JobID',
 'User',
 'Account',
 'Start',
 'End',
 'Submit',
 'Partition',
 'Timelimit',
 'JobName',
 'State',
 'NNodes',
 'ReqCPUS',
 'NodeList\n']

In [41]:
job_sample[1].split("|")

['26617639',
 'cipres',
 'sds121',
 '2019-09-12T05:12:16',
 '2019-09-19T05:12:30',
 '2019-09-12T05:12:10',
 'shared',
 '7-00:00:00',
 'NGBW-JOB-MRBAYES_XSEDE-1AA1022FA22D445DAB262C7D61CB6616',
 'TIMEOUT',
 '1',
 '16',
 'comet-04-40\n']

In [42]:
job_sample[1:]

['26617639|cipres|sds121|2019-09-12T05:12:16|2019-09-19T05:12:30|2019-09-12T05:12:10|shared|7-00:00:00|NGBW-JOB-MRBAYES_XSEDE-1AA1022FA22D445DAB262C7D61CB6616|TIMEOUT|1|16|comet-04-40\n',
 '26618425|cipres|sds121|2019-09-12T06:27:21|2019-09-19T06:27:28|2019-09-12T06:27:21|compute|7-00:00:00|NGBW-JOB-JMODELTEST2_XSEDE-A7B81541E1DF42F5A332ED19BE47DAE2|TIMEOUT|1|24|comet-17-48\n',
 '26618465|cipres|sds121|2019-09-12T06:28:54|2019-09-19T06:28:59|2019-09-12T06:28:51|compute|7-00:00:00|NGBW-JOB-JMODELTEST2_XSEDE-32AF6F93F6AB4ADAA6FC62855CC9B261|TIMEOUT|1|24|comet-27-64\n',
 '26619940|cipres|sds121|2019-09-12T09:06:06|2019-09-19T09:06:36|2019-09-12T09:06:03|compute|7-00:00:00|NGBW-JOB-GPHOCS_XSEDE-23CCC6C010CD4526B4514144918E7145|TIMEOUT|1|24|comet-04-69\n',
 '26694681|jwestern|cit121|2019-09-18T23:29:00|2019-09-19T23:28:26|2019-09-14T16:42:50|compute|23:59:00|rp12e07g2C0.17|TIMEOUT|16|384|comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n',
 '26697694|cipres|sds121|2019-09-14T21:18:44|2019-09-

In [43]:
nodes_dict,not_found = job_to_info_dict( acct_info_locs )

In [44]:
nodes_dict['2019-09-19']

{'multiple': {'comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n': {'JobID': '26694681',
   'User': 'jwestern',
   'Account': 'cit121',
   'Start': '2019-09-18T23:29:00',
   'End': '2019-09-19T23:28:26',
   'Submit': '2019-09-14T16:42:50',
   'Partition': 'compute',
   'Timelimit': '23:59:00',
   'JobName': 'rp12e07g2C0.17',
   'State': 'TIMEOUT',
   'NNodes': '16',
   'ReqCPUS': '384',
   'NodeList\n': 'comet-14-[37-38,42-44,56-57,59-60,62-64,66-69]\n'},
  'comet-14-[10,17,21,23,26,32,36,40],comet-16-[08,11-16,18,26-32,49,69],comet-27-[19-20,31-32,39,42,44,48-49,56-57,63,66-68]\n': {'JobID': '26745059',
   'User': 'liwenfei',
   'Account': 'cla174',
   'Start': '2019-09-17T18:48:24',
   'End': '2019-09-19T10:56:19',
   'Submit': '2019-09-16T22:22:22',
   'Partition': 'compute',
   'Timelimit': '2-00:00:00',
   'JobName': 'batch_script',
   'State': 'COMPLETED',
   'NNodes': '40',
   'ReqCPUS': '960',
   'NodeList\n': 'comet-14-[10,17,21,23,26,32,36,40],comet-16-[08,11-16,18,26-32,49,69

### Parse file in archive of archive

In [45]:
#aofa_data = [ locs['aofa']+'/'+stamp for stamp in listdir(locs['aofa']) ]

In [46]:
#temp = listdir(aofa_data[0])[0]
#aofa_data[0]+'/'+temp

In [47]:
#aofa_sample = unzip_txt( aofa_data[0]+'/'+temp )

In [48]:
#aofa_sample