In [1]:
# System dependencies
from os import listdir
import time as clock
from datetime import timedelta
from IPython.display import clear_output

In [2]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [3]:
# Custom data handling methods
import prep_IO

In [4]:
# Source of the pre-determined Lustre failed-job set.
# NOTE(review): the name says "dir" but the value is a .txt path, while
# get_dfs() builds paths as `source_dir + job_file` -- confirm which is intended.
source_dir = './lustre_fail_set.txt'

# Locations of the search contents: a failed-jobs listing and the base
# directory of the accounting data (presumably read by prep_IO -- TODO confirm).
src = './lustre.failed.jobs2.txt'
acct_base = '/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting/'

## Temp A

In [6]:
# Hand-picked jobs to search for ("up to error" in the original note).
# Each tuple is (node list, timestamp, timestamp, job id); the two timestamps
# are presumably the job's start and end -- TODO confirm against prep_IO.search.
chosen = [('comet-14-72,comet-19-19,comet-28-55', '2020-05-13T02:11:38', '2020-05-13T05:07:34', '33301074'),
         ('comet-30-10', '2020-05-13T10:50:08', '2020-05-13T10:50:12', '33321014'),
          ('comet-06-46,comet-12-52,comet-22-[39,64]', '2020-05-12T06:40:43', '2020-05-13T14:27:44', '33283100'),
          ('comet-21-07', '2020-05-27T08:49:26', '2020-05-28T01:27:49', '33637231'),
          ('comet-22-48', '2020-05-27T09:32:35', '2020-05-28T00:25:06', '33637422'),
         ]

## Testing A

In [7]:
# Run the prep_IO search over the hand-picked jobs in `chosen`
# (mode='l' presumably selects the list-driven lookup -- TODO confirm in prep_IO).
search_out = prep_IO.search( mode='l', from_list=chosen )

In [8]:
# All result keys except the first one, followed by how many remain.
# NOTE(review): why the first key is skipped is not evident here -- confirm.
keys = list(search_out.keys())[1:]
len(keys)

9

In [9]:
# Inspect the first remaining result: print its key and the sections it holds.
sample_key = keys[0]
sample = search_out[ sample_key ]
print( sample_key )
print( sample.keys() )

('comet-19-19', 1589361098, 1589371654, '33301074')
dict_keys(['Acct Info', 'Host Info', 'Source Files'])


In [10]:
# Show the sampled job's 'Acct Info' entry (a single '|'-delimited string).
sample['Acct Info']

'33301074|haqi|uic304|2020-05-13T02:11:38|2020-05-13T05:07:34|2020-05-12T02:50:25|compute|1-00:00:00|eq-5MN6-01_sim2|FAILED|3|6|comet-14-72,comet-19-19,comet-28-55\n'

In [11]:
# List the sub-sections available under 'Host Info'.
sample['Host Info'].keys()

dict_keys(['Timely Data', 'Schemas', 'Specs'])

In [12]:
# Show the host's 'Specs' entries (a list of strings).
sample['Host Info']["Specs"]

['tacc_stats 2.3.1',
 'hostname comet-19-19.sdsc.edu',
 'uname Linux x86_64 2.6.32-642.13.1.el6.x86_64 #1 SMP Wed Jan 11 20:56:24 UTC 2017',
 'uptime 864119']

In [13]:
# Show the host's 'Timely Data' records (a list of 5-field tuples).
# NOTE(review): this displays the whole list; consider slicing to keep the output short.
sample['Host Info']["Timely Data"]

[('block', 'io_ticks(U=ms)', 'md0', 0, '2017-02-24T20:43:45'),
 ('block', 'io_ticks(U=ms)', 'md1', 0, '2017-02-24T20:43:45'),
 ('block', 'io_ticks(U=ms)', 'sda', 20963407, '2017-02-24T20:43:45'),
 ('block', 'io_ticks(U=ms)', 'sdb', 21027378, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '22', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '20', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '21', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '11', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '10', 1, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '13', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '12', 1, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '15', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '14', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '17', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '16', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '19', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)', '18', 0, '2017-02-24T20:43:45'),
 ('cpu', 'irq(U=cs)

## Temp B

In [None]:
def check_len ( df, num ):
    """Report whether ``df`` has strictly more than ``num`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column count is inspected.
    num : int
        Threshold the column count must exceed.

    Returns
    -------
    bool
        True when the number of columns is greater than ``num``.
    """
    column_labels = df.columns.values.tolist()
    n_columns = len(column_labels)
    return n_columns > num

def purge_str ( df ):
    """Convert string cells of ``df`` to ``np.float64`` in place.

    Certain numeric responses are recorded as str. Each string cell is parsed
    as a float; strings that cannot be parsed are replaced with 0. Cells that
    are not strings are left untouched.

    The previous version attached an ``else:`` clause to the ``try``. That
    clause runs exactly when parsing succeeds, so every parsed value was
    immediately overwritten with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for row, cells in df.iterrows():
        for column, val in cells.items():
            if not isinstance(val, str):
                continue

            try:
                cleaned = np.float64( val )
            except ValueError:
                # non-numeric text (e.g. 'n/a') falls back to zero
                cleaned = np.float64(0)

            df.at[row, column] = cleaned

    return df
    
def get_host_id ( file_name ):
    """Split a ``<host>_<jobid>`` file name into truncated host and job id.

    Parameters
    ----------
    file_name : str
        Name containing exactly one underscore.

    Returns
    -------
    tuple of (str, str)
        The first 11 characters of the part before the underscore and the
        first 7 characters of the part after it.
        NOTE(review): the widths 11 and 7 presumably match names such as
        'comet-19-19' and 7-digit job ids -- confirm against the file naming.

    Raises
    ------
    ValueError
        If ``file_name`` does not split into exactly two parts on '_'.
    """
    pieces = file_name.split('_')
    host_part, job_part = pieces
    return host_part[:11], job_part[:7]

def get_dfs ( file_list, min_jobs, min_cycles=0 ):
    """Load per-host job CSV files into a ``{jobid: {host: DataFrame}}`` mapping.

    Files are read in order from ``source_dir + job_file`` (``source_dir`` is
    a notebook-level global), cleaned with ``purge_str`` and kept only when
    they have more than ``min_cycles`` columns. Loading stops once
    ``min_jobs`` files have been accepted.

    Changes from the previous version:
    * hosts of the same job are accumulated; previously
      ``job_dfs[jobid] = {}`` discarded any host already stored for that job;
    * rejected files are skipped with ``continue`` and no longer counted;
      the old bare ``next`` was a no-op expression;
    * the loop stops as soon as the quota is reached.

    Parameters
    ----------
    file_list : list of str
        File names of the form accepted by ``get_host_id``.
    min_jobs : int
        Number of accepted files after which loading stops. Despite the
        name, it acts as an upper bound.
    min_cycles : int, optional
        A file is accepted only if its frame has more than this many columns.

    Returns
    -------
    dict
        ``{jobid: {host: DataFrame}}`` for every accepted file.
    """
    job_dfs = {}
    count = 0

    for job_file in file_list:
        if count >= min_jobs:
            break

        df = purge_str( pd.read_csv( source_dir+job_file, index_col=[0,1,2], low_memory=False ) )
        host,jobid = get_host_id( job_file )

        # too few columns: skip the file without counting it
        if not check_len( df, min_cycles ):
            continue

        # keep hosts already collected for this job
        job_dfs.setdefault( jobid, {} )[host] = df
        count += 1

    return job_dfs
        
def sort_hosts ( file_list ):
    """Group truncated job ids by the host part of each file name.

    Parameters
    ----------
    file_list : iterable of str
        Names of the form ``<host>_<jobid...>`` with exactly one underscore.

    Returns
    -------
    dict
        ``{host: [jobid[:7], ...]}`` in order of first appearance. The host
        part is used as-is (not truncated).

    Raises
    ------
    ValueError
        If a name does not split into exactly two parts on '_'.
    """
    jobs_by_host = {}

    for name in file_list:
        host_part, job_part = name.split('_')
        jobs_by_host.setdefault( host_part, [] ).append( job_part[:7] )

    return jobs_by_host
            
def sort_jobs ( file_list, job_dfs ):
    """Group hosts by job id for the jobs present in ``job_dfs``.

    Parameters
    ----------
    file_list : iterable of str
        File names accepted by ``get_host_id``.
    job_dfs : dict
        Mapping keyed by job id; only its keys are consulted.

    Returns
    -------
    tuple of (dict, bool)
        ``{jobid: [host, ...]}`` and whether any job has more than one host
        (as reported by ``multiple_hosts``).
    """
    jobs = {}

    for job_file in file_list:
        # The previous version called get_id(), which is not defined in the
        # visible notebook; get_host_id() is the helper that parses these names.
        host,jobid = get_host_id( job_file )

        if jobid in job_dfs:
            jobs.setdefault( jobid, [] ).append( host )

    return jobs, multiple_hosts(jobs)

def multiple_hosts ( jobs_dict ):
    """Tell whether at least one job maps to more than one host.

    Parameters
    ----------
    jobs_dict : dict
        ``{jobid: sized collection of hosts}``.

    Returns
    -------
    bool
        True if any value has length greater than 1, otherwise False.
    """
    for host_list in jobs_dict.values():
        if len(host_list) > 1:
            return True
    return False

def cpicore_simple ( job_df, monitor=False, n_cores=24 ):
    """Cumulative cycles-per-instruction (core clock) over a job's samples.

    For every column position ``i >= 1`` the counters under ``'intel_hsw'``
    are averaged over columns ``0..i``. For each device the mean
    ``CLOCKS_UNHALTED_CORE`` is divided by the mean ``INSTRUCTIONS_RETIRED``.
    The per-device ratios are summed and divided by ``n_cores``.

    Parameters
    ----------
    job_df : pandas.DataFrame
        Frame whose first index level contains ``'intel_hsw'`` and whose
        remaining levels are ``(device, counter name)``; columns are the
        sample times.
    monitor : bool, optional
        When true, return the value for every step instead of only the last.
    n_cores : int, optional
        Divisor applied to the summed per-device ratios. Defaults to the
        previously hard-coded 24 (presumably the cores per node -- confirm).

    Returns
    -------
    list or number
        With ``monitor`` the list of per-step values (empty when there are
        fewer than two columns). Otherwise the last step's value, or 0 when
        there are fewer than two columns. 0 is the same "no data" placeholder
        cpicore() uses; the previous version raised UnboundLocalError in
        that case.
    """
    data = job_df.loc['intel_hsw']
    times = job_df.columns.tolist()
    cpicore_list = []
    latest = 0

    for i in range(1, len(times)):
        # cumulative window: every sample from the first up to position i
        chunk = data[times[:i+1]]
        devices = { row : np.mean(col.values) for row,col in chunk.iterrows() }
        sum_avgs = 0

        # dict.fromkeys de-duplicates device ids while keeping their order
        for device in dict.fromkeys( key[0] for key in devices ):
            sum_avgs += devices[ (device, 'CLOCKS_UNHALTED_CORE') ] / devices[ (device, 'INSTRUCTIONS_RETIRED') ]

        latest = sum_avgs/n_cores
        cpicore_list.append(latest)

    if monitor:
        return cpicore_list

    return latest

def cpicore ( job_df, monitor=False, n_cores=24 ):
    """Cumulative cycles-per-instruction (core clock), keyed by sample time.

    Same computation as ``cpicore_simple``. For every column position
    ``i >= 1`` the ``'intel_hsw'`` counters are averaged over columns
    ``0..i``. Each device's mean ``CLOCKS_UNHALTED_CORE`` is divided by its
    mean ``INSTRUCTIONS_RETIRED``, and the ratios are summed and divided by
    ``n_cores``. The result is stored under the column label of position ``i``.

    Parameters
    ----------
    job_df : pandas.DataFrame
        Frame whose first index level contains ``'intel_hsw'`` and whose
        remaining levels are ``(device, counter name)``; columns are the
        sample times.
    monitor : bool, optional
        When true, return the per-time mapping instead of only the last value.
    n_cores : int, optional
        Divisor applied to the summed per-device ratios. Defaults to the
        previously hard-coded 24 (presumably the cores per node -- confirm).

    Returns
    -------
    collections.OrderedDict or number
        With ``monitor`` an ordered ``{time: value}`` mapping in which the
        first time always keeps the placeholder 0. Otherwise the last step's
        value, or 0 when there are fewer than two columns (the previous
        version raised UnboundLocalError in that case).
    """
    # Imported locally because the notebook's visible import cells never
    # bring OrderedDict into scope.
    from collections import OrderedDict

    data = job_df.loc['intel_hsw']
    times = job_df.columns.tolist()
    cpicore_dict = OrderedDict( (t, 0) for t in times )
    latest = 0

    for i in range(1, len(times)):
        # cumulative window: every sample from the first up to position i
        chunk = data[times[:i+1]]
        devices = { row : np.mean(col.values) for row,col in chunk.iterrows() }
        sum_avgs = 0

        # dict.fromkeys de-duplicates device ids while keeping their order
        for device in dict.fromkeys( key[0] for key in devices ):
            sum_avgs += devices[ (device, 'CLOCKS_UNHALTED_CORE') ] / devices[ (device, 'INSTRUCTIONS_RETIRED') ]

        latest = sum_avgs/n_cores
        cpicore_dict[times[i]] = latest

    if monitor:
        return cpicore_dict

    return latest

def cpiref ( devices_dict, n_cores=24 ):
    """Cycles-per-instruction against the reference clock, averaged over cores.

    Parameters
    ----------
    devices_dict : dict
        ``{(device, counter name): value}``. Every device that appears must
        provide both ``'CLOCKS_UNHALTED_REF'`` and ``'INSTRUCTIONS_RETIRED'``.
    n_cores : int, optional
        Divisor applied to the summed per-device ratios. Defaults to the
        previously hard-coded 24 (presumably the cores per node -- confirm).

    Returns
    -------
    number
        Sum over devices of ``CLOCKS_UNHALTED_REF / INSTRUCTIONS_RETIRED``,
        divided by ``n_cores``.

    Raises
    ------
    KeyError
        If a device lacks one of the two required counters.
    """
    sum_avgs = 0

    # dict.fromkeys de-duplicates device ids while keeping their order
    for device in dict.fromkeys( key[0] for key in devices_dict ):
        sum_avgs += devices_dict[ (device, 'CLOCKS_UNHALTED_REF') ] / devices_dict[ (device, 'INSTRUCTIONS_RETIRED') ]

    return sum_avgs/n_cores

#def find_notable( cpi_set ):
#    notable = []
#    
#    for jobid,data_dict in cpi_set.items():
#        vals = [ val for val in data_dict.values()[1:] ]
#        
#        if not all():
#            notable.append( jobid )

def get_stats( cpi_set, outliers=False ):
    """Summarise the positive CPI values of a ``{jobid: {time: value}}`` set.

    Values greater than 0 enter the statistics; everything else (zeros,
    negatives, NaN) is collected separately as excluded.

    Changes from the previous version:
    * iterates the ``cpi_set`` argument; previously it read a global named
      ``cpicore_set`` and ignored the argument;
    * the ``outliers`` flag is no longer overwritten by the list of excluded
      values, so ``outliers=True`` always yields a ``(stats, excluded)`` pair.

    Parameters
    ----------
    cpi_set : dict
        ``{jobid: {time: value}}``.
    outliers : bool, optional
        When true, always return the excluded values alongside the stats.

    Returns
    -------
    dict or tuple of (dict, list)
        ``stats`` has the keys 'Max', 'Min', 'Mean', 'Std. Dev', 'Count' and
        'Excluded'. A ``(stats, excluded)`` pair is returned when
        ``outliers`` is true. It is also returned whenever any value was
        excluded, which is the legacy behaviour, kept so existing callers
        that unpack two values keep working.

    Raises
    ------
    ValueError
        If no value greater than 0 is present.
    """
    data = []
    excluded = []

    for jobid,cpi_dict in cpi_set.items():
        for val in cpi_dict.values():
            if val > 0:
                data.append(np.float64(val))
            else:
                excluded.append(val)

    if not data:
        raise ValueError('get_stats: no positive values to summarise')

    stats = {
        'Max' : max(data),
        'Min' : min(data),
        'Mean' : np.mean(data),
        'Std. Dev' : np.std(data),
        'Count' : len(data),
        'Excluded' : len(excluded)
    }

    if outliers or excluded:
        return stats,excluded

    return stats

## Testing B