#### Dependencies

In [1]:
# Pickle file dependencies
from tacc_stats.pickler.job_stats import Job
import cPickle as pickle

In [2]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [3]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [4]:
# Directory of all pickled jobs via comet
# new_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/archive'
source_dir = '/oasis/projects/nsf/sys200/tcooper/xsede_stats/comet_pickles/'

# Directory to save to
save_dir = '../data/continued/'

# Directory of prev job scans
id_dir = '../data/labels/IDs/'

In [5]:
# List of date directories in source_dir
dates_list = [ date for date in listdir(source_dir) if len(listdir(source_dir+date)) > 0 ]
all_files = [ source_dir+date+'/'+file_name for date in listdir(source_dir) for file_name in listdir(source_dir+date) ]

#### Confirm Already Scanned Jobs

In [6]:
collected = [ file_name[12:19] for file_name in listdir('../data/raw') ]
remaining = [ file_name for file_name in all_files if file_name[72:] not in collected ]        

In [7]:
print "Total files:\t", len(all_files)
print "Collected:\t", len(collected)
print "--------------------------"
print "Remaining:\t", len(remaining)

Total files:	2199159
Collected:	5398
--------------------------
Remaining:	2194279


## Prep Data Cleaning

In [8]:
def check_val( val ):
    try:
        val = float(val)
        return val
    except:
        return 0
    else:
        return 0
        
def convert_dt( val ):
    return dt.datetime.utcfromtimestamp( val ).strftime( "%Y-%m-%d %H:%M:%S" )

def get_schemas( job ):
    return { stat:schema.keys() for stat,schema in job.schemas.items() }

def get_indices( job, host ):
    indices = []
    stats = [ stat for stat in job.schemas.keys() if stat in host.stats.keys() ]
    schemas = { stat:schema.keys() for stat,schema in job.schemas.items() }
    cores = { stat:core.keys() for stat,core in host.stats.items() }
    
    for stat in stats:
        for core in cores[stat]:
            for schema in schemas[stat]:
                indices.append( (stat,core,schema) )
             
    return indices

def get_times( job, host ):
    times = [ job.start_time ]
    times.extend( host.times )
    times.append( job.end_time )
    return [ convert_dt(t) for t in times ]

def clean_list( data_list ):
    return [ check_val( x ) for x in data_list ]
    
def get_data( host, row_labels ):
    data = { label:[] for label in row_labels }
    
    for stat,node in host.stats.items():
        for core,matrix in node.items():
            matrix = matrix.T
            for i in range(len(matrix)):
                data[stat,core] = clean_list( matrix[i] )
    return data

def fill_df( template_df, data_dict):
    for row,data in data_dict.items():
        template_df.loc[row].update( pd.Series(data) )
    return df

In [9]:
def cpicore ( job_df ):
    data = job_df.loc['intel_hsw']
    times = job_df.columns.tolist()
    cpicore_list = []
    
    for i in range(1, len(times)):
        chunk = data[times[:i+1]]
        devices = { row : np.mean(col.values) for row,col in chunk.iterrows() }
        avg_c = { key[0]:0 for key,val in devices.items() }
        sum_avgs = 0
        
        for key,val in avg_c.items():
            avg_c[ key ] = devices[ (key, 'CLOCKS_UNHALTED_CORE') ] / devices[ (key, 'INSTRUCTIONS_RETIRED') ]
    
        for key,val in avg_c.items():
            sum_avgs += val
            
        cpicore_list.append(sum_avgs/24)
    
    return cpicore_list

In [10]:
#def find_next( current, unsorted ):
#    target = current + 00:10:00
#    found = unsorted[0]
#    proximity = target - found
#    
#    if len(unsorted) > 1:
#        for i in range(len(unsorted)):
#            if target - unsorted[i] < proximity:
#                found = unsorted[i]
#                proximity = target - found
#    return found
#
#def fill_sorted( start, unsorted ):
#    sorted_list = []
#    
#    for i in range(len(unsorted)):
#        current = sorted_list[i]
#        next_time = find_next( current, unsorted )
#        sorted_list[i+1] = next_time
#        
#def sort_times( job ):
#    start = job.start
#    mid = job.times
#    end = job.end
#    
#    if start == end:
#        return [start]
#    elif len(mid) < 1:
#        return [start, end]
#    elif len(mid) < 2:
#        return [start, mid[0], end]
#    else:
#        return fill_sorted( start, mid.append(end) )

# Access and open pickled job files
**Process:**
    - Iterate through the non-empty date folders available in source_dir
    - A file is saved in valid_jobs if:
        * The pickled file is a Job object
    - Exceptions are skipped

In [11]:
cut = remaining
n = len(cut)

In [12]:
summary = {}

for file_name in cut:
    date = file_name[61:71]
    
    if date in summary:
        summary[date]["Total"] += 1
    else:
        summary[date] = {}
        summary[date]["Total"] = 1
        summary[date]["Saved"] = 0

In [13]:
summary

{'2016-08-19': {'Saved': 0, 'Total': 7732},
 '2016-08-20': {'Saved': 0, 'Total': 9610},
 '2016-08-21': {'Saved': 0, 'Total': 12071},
 '2016-08-22': {'Saved': 0, 'Total': 15356},
 '2016-08-23': {'Saved': 0, 'Total': 22251},
 '2016-08-24': {'Saved': 0, 'Total': 13709},
 '2016-08-25': {'Saved': 0, 'Total': 12779},
 '2016-08-26': {'Saved': 0, 'Total': 12042},
 '2016-08-27': {'Saved': 0, 'Total': 11752},
 '2016-08-28': {'Saved': 0, 'Total': 11869},
 '2016-08-29': {'Saved': 0, 'Total': 8462},
 '2016-08-30': {'Saved': 0, 'Total': 3106},
 '2016-08-31': {'Saved': 0, 'Total': 11619},
 '2016-09-01': {'Saved': 0, 'Total': 11086},
 '2016-09-02': {'Saved': 0, 'Total': 15194},
 '2016-09-03': {'Saved': 0, 'Total': 12176},
 '2016-09-04': {'Saved': 0, 'Total': 32656},
 '2016-09-05': {'Saved': 0, 'Total': 51049},
 '2016-09-06': {'Saved': 0, 'Total': 39128},
 '2016-09-07': {'Saved': 0, 'Total': 13329},
 '2016-09-08': {'Saved': 0, 'Total': 17233},
 '2016-09-09': {'Saved': 0, 'Total': 24291},
 '2016-09-10':

#### Catch invalid files

In [None]:
t0 = clock.time()
valid = []
invalid = []
scanned = 0

for file_name in cut:
    date = file_name[61:71]
    scanned += 1
    perc_scanned = np.round( scanned / n * 100, 2)
        
    clear_output(wait=True)
    print "Processing file {} of {} files \t ({}% of total files)".format( scanned, n, perc_scanned )
        
    try:
        pickle_file = open( file_name, 'rb' )
        job_obj = pickle.load( pickle_file )
        
        # Save data
        valid.append(job_obj)
        summary[date]["Saved"] += 1
        
        pickle_file.close()
            
    except:
        invalid.append( file_name )
            
    print
    print "Run time: {}s".format( np.round( clock.time() - t0, 1 ) )

Processing file 1932842 of 2194279 files 	 (0% of total files)

Run time: 68903.8s


In [None]:
invalid_out = '../src/data/summary_stats/raw_metrics/invalid.pkl'
out_file = open(invalid_out, 'wb')
pickle.dump(invalid, out_file)
#invalid = pickle.load(open(invalid_out, 'rb'))

In [None]:
valid_out = '../src/data/summary_stats/raw_metrics/all_rows.pkl'
out_file = open(all_rows_out, 'wb')
pickle.dump(all_rows, out_file)
#valid = pickle.load(open(valid_out, 'rb'))

In [None]:
len(invalid)

In [None]:
len(valid)

In [None]:
print "Total job objects collected:\t", len(valid)
print
print "\tBreakdown of files"
print "========================="

for date,info in summary:
    
    print "Date:\t", date
    print info["Saved"], "files saved out of", info["Total"]
    print

# Begin Processing

In [None]:
job_objects = valid
m = float(sum([len(job.hosts.keys()) for job in job_objects]))

In [None]:
cut = len(job_objects)/2
first = 0
stop = len(job_objects)
rem = stop - first

print "Total Jobs (this date):\t\t", len(job_objects)
print "Total Host,Job Pairs:\t\t", int(m)
print("------------------------------------")
print "Remaining jobs to scan:\t\t", int(rem)

In [None]:
job_dfs = {}
t0 = clock.time()
total = 0
current = 0

for job_idx in range( first, stop ):
    job = job_objects[ job_idx ]
    schemas = get_schemas( job )
    total += 1
    
    # support for tracking progress in below print statements
    clear_output(wait=True)
    
    # iterate through each host object job was run on
    for host_name, host in job.hosts.iteritems():
        print("Processing hosts for job {} of {} \t ({}% of total)".format(job_idx+1, stop, np.round( (current+first)/m*100, 2)))
        current += 1
        
        # build MultiIndex for df 
        idx_labels = get_indices( job, host )
        indices = pd.MultiIndex.from_tuples( idx_labels, names=['Stat', 'Device', 'Schema'] )
                    
        # process timestamps
        times = get_times( job, host )
    
        # collect job data
        data = get_data( host, idx_labels )
        
        # create df with MultiIndex, ordered times
        df = pd.DataFrame( index=indices, columns=times ).sort_index()
        
        # fill df
        for stat,devices in host.stats.items():
            for device,data_matrix in devices.items():
                for t_idx in range( len(data_matrix) ):
                    timestamp = times[t_idx]
                    
                    for metric_idx in range( len(data_matrix[ t_idx ]) ):
                        metric = schemas[stat][metric_idx]
                        row_label = (stat,device,metric)
                        datum = data_matrix[t_idx][metric_idx]
                        
                        df.loc[row_label][timestamp] = check_val(datum)
        
        # save job info from DataFrame to csv file
        df.to_csv( path_or_buf=save_dir+"{}_{}.csv".format( host_name, job.id ) )

In [None]:
# check that no job was missed
if total == ( stop-first ):
    print "Success!"
else:
    print stop - first - total, "jobs missing"