#### Dependencies

In [1]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [2]:
import pickle
import gzip
import re

In [3]:
# Data manipulation dependencies
import pandas as pd
import numpy as np
import datetime as dt

In [4]:
# Directory to save to
save_dir = '../data/continued/'

# Directory of prev job scans
id_dir = '../data/labels/IDs/'

In [5]:
# Directory of recent saved comet jobs
source_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/'

In [6]:
contents = listdir(source_dir)
contents

['archive_of_archive',
 'gordon_hostfile_logs',
 'gordon_pickles',
 'comet_accounting',
 'gordon_accounting',
 'comet_pickles',
 'archive',
 '.htaccess',
 'comet_hostfile_logs']

In [7]:
possible = [ source_dir+file_name for file_name in listdir(source_dir) ]

for item in possible:
    try:
        listdir(item)
    except:
        print(item)

/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_hostfile_logs
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/gordon_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_pickles
/oasis/projects/nsf/sys200/stats/xsede_stats/.htaccess
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_hostfile_logs


In [8]:
locs = { 'aofa': source_dir+'archive_of_archive',
         'job_info': source_dir+'comet_accounting',
         'arc': source_dir+'archive'
         #'host_info': source_dir+'comet_hostfile_logs',
         #'old_pickles': source_dir+'comet_pickles'
       }

In [9]:
for key,loc in locs.items():
    print(loc)

/oasis/projects/nsf/sys200/stats/xsede_stats/archive_of_archive
/oasis/projects/nsf/sys200/stats/xsede_stats/comet_accounting
/oasis/projects/nsf/sys200/stats/xsede_stats/archive


## Prep Cleaning

In [51]:
def get_time( spec=None ):
    if type(spec) is str:
        spec = float( spec )
    
    return clock.strftime("%Y/%m/%d %H:%M:%S", clock.localtime( spec ))

In [52]:
def check_header( line ):
    if line.find(" ") < 0:
        try:
            return line[0] == '%'
        except:
            return False
        
    else:
        chunks = line.split(" ")
        try:
            return (chunks[0][0] == '%') or ( chunks[2].find("comet") >= 0 )
        except:
            return False

In [53]:
def check_job( chunk ):
    return chunk.find("-") == -1

In [54]:
def open_txt( txt_file ):
    
    with open( txt_file, "rt" ) as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [55]:
def unzip_txt( gzipped ):
    
    with gzip.open( gzipped, 'rt') as f:
        lines = f.readlines()
        f.close()
    
    return lines

In [56]:
def quick_save( obj, label=get_time() ):
    
    try:
        out_file = open( label, 'wb')
        pickle.dump( obj, out_file)
        
        # double check save
        check_cpicore_set = pickle.load(open(cpiset_out, 'rb'))
        check_cpicore_set = None
        
    except:
        "There was a problem pickling the object - Save manually."

In [57]:
def info_dict( rules, info ):
    rules_list = rules.split("|")
    
    if len(rules_list) != len(info):
        return {}
    
    else:
        return { rules_list[i]:info[i] for i in range(len(rules_list)) }

In [89]:
def host_to_info_dict( zip_txt ):
    contents = unzip_txt( zip_txt )
    out_dict = { "Host": {} }
    host_info = { "Timestamp":{} }
    info_dict = { "Data":{},
                    "Job":"N/A",
                    "Schemas":{},
                    "Specs":[]
                }
    
    for line in contents:
            
        if line[0] == "$":
            print(line)
            print("Identified '$'\n")
            #info_dict["Specs"].append( format_spec( line ) )
            
        elif line[0] == "!":
            print(line)
            print("Identified '!'\n")
            #info_dict["Schemas"].update( format_schema( line ) )
        
        else:
            #host_name = ""
            #timestamp = ""
            
            if len(line) < 3 or check_header( line ):
                header_dict = format_header( line )
                print(line)
                print("Potential header\n")
                
                if header_dict:
                    print(line)
                    print("Identified header\n")
                    
                    if check_job( header_dict["Jobid"] ):
                        print(line)
                        print("Identified Jobid\n")
                        #info_dict["Job"] = { "Jobid": header_dict["Jobid"] }
                        
                    #host_name = header_dict["Host"]
                    #timestamp = header_dict["Timestamp"]
                    
                    #time_data = { timestamp : {} }
                    #out_dict[host_name] = time_data
                    
            else:
                print(line)
                print("Identified data block\n")
                #incoming = format_data( line )
                #found_stat = incoming["Stat"]
                #info_dict["Data"][ found_stat ] = incoming
                
    return out_dict

In [90]:
def format_header( line ):
    chunks = line.split(" ")
    
    try:
        if chunks[0][0] == '%':
            return {}
        else:
            return { "Timestamp": get_time( chunks[0] ), 
                     "Jobid": chunks[1],
                     "Host": chunks[2][:11] }
        
    except:
        print(line)
        return {}

In [91]:
def format_spec( line ):
    return line[1:-1]

In [92]:
def format_schema( line ):
    chunks = line.partition(" ")
    
    stat = chunks[0][1:]
    schemas = chunks[2:]
    
    return { stat:schemas }

In [93]:
def format_data( line ):
    chunks = line.split(" ")
    
    stat = chunks[0]
    dev = chunks[1]
    data = chunks[2:-1]
    
    return { "Stat": stat, "Device": dev, "Data": data }

### Parse file in archive

In [94]:
arc_data = [ locs['arc']+'/'+stamp for stamp in listdir(locs['arc']) ]

In [95]:
temp = listdir(arc_data[0])[0]
temp_loc = arc_data[0]+'/'+temp
temp_loc

'/oasis/projects/nsf/sys200/stats/xsede_stats/archive/comet-10-14.sdsc.edu/1587951665.gz'

In [96]:
host_to_info_dict( temp_loc )

$tacc_stats 2.3.4

Identified '$'

$hostname comet-10-14.sdsc.edu

Identified '$'

$uname Linux x86_64 3.10.0-957.12.2.el7.x86_64 #1 SMP Tue May 14 21:24:32 UTC 2019

Identified '$'

$uptime 4082659

Identified '$'

!block rd_ios,E rd_merges,E rd_sectors,E,U=512B rd_ticks,E,U=ms wr_ios,E wr_merges,E wr_sectors,E,U=512B wr_ticks,E,U=ms in_flight io_ticks,E,U=ms time_in_queue,E,U=ms

Identified '!'

!cpu user,E,U=cs nice,E,U=cs system,E,U=cs idle,E,U=cs iowait,E,U=cs irq,E,U=cs softirq,E,U=cs

Identified '!'

!ib excessive_buffer_overrun_errors,E,W=32 link_downed,E,W=32 link_error_recovery,E,W=32 local_link_integrity_errors,E,W=32 port_rcv_constraint_errors,E,W=32 port_rcv_data,E,U=4B,W=32 port_rcv_errors,E,W=32 port_rcv_packets,E,W=32 port_rcv_remote_physical_errors,E,W=32 port_rcv_switch_relay_errors,E,W=32 port_xmit_constraint_errors,E,W=32 port_xmit_data,E,U=4B,W=32 port_xmit_discards,E,W=32 port_xmit_packets,E,W=32 port_xmit_wait,E,U=ms,W=32 symbol_error,E,W=32 VL15_dropped,E,W=32



Identified data block

proc MATLAB/29892/12/0-1 525985 755256 755248 0 91680 91680 277440 140 56 223384 712 0 4

Identified data block

proc slurm_script/29889/12/0-1 525985 113184 113184 0 1428 1428 208 136 884 2092 52 0 1

Identified data block

proc MATLAB/28882/8/0-1 525985 3668240 3668232 0 376868 376868 3137848 140 56 255912 1440 0 13

Identified data block

proc slurm_script/27952/14/0-1 515496 134208 134208 0 5016 5016 2956 140 8 4856 100 0 1

Identified data block

proc cas_d3baka21Ate/28142/0/0-1 515496 1629344 1629344 0 210640 210640 1627428 136 1696 0 1144 0 1

Identified data block

proc cas_d3baka23Ate/28097/14/0-1 515496 1629344 1629344 0 210640 210640 1627428 136 1696 0 1148 0 1

Identified data block

proc cvmfs2/28252/0-23/0-1 0 572296 572296 0 26832 26832 513828 132 320 12368 256 0 11

Identified data block

proc munged/14877/0-23/0-1 509849 221660 221652 0 8636 8632 198148 132 96 4608 92 0 4

Identified data block

proc MATLAB/29406/10/0-1 525985 3668240 3668232 0 3

proc cvmfs2/28243/0-23/0-1 0 58824 58824 0 708 708 356 132 320 12368 120 0 1

Identified data block

proc slurm_script/31846/4/0-1 515496 134076 134076 0 4960 4960 2824 140 8 4856 96 0 1

Identified data block

proc dnsmasq/15012/0-23/0-1 99 54916 54908 0 2108 1872 1304 132 316 5856 120 0 1

Identified data block

proc slurm_script/30611/20/0-1 525985 113184 113184 0 1432 1432 208 136 884 2092 48 0 1

Identified data block

proc slurm_script/22912/6/0-1 513696 134072 134072 0 4976 4976 2820 140 8 4856 96 0 1

Identified data block

proc slurm_script/30378/11/0-1 525985 113184 113184 0 1432 1432 208 136 884 2092 48 0 1

Identified data block

proc slurm_script/17492/17/0-1 515496 134076 134076 0 4960 4960 2824 140 8 4856 92 0 1

Identified data block

proc slurm_script/4985/16/0-1 525985 113184 113184 0 1468 1468 208 136 884 2092 48 0 1

Identified data block

proc netsim/4895/9/0-1 525985 317740 317740 0 306864 306864 306176 136 120 3056 640 0 1

Identified data block

proc MATLAB/2888


Potential header

1587960515.319111 32935193 comet-10-14.sdsc.edu

Potential header

1587960515.319111 32935193 comet-10-14.sdsc.edu

Identified header

1587960515.319111 32935193 comet-10-14.sdsc.edu

Identified Jobid

%end 32935193

Potential header

block md0 855 0 42550 0 1408 0 68 0 0 0 0

Identified data block

block md1 333883 0 14214778 0 9313847 0 391216608 0 0 0 0

Identified data block

block sda 480920 3285 41247537 348556 44916635 2159965 5845296641 73827507 0 14942472 74159197

Identified data block

block sdb 478521 2718 41499371 327920 43648422 2006470 5813774353 79912664 0 15038194 80224696

Identified data block

cpu 22 213509086 526 4285630 190376277 771649 0 4001

Identified data block

cpu 20 233213260 518 6125872 169379769 197298 0 2329

Identified data block

cpu 21 217429330 147 6053230 185358372 116605 0 1869

Identified data block

cpu 11 260979093 14 6686412 140280315 963969 0 7858

Identified data block

cpu 10 239262044 175 6819207 161744300 1065656 0 2579

Identified data block

intel_8pmc3 0 4424144 4391377 4391633 4391249 4392145 4391176 4391330 4392945 192433424557312 250265638088415 48172023144460 250252897334381 0 7340333511287 112908793042919 151508641137298 12095264743763 154374955109457 179014875690039

Identified data block

intel_8pmc3 3 4424144 4391377 4391633 4391249 4392145 4391176 4391330 4392945 92591132424008 121781522935658 59178007979313 10903887388595 0 11702644651372 85137532206935 167967848291667 240003535356404 156132269737655 180594851895564

Identified data block

intel_8pmc3 2 4424144 4391377 4391633 4391249 4392145 4391176 4391330 4392945 15115890431249 40036095131124 55361678424135 20642886780324 0 9217061211791 61589324200618 181345865889140 228637357885950 154009519555389 178683711318064

Identified data block

intel_8pmc3 5 4424144 4391377 4391633 4391249 4392145 4391176 4391330 4392945 219666764285430 251877986846631 59265602318138 3582630387205 0 9417713818998 35006721969013 160259114796113 127231089690346


intel_hsw_cbo 0/11 20971793 20972340 20975390 20972852 200208558478506 84515082127921 122144891419717 58136851042709

Identified data block

intel_hsw_cbo 1/9 20971793 20972340 20975390 20972852 277852322765623 83338099028848 87392784143454 50151395306595

Identified data block

intel_hsw_cbo 1/8 20971793 20972340 20975390 20972852 225974981721054 83191042643432 105921451717703 50153724556166

Identified data block

intel_hsw_cbo 0/8 20971793 20972340 20975390 20972852 188413821282283 84683272575485 112314625575026 58121187247148

Identified data block

intel_hsw_cbo 0/9 20971793 20972340 20975390 20972852 243681881243073 84777383330158 105030675942469 58130725104071

Identified data block

intel_hsw_cbo 1/11 20971793 20972340 20975390 20972852 239025479352027 83362750933516 101959150921919 50084833764437

Identified data block

intel_hsw_cbo 1/10 20971793 20972340 20975390 20972852 216793122715467 83413838110664 125008661249555 50109112614472

Identified data block

intel_hsw_hau ff/

intel_hsw_cbo 0/1 20971793 20972340 20975390 20972852 120444883888993 81160907451926 120001249709545 55527447702259

Identified data block

intel_hsw_cbo 0/10 20971793 20972340 20975390 20972852 171431503897642 84660539905310 136684047204525 58141363278209

Identified data block

intel_hsw_cbo 0/11 20971793 20972340 20975390 20972852 200733263959624 84641192650614 122230688896995 58165777888525

Identified data block

intel_hsw_cbo 1/9 20971793 20972340 20975390 20972852 278235464318721 83425537666083 87437891553480 50169414290517

Identified data block

intel_hsw_cbo 1/8 20971793 20972340 20975390 20972852 226332430097124 83278247654660 106065650219181 50171832375857

Identified data block

intel_hsw_cbo 0/8 20971793 20972340 20975390 20972852 188935836953761 84810274293157 112641776242273 58149962516937

Identified data block

intel_hsw_cbo 0/9 20971793 20972340 20975390 20972852 244214586939780 84903557110150 105082161592285 58159417668374

Identified data block

intel_hsw_cbo 1/11 


tmpfs /sys/fs/cgroup 0 16

Identified data block

tmpfs /dev/shm 0 5

Identified data block

vfs - 26446 1824 547082

Identified data block

vm - 3 5821391745 195802547 0 0 96027111974 97888416766 960059034 163956358 184317147035 4021894 156603337 0 26253111 15755 0 72880512 0 85196 286182 40 27388 325 2 1 0 2

Identified data block





Potential header

1587985293.097109 - comet-10-14.sdsc.edu

Potential header

1587985293.097109 - comet-10-14.sdsc.edu

Identified header

block md0 855 0 42550 0 1423 0 68 0 0 0 0

Identified data block

block md1 333924 0 14215770 0 9361189 0 391678392 0 0 0 0

Identified data block

block sda 532867 3328 52773761 843165 45690969 2174693 5984327649 77976688 0 15499154 78802633

Identified data block

block sdb 531240 2750 53222827 831203 44405434 2019710 5952065689 84051684 0 15591484 84866638

Identified data block

cpu 22 215477653 526 4307964 190862400 771913 0 4007

Identified data block

cpu 20 233765307 519 6141152 171278549 207527 0 2377

Ide


Identified data block

proc l1110.exe/5441/8/0-1 517335 6643604 6643532 0 80092 80092 6597900 2764 15344 3116 328 0 1

Identified data block

proc cas_S536380_4F_/8940/22/0-1 515496 985900 985900 0 186808 186808 984420 132 1288 0 588 0 1

Identified data block

proc cvmfs2/5845/0-23/0-1 0 58824 58824 0 704 704 356 132 320 12368 120 0 1

Identified data block

proc cvmfs2/5941/0-23/0-1 0 572296 572296 0 26868 26868 513828 132 320 12368 256 0 11

Identified data block

ps - 529325710629 52428853 719 706 706 9 677

Identified data block

sysv_shm - 0 0

Identified data block

tmpfs /run 916389888 984

Identified data block

tmpfs /run/user/0 0 1

Identified data block

tmpfs /sys/fs/cgroup 0 16

Identified data block

tmpfs /dev/shm 0 4

Identified data block

vfs - 26430 2016 624747

Identified data block

vm - 3 5860101197 198416847 0 0 96567277841 98439395166 960511307 163956358 184918846621 4022436 156603337 0 26253111 15755 0 72880512 0 85196 286182 40 27388 325 2 1 0 2

Identified 


1588028051.614098 32950571 comet-10-14.sdsc.edu

Identified header

1588028051.614098 32950571 comet-10-14.sdsc.edu

Identified Jobid

block md0 855 0 42550 0 1438 0 68 0 0 0 0

Identified data block

block md1 333931 0 14215826 0 9444986 0 397420856 0 0 0 0

Identified data block

block sda 532899 3328 52774017 843498 46335865 2195877 6064523569 78591586 0 15695922 79417678

Identified data block

block sdb 531254 2750 53222939 831217 45016343 2038842 6031759857 84653972 0 15785348 85468749

Identified data block

cpu 22 219717017 526 4344452 190862400 771913 0 4007

Identified data block

cpu 20 233766489 519 6144419 175547341 209290 0 2397

Identified data block

cpu 21 218038611 147 6079202 191465515 125626 0 1938

Identified data block

cpu 11 263201362 33 6702006 144780257 984193 0 7872

Identified data block

cpu 10 240729967 175 6832243 167013891 1067865 0 25814

Identified data block

cpu 13 251198550 172 7442587 156626348 347214 0 2207

Identified data block

cpu 12 25980430

{'Host': {}}

In [97]:
info = 

SyntaxError: invalid syntax (<ipython-input-97-0e9f5a3d2778>, line 1)

In [None]:
info

### Parse file in comet_accounting

In [None]:
job_data = [ locs['job_info']+'/'+stamp for stamp in listdir(locs['job_info']) ]

In [None]:
temp = job_data[0]
temp

In [None]:
job_sample = open_txt( temp )

In [None]:
rules = job_sample[0]
rules.split("|")

In [None]:
job_sample[1].split("|")

In [None]:
job_sample[1:]

In [None]:
nodes_by_date = {}
unsaved = []

for date in job_data:
    try:
        
        # skip alt files
        #check_stamp = int( date[-14] )
        
        # read in file contents
        contents = open_txt( date )
    
        # formatting
        label = date[-14:-4]
        rules = contents[0]
        jobs = contents[1:]
        
        # template to save
        nodes_by_date[ label ] = {}
        nodes_by_date[ label ]["multiple"] = {}
        nodes_by_date[ label ]["rules"] = rules
        
        # run through lines in file
        for job in jobs:
            line = job.split("|")
            node = line[-1]
            info = info_dict( rules, line )
            
            # save multiple node jobs to specified loc
            if len(node) > 12:                
                nodes_by_date[ label ][ "multiple" ][ node ] = info
            
            else:
                nodes_by_date[ label ][ node[:11] ] = info
                
    except:
        unsaved.append(date)

In [None]:
nodes_by_date['2019-09-19']

### Parse file in archive of archive

In [None]:
aofa_data = [ locs['aofa']+'/'+stamp for stamp in listdir(locs['aofa']) ]

In [None]:
temp = listdir(aofa_data[0])[0]
aofa_data[0]+'/'+temp

In [None]:
aofa_sample = unzip_txt( aofa_data[0]+'/'+temp )

In [None]:
aofa_sample

# PREVIOUS PROCEDURE

In [None]:
t0 = clock.time()
n = len(files)
valid = []
invalid = []
scanned = 0

for file_name in files[:10]:
    scanned += 1
    perc_scanned = np.round( scanned / n * 100, 2)
        
    clear_output(wait=True)
    print "Processing file {} of {} files \t ({}% of total files)".format( scanned, n, perc_scanned )
        
    try:
        check_file = open( source_dir+file_name, 'r' )
        
        for line in check_file:
            print(line)
        
        check_file.close()
    except:
        next
#        job_obj = pickle.load( pickle_file )
#        
#        # Save data
#        valid.append(job_obj)
#        summary[date]["Saved"] += 1
#        
#        pickle_file.close()
#            
#    except:
#        invalid.append( file_name )
#            
#    print
#    print "Run time: {}s".format( np.round( clock.time() - t0, 1 ) )

In [None]:
len(invalid) == len(files)


# Resume

In [None]:
# List of date directories in source_dir
dates_list = [ date for date in listdir(source_dir) if len(listdir(source_dir+date)) > 0 ]
all_files = [ source_dir+date+'/'+file_name for date in listdir(source_dir) for file_name in listdir(source_dir+date) ]

#### Confirm Already Scanned Jobs

In [None]:
collected = [ file_name[12:19] for file_name in listdir('../data/raw') ]
remaining = [ file_name for file_name in all_files if file_name[72:] not in collected ]        

In [None]:
print "Total files:\t", len(all_files)
print "Collected:\t", len(collected)
print "--------------------------"
print "Remaining:\t", len(remaining)

## Prep Data Cleaning

In [None]:
def check_val( val ):
    try:
        val = float(val)
        return val
    except:
        return 0
    else:
        return 0
        
def convert_dt( val ):
    return dt.datetime.utcfromtimestamp( val ).strftime( "%Y-%m-%d %H:%M:%S" )

def get_schemas( job ):
    return { stat:schema.keys() for stat,schema in job.schemas.items() }

def get_indices( job, host ):
    indices = []
    stats = [ stat for stat in job.schemas.keys() if stat in host.stats.keys() ]
    schemas = { stat:schema.keys() for stat,schema in job.schemas.items() }
    cores = { stat:core.keys() for stat,core in host.stats.items() }
    
    for stat in stats:
        for core in cores[stat]:
            for schema in schemas[stat]:
                indices.append( (stat,core,schema) )
             
    return indices

def get_times( job, host ):
    times = [ job.start_time ]
    times.extend( host.times )
    times.append( job.end_time )
    return [ convert_dt(t) for t in times ]

def clean_list( data_list ):
    return [ check_val( x ) for x in data_list ]
    
def get_data( host, row_labels ):
    data = { label:[] for label in row_labels }
    
    for stat,node in host.stats.items():
        for core,matrix in node.items():
            matrix = matrix.T
            for i in range(len(matrix)):
                data[stat,core] = clean_list( matrix[i] )
    return data

def fill_df( template_df, data_dict):
    for row,data in data_dict.items():
        template_df.loc[row].update( pd.Series(data) )
    return df

In [None]:
def cpicore ( job_df ):
    data = job_df.loc['intel_hsw']
    times = job_df.columns.tolist()
    cpicore_list = []
    
    for i in range(1, len(times)):
        chunk = data[times[:i+1]]
        devices = { row : np.mean(col.values) for row,col in chunk.iterrows() }
        avg_c = { key[0]:0 for key,val in devices.items() }
        sum_avgs = 0
        
        for key,val in avg_c.items():
            avg_c[ key ] = devices[ (key, 'CLOCKS_UNHALTED_CORE') ] / devices[ (key, 'INSTRUCTIONS_RETIRED') ]
    
        for key,val in avg_c.items():
            sum_avgs += val
            
        cpicore_list.append(sum_avgs/24)
    
    return cpicore_list

In [None]:
#def find_next( current, unsorted ):
#    target = current + 00:10:00
#    found = unsorted[0]
#    proximity = target - found
#    
#    if len(unsorted) > 1:
#        for i in range(len(unsorted)):
#            if target - unsorted[i] < proximity:
#                found = unsorted[i]
#                proximity = target - found
#    return found
#
#def fill_sorted( start, unsorted ):
#    sorted_list = []
#    
#    for i in range(len(unsorted)):
#        current = sorted_list[i]
#        next_time = find_next( current, unsorted )
#        sorted_list[i+1] = next_time
#        
#def sort_times( job ):
#    start = job.start
#    mid = job.times
#    end = job.end
#    
#    if start == end:
#        return [start]
#    elif len(mid) < 1:
#        return [start, end]
#    elif len(mid) < 2:
#        return [start, mid[0], end]
#    else:
#        return fill_sorted( start, mid.append(end) )

# Access and open pickled job files
**Process:**
    - Iterate through the non-empty date folders available in source_dir
    - A file is saved in valid_jobs if:
        * The pickled file is a Job object
    - Exceptions are skipped

In [None]:
cut = remaining
n = len(cut)

In [None]:
summary = {}

for file_name in cut:
    date = file_name[61:71]
    
    if date in summary:
        summary[date]["Total"] += 1
    else:
        summary[date] = {}
        summary[date]["Total"] = 1
        summary[date]["Saved"] = 0

In [None]:
summary

#### Catch invalid files

In [None]:
t0 = clock.time()
valid = []
invalid = []
scanned = 0

for file_name in cut:
    date = file_name[61:71]
    scanned += 1
    perc_scanned = np.round( scanned / n * 100, 2)
        
    clear_output(wait=True)
    print "Processing file {} of {} files \t ({}% of total files)".format( scanned, n, perc_scanned )
        
    try:
        pickle_file = open( file_name, 'rb' )
        job_obj = pickle.load( pickle_file )
        
        # Save data
        valid.append(job_obj)
        summary[date]["Saved"] += 1
        
        pickle_file.close()
            
    except:
        invalid.append( file_name )
            
    print
    print "Run time: {}s".format( np.round( clock.time() - t0, 1 ) )

In [None]:
invalid_out = '../src/data/summary_stats/raw_metrics/invalid.pkl'
out_file = open(invalid_out, 'wb')
pickle.dump(invalid, out_file)
#invalid = pickle.load(open(invalid_out, 'rb'))

In [None]:
valid_out = '../src/data/summary_stats/raw_metrics/all_rows.pkl'
out_file = open(all_rows_out, 'wb')
pickle.dump(all_rows, out_file)
#valid = pickle.load(open(valid_out, 'rb'))

In [None]:
len(invalid)

In [None]:
len(valid)

In [None]:
print "Total job objects collected:\t", len(valid)
print
print "\tBreakdown of files"
print "========================="

for date,info in summary:
    
    print "Date:\t", date
    print info["Saved"], "files saved out of", info["Total"]
    print

# Begin Processing

In [None]:
job_objects = valid
m = float(sum([len(job.hosts.keys()) for job in job_objects]))

In [None]:
cut = len(job_objects)/2
first = 0
stop = len(job_objects)
rem = stop - first

print "Total Jobs (this date):\t\t", len(job_objects)
print "Total Host,Job Pairs:\t\t", int(m)
print("------------------------------------")
print "Remaining jobs to scan:\t\t", int(rem)

In [None]:
job_dfs = {}
t0 = clock.time()
total = 0
current = 0

for job_idx in range( first, stop ):
    job = job_objects[ job_idx ]
    schemas = get_schemas( job )
    total += 1
    
    # support for tracking progress in below print statements
    clear_output(wait=True)
    
    # iterate through each host object job was run on
    for host_name, host in job.hosts.iteritems():
        print("Processing hosts for job {} of {} \t ({}% of total)".format(job_idx+1, stop, np.round( (current+first)/m*100, 2)))
        current += 1
        
        # build MultiIndex for df 
        idx_labels = get_indices( job, host )
        indices = pd.MultiIndex.from_tuples( idx_labels, names=['Stat', 'Device', 'Schema'] )
                    
        # process timestamps
        times = get_times( job, host )
    
        # collect job data
        data = get_data( host, idx_labels )
        
        # create df with MultiIndex, ordered times
        df = pd.DataFrame( index=indices, columns=times ).sort_index()
        
        # fill df
        for stat,devices in host.stats.items():
            for device,data_matrix in devices.items():
                for t_idx in range( len(data_matrix) ):
                    timestamp = times[t_idx]
                    
                    for metric_idx in range( len(data_matrix[ t_idx ]) ):
                        metric = schemas[stat][metric_idx]
                        row_label = (stat,device,metric)
                        datum = data_matrix[t_idx][metric_idx]
                        
                        df.loc[row_label][timestamp] = check_val(datum)
        
        # save job info from DataFrame to csv file
        df.to_csv( path_or_buf=save_dir+"{}_{}.csv".format( host_name, job.id ) )

In [None]:
# check that no job was missed
if total == ( stop-first ):
    print "Success!"
else:
    print stop - first - total, "jobs missing"