#### Dependencies

In [1]:
# Pickle file dependencies
from tacc_stats.pickler.job_stats import Job
import cPickle as pickle

In [2]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [3]:
# Data manipulation dependencies
import pandas as pd
import numpy as np

In [4]:
# Directory of all pickled jobs via comet
# new_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/archive'
source_dir = '/oasis/projects/nsf/sys200/tcooper/xsede_stats/comet_pickles/'

# Directory of pre-cleaned job files
dates_dir = './modules/data/dates(2016)/'

# Directory to save to
save_dir = './modules/data/raw/'

In [5]:
listdir(source_dir)

['2016-10-07',
 '2016-08-06',
 '2016-09-27',
 '2016-06-26',
 '2016-07-07',
 '2016-11-14',
 '2016-09-15',
 '2016-08-15',
 '2016-10-14',
 '2016-07-26',
 '2016-08-27',
 '2016-10-26',
 '2016-09-06',
 '2016-11-07',
 '2016-07-14',
 '2016-07-19',
 '2016-09-01',
 '2016-07-13',
 '2016-10-21',
 '2016-08-20',
 '2016-07-21',
 '2016-10-19',
 '2016-08-18',
 '2016-10-13',
 '2016-08-12',
 '2016-09-18',
 '2016-11-19',
 '2016-09-12',
 '2016-11-13',
 '2016-09-20',
 '2016-11-21',
 '2016-08-01',
 '2016-09-24',
 '2016-10-04',
 '2016-08-05',
 '2016-11-17',
 '2016-09-16',
 '2016-07-04',
 '2016-06-25',
 '2016-07-25',
 '2016-08-16',
 '2016-10-17',
 '2016-07-17',
 '2016-09-05',
 '2016-11-04',
 '2016-08-24',
 '2016-10-25',
 '2016-10-22',
 '2016-08-23',
 '2016-10-28',
 '2016-08-29',
 '2016-11-09',
 '2016-09-08',
 '2016-11-03',
 '2016-09-02',
 '2016-10-10',
 '2016-08-11',
 '2016-07-22',
 '2016-07-28',
 '2016-09-30',
 '2016-06-22',
 '2016-08-30',
 '2016-10-31',
 '2016-06-28',
 '2016-09-11',
 '2016-11-10',
 '2016-07-

In [6]:
dates_dict = {date:dates_dir+date for date in listdir(dates_dir)}
dates_list = dates_dict.keys()

In [7]:
len(dates_list)

97

#### Read in jobs from cleaned jobs directory

In [8]:
def prep_target( date_file ):
    jobs_list = []
    
    # open file and read the content in a list
    with open(date_file, 'r') as f:
        lines = f.readlines()
    
        for jobid in lines:
            current = jobid[:-1]
            jobs_list.append(current)
    
    return jobs_list

# Access and open pickled job files
**Process:**
    - Iterate through the non-empty date folders available in source_dir
    - A file is saved in valid_jobs if:
        * The pickled file is a Job object
        * The job ran for more than 6 cycles (1 hour)
        * The total number of jobs saved at the end of the previous date folder is less than 1000
            _This is purely to keep the computations manageable according to compute time requested_
    - Exceptions are skipped

In [9]:
target_date = dates_list[23]
target_file = dates_dict[ target_date ]
jobids = prep_target( target_file )
n = len(jobids)

In [10]:
job_objects = []
t0 = clock.time()
total = 0

for jobid in jobids:
    total += 1
    clear_output(wait=True)
    print("Processing file {} of {} files \t ({}% of total files)".format(total, n, np.round( total/n*100, 2)))
    
    pickle_file = open( source_dir+target_date+'/'+jobid, 'rb')
    job_file = pickle.load(pickle_file)
    job_objects.append(job_file)
    pickle_file.close()      
        
    t2 = clock.time()
    print
    print("Run time: {}s".format(np.round(t2-t0, 1)))

Processing file 1139 of 1139 files 	 (100% of total files)

Run time: 85.5s


### Loops in loops in loops (Cleaning data)
**Notes:**
    - If a value is missing from the data, it will be replaced with '0' for the purpose of this project
    - If a type of statistic was not collected on the job, that column is dropped from the DataFrame
    - Two files are created during each iteration:
         1) A .csv of the descriptive statistics for that host,job pair
         2) A full .csv of the host,job data from the formatted DataFrame
    - Naming convention: Files are labelled as '{host}_{jobid}' to support random lookup
         * A job run on multiple host nodes is processed and saved with each individual host,job pair *

In [11]:
m = float(sum([len(job.hosts.keys()) for job in job_objects]))
cut = len(job_objects)/2

In [12]:
print "Jobs:\t\t\t", len(job_objects)
print "Total Host,Job Pairs:\t", int(m)

Jobs:			1139
Total Host,Job Pairs:	2362


In [13]:
start = 800 #1500
stop = len(job_objects)

In [None]:
schemas = {}
schemas_devices = {}
job_objects[0].schemas
t0 = clock.time()
total = 0
current = 0

for job in range( start, stop ):
    clear_output(wait=True)

    # general job values
    jobid = job_objects[job]
    start = pd.to_datetime(round(jobid.start_time), unit='s').time()
    end = pd.to_datetime(round(jobid.end_time), unit='s').time()
    numCycles = len(jobid.times)
    total += 1
    type_avgs = {}
    times = []
    
    ##################################
    #  build master list of schemas  #
    ##################################
    for stat in jobid.schemas.keys():
        if stat not in schemas.keys():
             schemas[stat] = jobid.schemas[stat].keys()
    
    # iterate through each host object job was run on
    for host_name, host in jobid.hosts.iteritems():
        current += 1
        
        try:
            print("Processing hosts for job {} of {} \t ({}% of total)".format(job+1, stop, np.round( (current)/m*100, 2)))
            
            ##################################
            #    convert timestamps to dt    #
            ##################################
            times.append(start)
            for time in host.times:
                times.append( pd.to_datetime(round(time), unit='s').time() )
            times.append(end)
            
            ##################################
            #  build master list of devices  #
            ##################################
            for stat in host.stats.keys():
                if stat not in schemas_devices.keys():
                    schemas_devices[stat] = host.stats[stat].keys()
                 
            indices_all = []
            for stat,devices in schemas_devices.items():
                for device in devices:
                    for schema in schemas[stat]:
                        indices_all.append( (stat,device,schema) )
    
            all_idx = pd.MultiIndex.from_tuples(indices_all, names=['Stat', 'Device', 'Schema'])  
            all_df = pd.DataFrame( index=all_idx, columns=times ).sort_index()
            
            ##################################
            #   iterate through host.stats   #
            ##################################
            for host_name,host in jobid.hosts.items():
                for stat,devices in host.stats.items():
                    for device,cycles in devices.items():
                        for i in range(len(cycles)):
                            for j in range(len(cycles[i])):
                                try:
                                    time = times[i]
                                    schema = schemas[stat][j]
                                    all_df.loc[(stat,device,schema),time] = cycles[i][j]
                                except:
                                    next
            
            all_df.to_csv(path_or_buf=save_dir+"{}_{}.csv".format( host_name, jobid.id ))
            
        except:
            next

Processing hosts for job 800 of 1139 	 (0.25% of total)
Processing hosts for job 800 of 1139 	 (0.3% of total)
Processing hosts for job 800 of 1139 	 (0.34% of total)
Processing hosts for job 800 of 1139 	 (0.38% of total)
Processing hosts for job 800 of 1139 	 (0.42% of total)
Processing hosts for job 800 of 1139 	 (0.47% of total)
Processing hosts for job 800 of 1139 	 (0.51% of total)
Processing hosts for job 800 of 1139 	 (0.55% of total)
Processing hosts for job 800 of 1139 	 (0.59% of total)
Processing hosts for job 800 of 1139 	 (0.64% of total)
Processing hosts for job 800 of 1139 	 (0.68% of total)
Processing hosts for job 800 of 1139 	 (0.72% of total)
Processing hosts for job 800 of 1139 	 (0.76% of total)


In [None]:
# check that no job was missed
if total == m:
    print "Success!"
else:
    print len(job_objects) - total, "jobs missing"