#### Dependencies

In [1]:
# Pickle file dependencies
from tacc_stats.pickler.job_stats import Job
import cPickle as pickle

In [2]:
# System dependencies
from os import listdir
import time as clock
from IPython.display import clear_output

In [3]:
# Data manipulation dependencies
import pandas as pd

In [4]:
# Root directory of all pickled jobs via comet; it holds one directory per date.
# The trailing slash is required: the cells below build paths by plain
# string concatenation (source_dir + date).
# Alternative location, currently unused:
# new_dir = '/oasis/projects/nsf/sys200/stats/xsede_stats/archive'
source_dir = '/oasis/projects/nsf/sys200/tcooper/xsede_stats/comet_pickles/'

#### Custom data cleaning functions

In [5]:
# DESCRIPTION:
    # given a date directory, return the names of all files in it
    # that can be opened and unpickled
def drop_invalid( src_dir, date ):
    """Return the file names in a date directory that unpickle cleanly.

    Parameters:
        src_dir: path of the directory that contains the date directories
            (with or without a trailing slash).
        date: name of one date directory inside ``src_dir``.

    Returns:
        A list of file names (not full paths), in ``listdir`` order, for
        which both ``open`` and ``pickle.load`` succeeded.

    Raises:
        OSError: propagated from ``listdir`` when the date directory cannot
            be listed. Failures on individual files are skipped instead.
    """
    from os.path import join

    date_dir = join(src_dir, date)
    job_pickles = []

    for jobid in listdir(date_dir):
        try:
            # `with` closes the handle even when unpickling fails
            with open(join(date_dir, jobid), 'rb') as pickle_file:
                # NOTE(review): pickle.load can execute arbitrary code;
                # only point this at trusted archives.
                pickle.load(pickle_file)
        except Exception:
            # unreadable or corrupt entry (or a sub-directory): skip it.
            # `Exception` rather than a bare except so Ctrl-C still works.
            continue
        job_pickles.append(jobid)
    return job_pickles
            
# DESCRIPTION:
    # given the pickle file names of a date directory
    # return files for jobs with more than `minimum` recorded times
        # Note: minimum = 4  >>>  1 hour run time for job
        # (original author's note; the sampling interval is not
        #  visible here -- TODO confirm)
def drop_below( src_dir, pickles, minimum=4, date_dir=None ):
    """Return the job files whose job has more than ``minimum`` time entries.

    Parameters:
        src_dir: path of the directory that contains the date directories
            (with or without a trailing slash).
        pickles: file names inside the date directory to inspect, e.g. the
            result of ``drop_invalid``. ``None`` means "every file in the
            date directory".
        minimum: a job is kept only when ``len(job.times) > minimum``
            (strictly greater).
        date_dir: name of the date directory inside ``src_dir``. When
            omitted, the module-level variable ``date`` is used, which is
            what this function always relied on before the parameter existed.

    Returns:
        A list of the retained file names, in the order they were inspected.

    Raises:
        NameError: if ``date_dir`` is omitted and no global ``date`` exists.
        OSError: from ``listdir`` when ``pickles`` is None and the date
            directory cannot be listed.
    """
    from os.path import join

    if date_dir is None:
        # backward compatibility: existing callers loop with a global
        # variable named `date` and do not pass the directory explicitly
        date_dir = date
    folder = join(src_dir, date_dir)
    candidates = listdir(folder) if pickles is None else pickles
    job_pickles = []

    for jobid in candidates:
        try:
            # `with` closes the handle even when unpickling fails
            with open(join(folder, jobid), 'rb') as pickle_file:
                # NOTE(review): pickle.load can execute arbitrary code;
                # only point this at trusted archives.
                job = pickle.load(pickle_file)
            long_enough = len(job.times) > minimum
        except Exception:
            # unreadable file or object without usable `times`: skip it.
            # `Exception` rather than a bare except so Ctrl-C still works.
            continue
        if long_enough:
            job_pickles.append(jobid)
    return job_pickles

#### Descriptive Info (Pre-cleaning)
Collections of date directories in source_dir

In [6]:
# dates_list keeps the order in which listdir returns the parent dir's
# entries; date directories that contain nothing are left out
dates_list = []
for date in listdir(source_dir):
    if listdir(source_dir+date):
        dates_list.append(date)

In [7]:
# map every date directory to the number of potential job files inside it
size_info = dict((date, len(listdir(source_dir+date))) for date in dates_list)

# date names ordered from the fewest files to the most
by_size = sorted(size_info, key=lambda name: size_info[name])

# overall number of files to parse (kept as a float)
n = float(sum(size_info.values()))

In [8]:
# summary of the raw input; a single parenthesized argument prints the same
# text under the Python 2 print statement
print("Number of date folders to parse:\t%d" % len(dates_list))
print("Number of total files to parse:\t\t%s" % n)

Number of date folders to parse:	97
Number of total files to parse:		2199159.0


#### Descriptive Info (Post-cleaning)
Collections of date directories in source_dir, restricted to jobs that pass the cleaning functions

In [15]:
# Write one text file per date listing the job ids that survive both
# cleaning functions, one id per line. Dates that already have an output
# file are skipped, so re-running the cell resumes an interrupted pass.
dates_out_dir = './modules/data/dates(2016)/'

# list the output directory once instead of once per date
dates_done = set(listdir(dates_out_dir))

# NOTE: the loop variable has to be called `date`: drop_below is called
# without a date argument and picks up the global `date`.
for date in size_info:
    if date in dates_done:
        continue
    pickles = drop_invalid( source_dir, date )
    valid = drop_below( source_dir, pickles )
    # the with-block closes fp on exit; no explicit close() is needed
    with open(dates_out_dir+date, 'wb') as fp:
        for jobid in valid:
            fp.write('%s\n' % jobid)

In [16]:
# number of entries currently present in the per-date output directory
len(listdir('./modules/data/dates(2016)/'))

97

In [None]:
# Map each date to the list of job files that pass both cleaning functions.
target_files = {}

# NOTE: the loop variable has to be called `date`: drop_below is called
# without a date argument and picks up the global `date`.
for date in dates_list:
    pickles = drop_invalid( source_dir, date )
    acceptable = drop_below( source_dir, pickles )
    # dates without any acceptable job get no entry at all
    if acceptable: target_files[date] = acceptable

In [None]:
# map every retained date to the number of job files that passed cleaning
size_info = dict((date, len(jobs)) for date, jobs in target_files.items())

# date names ordered from the most files to the fewest
by_size = sorted(size_info, key=lambda name: size_info[name], reverse=True)

# overall number of files to parse (kept as a float)
n = float(sum(size_info.values()))

In [None]:
# summary of the cleaned input; a single parenthesized argument prints the
# same text under the Python 2 print statement
print("Number of date folders to parse:\t%d" % len(target_files))
print("Number of total files to parse:\t%s" % n)