# Script to process a job.yaml file into a csv

In [59]:
import pandas as pd
import numpy as np
import datetime as dt

#### Extracting the relevant job information from YAML files

In [None]:
def open_file(file):
    """Collect the lines of a job YAML file that mention a tracked keyword.

    The file is scanned as plain text (it is not parsed as YAML). A line is
    kept when it contains at least one of the keywords below; each kept line
    appears exactly once, in file order, with surrounding whitespace removed.

    Args:
        file: Path of the YAML file to read.

    Returns:
        A list of stripped lines, e.g. ``['id: 6a0d...', 'method: blastn']``.
    """
    # Keywords in yaml file to be found
    keywords = (
        'submitted_at:',
        'id:',
        'type:',
        'method:',
        'nsequences:',
        'ncharacters:',
        'completed_at:',
        'num_threads:',
        'query_length:',
        'databases_ncharacters_total:',
    )

    with open(file, 'r') as job:
        # `any` keeps a line once even when it matches several keywords;
        # appending per matching keyword would duplicate it (and, for
        # example, double-count an `nsequences:` line downstream).
        return [
            line.strip()
            for line in job
            if any(keyword in line for keyword in keywords)
        ]

###### Dataset definition 

In [None]:
# NOTE(review): open_file is defined with one required parameter (`file`), so
# calling it without an argument raises TypeError. Pass the path of the job
# YAML file here — TODO confirm the intended path.
data = open_file()

##### Storage for necessary parameters

In [61]:
# Empty dictionary that the later cells fill with the job's parameters
job = dict()

##### Creating and collating the necessary parameters

In [85]:
### Pull the directly readable fields: id, method, type, num_threads, query_length, and database length

# One branch per parameter; at most one branch runs for each line.
# The file has several `id:` entries, so `has_id` records that the first
# (main) one has already been captured and later ones are not taken as the job id.
has_id = False
for line in data:
    if not has_id and 'id:' in line:
        # Job id: only the first occurrence is used
        idn = line.replace('id: ', '')
        has_id = True
    elif 'method:' in line:
        method = line.replace('method: ', '')
    elif 'type: ' in line:
        tipo = line.replace('type: ', '')
    elif 'num_threads:' in line:
        num_threads = int(line.replace('num_threads: ', ''))
    elif 'query_length:' in line:
        query_length = int(line.replace('query_length: ', ''))
    elif 'databases_ncharacters_total:' in line:
        databases_ncharacters = int(line.replace('databases_ncharacters_total: ', ''))
        # Scanning stops here: lines after this one are not inspected
        break
        
# print(idn,method,tipo,num_threads,query_length,databases_ncharacters)


6a0d7129-3b85-4a0b-b5de-30a7837bdba7 blastn nucleotide 1 883 2150034


###### Writing functions to extract the more complex values: total sequences, job duration, and job_duration bins

In [63]:
### Total number of sequences in the job
# Adds up every `nsequences:` entry
def total_sequences(lista):
    '''
        Receives a list of strings, picks the entries containing `nsequences:`,
        removes the `nsequences: ` label and any single quotes, and returns the
        sum of the remaining values as an integer (0 when there is no such entry).
    '''
    return sum(
        int(entry.replace('nsequences: ', '').replace("'", ''))
        for entry in lista
        if 'nsequences:' in entry
    )

In [64]:
### Job duration: locate the submission and completion timestamps

# Both checks run on every line, and a later match overwrites an earlier one,
# so the last `submitted_at` / `completed_at` entry in the data is the one kept.
for line in data:
    if 'submitted_at' in line:
        start = line.replace('submitted_at: ', '')
    if 'completed_at' in line:
        end = line.replace('completed_at: ', '')

# Function - returns a dictionary with time data necessary for delta time package
def time_filter(time):
    '''
        Receives a timestamp string such as
        `2021-05-10 14:23:45.123456789 +01:00` and returns a dictionary with the
        integer fields needed to build a datetime object.

        The date and the clock are taken from the first two whitespace-separated
        fields. The fractional seconds are optional: they are right-padded with
        zeros and cut to six digits, so `.5` means 500000 microseconds and
        anything finer than a microsecond is truncated.

        The UTC offset (`+01:00`, `-05:00`, `Z`) is accepted but discarded, so
        two timestamps are only comparable when they share the same offset.

        Returns a dictionary with the keys `year`, `month`, `day`, `hour`,
        `minute`, `seconds` and `micro`.

        Raises ValueError when the string does not have a date and a clock part
        or when one of the fields is not a number.
    '''
    fields = time.split()
    if len(fields) < 2:
        raise ValueError('expected "<date> <time> [offset]", got %r' % (time,))
    date_part, clock_part = fields[0], fields[1]

    # An offset may be glued to the clock (e.g. `14:23:45.123+01:00`); the clock
    # itself never contains these characters, so cut at the first one found.
    for marker in ('+', '-', 'Z'):
        clock_part = clock_part.split(marker)[0]

    year, month, day = (int(piece) for piece in date_part.split('-'))
    hour_text, minute_text, seconds_text = clock_part.split(':')
    whole_seconds, _, fraction = seconds_text.partition('.')

    dic = {
        'year': year,
        'month': month,
        'day': day,
        'hour': int(hour_text),
        'minute': int(minute_text),
        'seconds': int(whole_seconds),
        # Pad before cutting so short fractions keep their magnitude
        'micro': int((fraction + '000000')[:6]),
    }

    return dic

In [98]:
### Difference between the end and the start timestamps

def delta_time(st,et):
    '''
        Receives two dictionaries (start, end) holding the keys `year`, `month`,
        `day`, `hour`, `minute`, `seconds` and `micro`, builds a datetime from
        each one and returns end minus start as a timedelta.
    '''
    # Order in which dt.datetime expects its positional arguments
    field_order = ('year', 'month', 'day', 'hour', 'minute', 'seconds', 'micro')

    began = dt.datetime(*[st[name] for name in field_order])
    finished = dt.datetime(*[et[name] for name in field_order])

    return finished - began

In [100]:
# Combines time_filter and delta_time: duration as a timedelta and in whole minutes
def job_duration(start_time,end_time):
    '''
        Receives 2 datetime strings (start, end) and returns a tuple:
        [0] the difference as a timedelta, [1] the same difference in whole
        minutes, rounded down and returned as a float.
    '''
    delta = delta_time(time_filter(start_time), time_filter(end_time))

    # divmod gives (whole minutes, leftover seconds); only the minutes are kept
    delta_in_minutes, _ = divmod(delta.total_seconds(), 60)

    return delta, delta_in_minutes

In [94]:
# Returns the time difference
#time_diff = delta_time(time_filter(start),time_filter(end))
#print(time_diff)

#time_diff_minutes = job_duration(start,end)
#print(time_diff_minutes)

0:00:00.546165
(datetime.timedelta(microseconds=546165), 0.0)


In [67]:
### Function to assign a job duration into a category
def duration_bins(datetime_object):
    '''
        Receives a timedelta and returns a string naming the duration bin it
        falls into.

        Every bin includes its upper limit and starts right after the limit of
        the previous bin, so the bins cover all durations without gaps:
        below 1 minute -> 'Less than 1 minute', 1 minute up to and including
        5 minutes -> '1 to 5 minutes', ..., anything above 24 hours ->
        'More than 1 day'. Negative durations are reported as
        'Less than 1 minute'.
    '''
    if datetime_object < dt.timedelta(minutes=1):
        return 'Less than 1 minute'

    # (inclusive upper limit, label), in ascending order
    upper_limits = (
        (dt.timedelta(minutes=5), '1 to 5 minutes'),
        (dt.timedelta(minutes=10), '5 to 10 minutes'),
        (dt.timedelta(minutes=30), '10 to 30 minutes'),
        (dt.timedelta(hours=1), '30 to 60 minutes'),
        (dt.timedelta(hours=2), '1 to 2 hours'),
        (dt.timedelta(hours=5), '2 to 5 hours'),
        (dt.timedelta(hours=10), '5 to 10 hours'),
        (dt.timedelta(hours=15), '10 to 15 hours'),
        (dt.timedelta(hours=24), '15 to 24 hours'),
    )

    # The first limit that is not exceeded names the bin
    for limit, label in upper_limits:
        if datetime_object <= limit:
            return label

    return 'More than 1 day'

In [68]:
# Testing the duration_bins function
#print(duration_bins(dt.timedelta(minutes=852)))

10 to 15 hours


##### Adding information to the job dictionary 

In [95]:
# Add the necessary data to the job dictionary
job['id'] = idn
job['method'] = method
job['type'] = tipo
job['nsequences'] = total_sequences(data)
job['num_threads'] = num_threads
job['query_length'] = query_length
job['databases_ncharacters_total'] = databases_ncharacters
# job_duration returns (timedelta, whole minutes): parse the timestamps once
# and use both parts instead of calling it once per field
elapsed, elapsed_minutes = job_duration(start, end)
job['duration'] = elapsed_minutes
job['duration_window'] = duration_bins(elapsed)
print(job)

# id: unique job id
# method: blast algorithm used
# type: nucleotide or amino acid sequence
# nsequences: number of sequences in the job
# num_threads: number of threads used for the job
# query_length: total number of characters in the queried sequences
# databases_ncharacters_total: total number of characters in the queried databases
# duration: job duration in whole minutes
# duration_window: estimated duration bin, as a string

{'id': '6a0d7129-3b85-4a0b-b5de-30a7837bdba7', 'method': 'blastn', 'type': 'nucleotide', 'nsequences': 16, 'num_threads': 1, 'query_length': 883, 'databases_ncharacters_total': 2150034, 'duration': 0.0, 'duration_window': 'Less than 1 minute'}


In [131]:
# Put everything into a dataframe: one row (index 0) built from the job dictionary.
# The former `df.append(job, ignore_index=True)` call was dropped: its result was
# never assigned, so it did not change `df`, and DataFrame.append no longer exists
# in pandas 2.0+. To add further jobs, build one frame per job and combine them
# with `pd.concat([...], ignore_index=True)`.
df = pd.DataFrame(job, index=[0])
print(df)

                                     id  method        type  nsequences  \
0  6a0d7129-3b85-4a0b-b5de-30a7837bdba7  blastn  nucleotide          16   

   num_threads  query_length  databases_ncharacters_total  duration  \
0            1           883                      2150034       0.0   

      duration_window  
0  Less than 1 minute  


In [126]:
# Export into a csv when dataframe is complete.
#file = df.to_csv('testfile.tsv', sep='\t')