In [1]:
import numpy as np 
import pandas as pd
import os
import datetime
from tqdm import tqdm

In [40]:
#os.getcwd()

In [2]:
# List the raw trace files available in the data directory
os.listdir('../Google-Data-411')

['Google cluster-usage traces format schema 2014-11-17 external.pdf',
 'machine_events.csv',
 'part-01_job_events.csv',
 'part-01_task_events.csv',
 'part-01_task_usage.csv',
 'part-02_job_events.csv',
 'part-02_task_events.csv',
 'part-02_task_usage.csv',
 'part-03_job_events.csv',
 'part-03_task_events.csv',
 'part-03_task_usage.csv',
 'part-04_job_events.csv',
 'part-04_task_events.csv',
 'part-04_task_usage.csv',
 'part-05_job_events.csv',
 'part-05_task_events.csv',
 'part-05_task_usage.csv',
 'part-06_job_events.csv',
 'part-06_task_events.csv',
 'part-06_task_usage.csv',
 'part-07_job_events.csv',
 'part-07_task_events.csv',
 'part-07_task_usage.csv']

In [90]:
DATA_DIR = '../Google-Data-411'  # directory holding the raw trace CSV parts

job_events = []  # one DataFrame per part file; concatenated below
task_events = []
task_usage = []

# Build every path explicitly instead of os.chdir()-ing into the data directory
# and back: the working directory is never changed, so a failed read cannot
# leave the kernel stranded in the data directory, and the name of the notebook
# folder is no longer hard-coded.
# sorted() makes the concatenation order (and therefore every later
# first-appearance factorisation) independent of the filesystem's listing order.
for file_name in sorted(os.listdir(DATA_DIR)):
    file_path = os.path.join(DATA_DIR, file_name)
    if 'job' in file_name:  # part-XX_job_events.csv
        job_events.append(pd.read_csv(file_path, header=None))
    elif 'task_events' in file_name:  # part-XX_task_events.csv
        task_events.append(pd.read_csv(file_path, header=None))
    elif 'task_usage' in file_name:  # part-XX_task_usage.csv
        task_usage.append(pd.read_csv(file_path, header=None))

# Concatenate the parts into one big DataFrame per table.
# Each part keeps its own row labels, so index labels repeat across parts.
job_events = pd.concat(job_events)
task_events = pd.concat(task_events)
task_usage = pd.concat(task_usage)

In [91]:
# machine_events is a single file (not split into parts); read it without a header row
machine_events = pd.read_csv('../Google-Data-411/machine_events.csv', header=None)


In [92]:
#
# Report the (rows, columns) dimensions of each concatenated DataFrame
#

for label, frame in (
    ('Job events dimensions: ', job_events),
    ('Task events dimensions: ', task_events),
    ('Task usage dimensions: ', task_usage),
):
    print(label, frame.shape)

Job events dimensions:  (21161, 8)
Task events dimensions:  (1244596, 13)
Task usage dimensions:  (17944928, 20)


# Job Events

In [93]:
#
# Drop the NaN column (labelled 1) and name the seven remaining columns
#

job_event_columns = [
    'time',
    'jobID',
    'eventType',
    'userName',
    'schedClass',
    'jobName',
    'logicalJobName',
]
job_events = job_events.drop(columns=[1])
job_events.columns = job_event_columns
job_events.head()  # preview data

Unnamed: 0,time,jobID,eventType,userName,schedClass,jobName,logicalJobName
0,20646876178,6252924356,0,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,2,JPCEeQVfAM84156WazlpF2mbNCIH6JNAEdWKADGMW6M=,G/9E4AW9fSviXbmdFO5BBcjVd49zuI1AIU5gHQJLm+8=
1,20647976107,6252809803,3,fVUqjeDqXrzcJObPpFT8U1FwH8uBfyRXJQa4zRlwpb4=,1,7cLW+6pt2CHJM01krasXKfXt1ArqbEMuwVseGSpclus=,kzI7CKSyq7J04F06rtsufIYDPvFxRDHISFXAf2Qav7o=
2,20648680067,6252924356,1,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,2,JPCEeQVfAM84156WazlpF2mbNCIH6JNAEdWKADGMW6M=,G/9E4AW9fSviXbmdFO5BBcjVd49zuI1AIU5gHQJLm+8=
3,20648680072,6252924415,0,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,0,/9RMVijX6saTjLFPA6Npk+8JAvFpdgBL7M1O0FMFBf0=,QdrSScSXyIpoqR4skT9QFLlVShlxNDTFRBhIWZRq2SM=
4,20648703418,6252897088,4,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,2,pmE/ir4WZoTI/I8Zx1FxhEWDHH5S8IiMxASLJjUfjLA=,G/9E4AW9fSviXbmdFO5BBcjVd49zuI1AIU5gHQJLm+8=


In [94]:
# Share of rows whose jobID is distinct (a job appears once per event it goes through)
unique_job_count = job_events['jobID'].nunique(dropna=False)
print('Percentage of unique jobIDs events: ', 100 * unique_job_count / job_events.shape[0])

Percentage of unique jobIDs events:  34.68645149095033


In [95]:
#
# Replace userName, jobName, logicalJobName and jobID with integer codes,
# then turn the microsecond offset in `time` into a timestamp
#
for label, column in (
    ('# Unique Names: ', 'userName'),
    ('# Unique Job Names: ', 'jobName'),
    ('# Unique Logical Job Names: ', 'logicalJobName'),
):
    print(label, job_events[column].nunique(dropna=False))

# factorize numbers values in order of first appearance within THIS frame only.
# NOTE(review): the same raw jobID/userName therefore gets a different code in
# any other frame that is factorised separately - confirm no cross-table join
# relies on these codes.
factorised_names, original_names = job_events['userName'].factorize()
factorised_job_names, original_job_names = job_events['jobName'].factorize()
factorised_logical_job_names, original_logical_job_names = job_events['logicalJobName'].factorize()
factorised_job_ids, original_job_ids = job_events['jobID'].factorize()

job_events['userName'] = factorised_names
job_events['jobName'] = factorised_job_names
job_events['logicalJobName'] = factorised_logical_job_names
job_events['jobID'] = factorised_job_ids

# `time` holds microseconds counted from the chosen origin date
job_events['time'] = pd.to_datetime(job_events['time'], unit='us', origin='2011-05-01')
job_events.head()

# Unique Names:  119
# Unique Job Names:  3500
# Unique Logical Job Names:  2498


Unnamed: 0,time,jobID,eventType,userName,schedClass,jobName,logicalJobName
0,2011-05-01 05:44:06.876178,0,0,0,2,0,0
1,2011-05-01 05:44:07.976107,1,3,1,1,1,1
2,2011-05-01 05:44:08.680067,0,1,0,2,0,0
3,2011-05-01 05:44:08.680072,2,0,0,0,2,2
4,2011-05-01 05:44:08.703418,3,4,0,2,3,0


In [96]:
# Creating key column
#
# The key is a string made of three parts joined without a separator:
#   1. the last three characters of the timestamp's string form, REVERSED
#   2. the (factorised) jobID
#   3. the eventType
# It is built with vectorised string operations instead of a Python loop over
# every row; the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only three
# timestamp characters are used, so different rows can yield the same key.

time_suffix = job_events['time'].astype(str).str[::-1].str[:3]
job_events['primary_job_id'] = (
    time_suffix
    + job_events['jobID'].astype(str)
    + job_events['eventType'].astype(str)
)
job_events.head()


Unnamed: 0,time,jobID,eventType,userName,schedClass,jobName,logicalJobName,primary_job_id
0,2011-05-01 05:44:06.876178,0,0,0,2,0,0,87100
1,2011-05-01 05:44:07.976107,1,3,1,1,1,1,70113
2,2011-05-01 05:44:08.680067,0,1,0,2,0,0,76001
3,2011-05-01 05:44:08.680072,2,0,0,0,2,2,27020
4,2011-05-01 05:44:08.703418,3,4,0,2,3,0,81434


# Task Events

In [97]:
# Preview the raw task events; the columns are still labelled by position
task_events.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,20646168368,,515042969,17,,5,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
1,20646168371,,515042969,17,,0,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
2,20646876180,,6252924356,0,,0,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,2,8,0.06873,0.04773,3.8e-05,0
3,20646899409,,515042969,2,,5,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
4,20646899412,,515042969,2,,0,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0


In [98]:
#
# Drop the empty column labelled 1 and name the twelve remaining columns
#

task_event_columns = [
    'time',
    'jobID',
    'taskIndex',
    'machineID',
    'eventType',
    'userName',
    'schedulingClass',
    'priority',
    'CPU',
    'RAM',
    'Disk',
    'machineConstraint',
]
task_events = task_events.drop(columns=[1])
task_events.columns = task_event_columns
task_events.head()  # preview data

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint
0,20646168368,515042969,17,,5,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
1,20646168371,515042969,17,,0,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
2,20646876180,6252924356,0,,0,r/Al6kYJOwZITr6wi4pAlEwyGv5TM2EkJ8woA5hszeA=,2,8,0.06873,0.04773,3.8e-05,0
3,20646899409,515042969,2,,5,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0
4,20646899412,515042969,2,,0,/fk1fVcVxZ6iM6gHZzqbIyq56m5zrmHfpdcZ/zzkq4c=,2,0,0.01562,0.01553,0.000215,0


In [99]:
#
# Share of task events that have no machine ID (NaN)
#
missing_machine_count = task_events['machineID'].isna().sum()
print('Percentage of NaN machine IDs: ', 100 * (missing_machine_count / len(task_events)))

Percentage of NaN machine IDs:  37.01112650209385


### Consider either removing the task events that have no machine ID (NaN) or flagging them explicitly during analysis

In [100]:
#
# Replace machineID, userName and jobID with integer codes.
# factorize encodes a missing (NaN) machineID as -1.
#
# NOTE(review): codes follow first appearance within this frame only, so they
# do not match the codes of any other frame that is factorised separately.
factorised_machine_ids, original_machine_ids = task_events['machineID'].factorize()
factorised_usernames, original_usernames = task_events['userName'].factorize()
factorised_jobid_task, original_jobid_task = task_events['jobID'].factorize()

task_events['machineID'] = factorised_machine_ids
task_events['userName'] = factorised_usernames
task_events['jobID'] = factorised_jobid_task
task_events.head()

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint
0,20646168368,0,17,-1,5,0,2,0,0.01562,0.01553,0.000215,0
1,20646168371,0,17,-1,0,0,2,0,0.01562,0.01553,0.000215,0
2,20646876180,1,0,-1,0,1,2,8,0.06873,0.04773,3.8e-05,0
3,20646899409,0,2,-1,5,0,2,0,0.01562,0.01553,0.000215,0
4,20646899412,0,2,-1,0,0,2,0,0.01562,0.01553,0.000215,0


In [101]:
#
# Build task_events_clean: only the rows that have a machine ID
#

# Missing machine IDs were encoded as -1, so despite its name this mask is
# True for the rows to KEEP (machine ID present).
nan_values = task_events['machineID'].ne(-1).to_numpy()
task_events_NA = task_events  # same object as task_events (an alias, not a copy)
# Keep the selected rows and renumber them 0..n-1, discarding the old row labels
task_events_clean = task_events[nan_values].reset_index(drop=True)

In [102]:
#
# Give every (jobID, taskIndex) pair its own integer processID
#
# The pair is packed into one exact integer, jobID * stride + taskIndex, where
# stride is larger than every taskIndex in the frame, so two different pairs
# can never share a value. The earlier float trick (jobID + 0.00001*taskIndex)
# collides as soon as a taskIndex reaches 100000 and depends on float rounding.
# pd.factorize then numbers the packed values in order of first appearance.
# The *_unique variables now hold the packed integers rather than floats.

def _factorize_process_pairs(frame):
    """Return ``pd.factorize`` output (codes, uniques) for the (jobID, taskIndex) pairs of ``frame``."""
    task_index = frame['taskIndex'].astype('int64')
    # an empty frame has no maximum; any positive stride works in that case
    stride = int(task_index.max()) + 1 if len(frame) else 1
    return pd.factorize(frame['jobID'].astype('int64') * stride + task_index)

task_combos_full, task_combo_unique_full = _factorize_process_pairs(task_events)
task_combos, task_combo_unique = _factorize_process_pairs(task_events_clean)
task_events_clean['processID'] = task_combos
task_events['processID'] = task_combos_full

In [103]:
#
# Sanity check: processID must distinguish exactly as many pairs as pandas finds
#
pair_count_ours = task_events_clean['processID'].nunique(dropna=False)
pair_count_pandas = len(task_events_clean[['jobID', 'taskIndex']].drop_duplicates())
print('# Unique pairs in our method ', pair_count_ours)
print('# Unique pairs using pandas function ', pair_count_pandas)

# Unique pairs in our method  266605
# Unique pairs using pandas function  266605


In [104]:
#
# Convert the microsecond offset in `time` to a timestamp in both frames
#
for frame in (task_events, task_events_clean):
    frame['time'] = pd.to_datetime(frame['time'], unit='us', origin='2011-05-01')

task_events_clean.head()

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint,processID
0,2011-05-01 05:44:06.899535,2,1115,0,3,2,0,0,0.06873,0.01193,0.000115,0,0
1,2011-05-01 05:44:06.899762,3,2,1,4,1,0,2,0.06873,0.008774,3.8e-05,0,1
2,2011-05-01 05:44:06.899918,3,8,2,4,1,0,2,0.06873,0.008774,3.8e-05,0,2
3,2011-05-01 05:44:07.630135,0,17,3,1,0,2,0,0.01562,0.01553,0.000215,0,3
4,2011-05-01 05:44:07.975760,3,6,4,4,1,0,2,0.06873,0.008774,3.8e-05,0,4


In [105]:
# Creating key column (job-level key for task events)
#
# Key = last three characters of the timestamp string, REVERSED
#       + jobID + eventType, joined without a separator.
# Built with vectorised string operations instead of a per-row Python loop;
# the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only three
# timestamp characters are used, so different rows can yield the same key.

time_suffix = task_events_clean['time'].astype(str).str[::-1].str[:3]
task_events_clean['primary_job_id'] = (
    time_suffix
    + task_events_clean['jobID'].astype(str)
    + task_events_clean['eventType'].astype(str)
)
task_events_clean.head()

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint,processID,primary_job_id
0,2011-05-01 05:44:06.899535,2,1115,0,3,2,0,0,0.06873,0.01193,0.000115,0,0,53523
1,2011-05-01 05:44:06.899762,3,2,1,4,1,0,2,0.06873,0.008774,3.8e-05,0,1,26734
2,2011-05-01 05:44:06.899918,3,8,2,4,1,0,2,0.06873,0.008774,3.8e-05,0,2,81934
3,2011-05-01 05:44:07.630135,0,17,3,1,0,2,0,0.01562,0.01553,0.000215,0,3,53101
4,2011-05-01 05:44:07.975760,3,6,4,4,1,0,2,0.06873,0.008774,3.8e-05,0,4,6734


In [106]:
# Creating key column (machine-level key for task events)
#
# Key = last three characters of the timestamp string, REVERSED
#       + machineID + eventType, joined without a separator.
# Built with vectorised string operations instead of a per-row Python loop;
# the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only three
# timestamp characters are used, so different rows can yield the same key.

time_suffix = task_events_clean['time'].astype(str).str[::-1].str[:3]
task_events_clean['primary_machine_id'] = (
    time_suffix
    + task_events_clean['machineID'].astype(str)
    + task_events_clean['eventType'].astype(str)
)
task_events_clean.head()

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint,processID,primary_job_id,primary_machine_id
0,2011-05-01 05:44:06.899535,2,1115,0,3,2,0,0,0.06873,0.01193,0.000115,0,0,53523,53503
1,2011-05-01 05:44:06.899762,3,2,1,4,1,0,2,0.06873,0.008774,3.8e-05,0,1,26734,26714
2,2011-05-01 05:44:06.899918,3,8,2,4,1,0,2,0.06873,0.008774,3.8e-05,0,2,81934,81924
3,2011-05-01 05:44:07.630135,0,17,3,1,0,2,0,0.01562,0.01553,0.000215,0,3,53101,53131
4,2011-05-01 05:44:07.975760,3,6,4,4,1,0,2,0.06873,0.008774,3.8e-05,0,4,6734,6744


In [107]:
# Creating key column (task-level key for task events)
#
# Key = last three characters of the timestamp string, REVERSED
#       + jobID + taskIndex, joined without a separator.
# Built with vectorised string operations instead of a per-row Python loop;
# the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only three
# timestamp characters are used, so different rows can yield the same key.

time_suffix = task_events_clean['time'].astype(str).str[::-1].str[:3]
task_events_clean['primary_task_id'] = (
    time_suffix
    + task_events_clean['jobID'].astype(str)
    + task_events_clean['taskIndex'].astype(str)
)
task_events_clean.head()

Unnamed: 0,time,jobID,taskIndex,machineID,eventType,userName,schedulingClass,priority,CPU,RAM,Disk,machineConstraint,processID,primary_job_id,primary_machine_id,primary_task_id
0,2011-05-01 05:44:06.899535,2,1115,0,3,2,0,0,0.06873,0.01193,0.000115,0,0,53523,53503,53521115
1,2011-05-01 05:44:06.899762,3,2,1,4,1,0,2,0.06873,0.008774,3.8e-05,0,1,26734,26714,26732
2,2011-05-01 05:44:06.899918,3,8,2,4,1,0,2,0.06873,0.008774,3.8e-05,0,2,81934,81924,81938
3,2011-05-01 05:44:07.630135,0,17,3,1,0,2,0,0.01562,0.01553,0.000215,0,3,53101,53131,531017
4,2011-05-01 05:44:07.975760,3,6,4,4,1,0,2,0.06873,0.008774,3.8e-05,0,4,6734,6744,6736


# Task Usage

In [108]:
# Show the (rows, columns) count and the first rows of the raw usage table;
# the columns are still labelled by position
print(task_usage.shape)
task_usage.head()

(17944928, 20)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,20645000000,20700000000,4665896876,394,317508493,0.04895,0.06689,0.07715,0.004684,0.005333,0.06702,0.000104,0.000378,0.07971,0.002571,1.222,0.001147,0,0,0.03986
1,20645000000,20700000000,6176114691,80,2912464652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0,0,0.0
2,20645000000,20700000000,6206862915,49,635988945,0.000533,0.04016,0.04266,0.000165,0.000305,0.04041,2.5e-05,8e-05,0.003906,0.001215,3.805,0.0113,0,0,0.000468
3,20645000000,20700000000,6238340468,541,1301873,0.03168,0.00647,0.008163,0.000302,0.000536,0.007393,5.7e-05,0.0,0.08252,0.001429,1.286,0.001875,0,0,0.08044
4,20645000000,20700000000,6238340468,1424,3890452312,0.007507,0.01242,0.01355,0.000123,0.000375,0.01248,1.3e-05,3e-06,0.06531,0.000143,1.378,0.002397,0,0,0.00547


In [109]:
# Name the twenty usage columns (in file order)
task_usage_columns = [
    'first',
    'last',
    'jobID',
    'taskIndex',
    'machineID',
    'cpuMeanUsage',
    'canonicalMemUsage',
    'assignedMemUsage',
    'unmappedCacheMemUsage',
    'totalCacheMemUsage',
    'maxMemUsage',
    'meanDiskTime',
    'meanDiskSpaceUsed',
    'cpuMaxUsage',
    'maxDiskTime',
    'cyclesPerInstruction',
    'memAccessPerInstruction',
    'samplePortion',
    'aggType',
    'cpuSampledUsage',
]
task_usage.columns = task_usage_columns

In [110]:
#
# Discard every usage row that has a NaN in at least one column
#

task_usage = task_usage.dropna(axis=0, how='any')

In [111]:
# Show the current (rows, columns) count and the first rows of the usage table
print(task_usage.shape)
task_usage.head()

(15126583, 20)


Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,maxMemUsage,meanDiskTime,meanDiskSpaceUsed,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage
0,20645000000,20700000000,4665896876,394,317508493,0.04895,0.06689,0.07715,0.004684,0.005333,0.06702,0.000104,0.000378,0.07971,0.002571,1.222,0.001147,0,0,0.03986
2,20645000000,20700000000,6206862915,49,635988945,0.000533,0.04016,0.04266,0.000165,0.000305,0.04041,2.5e-05,8e-05,0.003906,0.001215,3.805,0.0113,0,0,0.000468
3,20645000000,20700000000,6238340468,541,1301873,0.03168,0.00647,0.008163,0.000302,0.000536,0.007393,5.7e-05,0.0,0.08252,0.001429,1.286,0.001875,0,0,0.08044
4,20645000000,20700000000,6238340468,1424,3890452312,0.007507,0.01242,0.01355,0.000123,0.000375,0.01248,1.3e-05,3e-06,0.06531,0.000143,1.378,0.002397,0,0,0.00547
5,20645000000,20700000000,6238340468,1971,905071,0.01056,0.01036,0.01155,0.000117,0.000368,0.01041,2.1e-05,3e-06,0.06995,0.000215,1.648,0.003659,0,0,0.005974


In [112]:
#
# Give every (jobID, taskIndex) pair in task_usage its own integer processID
#
# The pair is packed into one exact integer, jobID * stride + taskIndex, where
# stride is larger than every taskIndex, so two different pairs can never share
# a value. The earlier float trick (jobID + 0.01*taskIndex) is not safe here:
# jobID is still the raw ID and any taskIndex >= 100 spills into the integer
# part, so e.g. (J, 100) and (J + 1, 0) received the same number.
# pd.factorize then numbers the packed values in order of first appearance.
# usage_unique_combos now holds the packed integers rather than floats.
# The product must stay below 2**63 for int64 (assumes jobID * stride fits - TODO confirm).

usage_task_index = task_usage['taskIndex'].astype('int64')
# an empty frame has no maximum; any positive stride works in that case
usage_stride = int(usage_task_index.max()) + 1 if len(task_usage) else 1
usage_pair_key = task_usage['jobID'].astype('int64') * usage_stride + usage_task_index

usage_combos_df, usage_unique_combos = pd.factorize(usage_pair_key)
task_usage['processID'] = usage_combos_df

task_usage.head()

Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,...,meanDiskTime,meanDiskSpaceUsed,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage,processID
0,20645000000,20700000000,4665896876,394,317508493,0.04895,0.06689,0.07715,0.004684,0.005333,...,0.000104,0.000378,0.07971,0.002571,1.222,0.001147,0,0,0.03986,0
2,20645000000,20700000000,6206862915,49,635988945,0.000533,0.04016,0.04266,0.000165,0.000305,...,2.5e-05,8e-05,0.003906,0.001215,3.805,0.0113,0,0,0.000468,1
3,20645000000,20700000000,6238340468,541,1301873,0.03168,0.00647,0.008163,0.000302,0.000536,...,5.7e-05,0.0,0.08252,0.001429,1.286,0.001875,0,0,0.08044,2
4,20645000000,20700000000,6238340468,1424,3890452312,0.007507,0.01242,0.01355,0.000123,0.000375,...,1.3e-05,3e-06,0.06531,0.000143,1.378,0.002397,0,0,0.00547,3
5,20645000000,20700000000,6238340468,1971,905071,0.01056,0.01036,0.01155,0.000117,0.000368,...,2.1e-05,3e-06,0.06995,0.000215,1.648,0.003659,0,0,0.005974,4


In [113]:
#
# Sanity check: processID must distinguish exactly as many pairs as pandas finds
#
usage_pair_count_ours = task_usage['processID'].nunique(dropna=False)
usage_pair_count_pandas = len(task_usage[['jobID', 'taskIndex']].drop_duplicates())
print('# Unique pairs in our method ', usage_pair_count_ours)
print('# Unique pairs using pandas function ', usage_pair_count_pandas)

# Unique pairs in our method  301578
# Unique pairs using pandas function  301578


In [114]:
#
# Convert the microsecond offsets in `first` and `last` to timestamps
#
for column in ('first', 'last'):
    task_usage[column] = pd.to_datetime(task_usage[column], unit='us', origin='2011-05-01')

task_usage.head()

Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,...,meanDiskTime,meanDiskSpaceUsed,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage,processID
0,2011-05-01 05:44:05,2011-05-01 05:45:00,4665896876,394,317508493,0.04895,0.06689,0.07715,0.004684,0.005333,...,0.000104,0.000378,0.07971,0.002571,1.222,0.001147,0,0,0.03986,0
2,2011-05-01 05:44:05,2011-05-01 05:45:00,6206862915,49,635988945,0.000533,0.04016,0.04266,0.000165,0.000305,...,2.5e-05,8e-05,0.003906,0.001215,3.805,0.0113,0,0,0.000468,1
3,2011-05-01 05:44:05,2011-05-01 05:45:00,6238340468,541,1301873,0.03168,0.00647,0.008163,0.000302,0.000536,...,5.7e-05,0.0,0.08252,0.001429,1.286,0.001875,0,0,0.08044,2
4,2011-05-01 05:44:05,2011-05-01 05:45:00,6238340468,1424,3890452312,0.007507,0.01242,0.01355,0.000123,0.000375,...,1.3e-05,3e-06,0.06531,0.000143,1.378,0.002397,0,0,0.00547,3
5,2011-05-01 05:44:05,2011-05-01 05:45:00,6238340468,1971,905071,0.01056,0.01036,0.01155,0.000117,0.000368,...,2.1e-05,3e-06,0.06995,0.000215,1.648,0.003659,0,0,0.005974,4


In [115]:
#
# Replace jobID, machineID and taskIndex with integer codes
#
# NOTE(review): codes follow first appearance within this frame only, so they
# do not match the codes of any other frame that is factorised separately.
factorised_jobids_usage, original_jobids_usage = task_usage['jobID'].factorize()
factorised_machineid_usage, original_machineid_usage = task_usage['machineID'].factorize()
factorised_taskid_usage, original_taskid_usage = task_usage['taskIndex'].factorize()

task_usage['jobID'] = factorised_jobids_usage
task_usage['machineID'] = factorised_machineid_usage
task_usage['taskIndex'] = factorised_taskid_usage

task_usage.head()

Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,...,meanDiskTime,meanDiskSpaceUsed,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage,processID
0,2011-05-01 05:44:05,2011-05-01 05:45:00,0,0,0,0.04895,0.06689,0.07715,0.004684,0.005333,...,0.000104,0.000378,0.07971,0.002571,1.222,0.001147,0,0,0.03986,0
2,2011-05-01 05:44:05,2011-05-01 05:45:00,1,1,1,0.000533,0.04016,0.04266,0.000165,0.000305,...,2.5e-05,8e-05,0.003906,0.001215,3.805,0.0113,0,0,0.000468,1
3,2011-05-01 05:44:05,2011-05-01 05:45:00,2,2,2,0.03168,0.00647,0.008163,0.000302,0.000536,...,5.7e-05,0.0,0.08252,0.001429,1.286,0.001875,0,0,0.08044,2
4,2011-05-01 05:44:05,2011-05-01 05:45:00,2,3,3,0.007507,0.01242,0.01355,0.000123,0.000375,...,1.3e-05,3e-06,0.06531,0.000143,1.378,0.002397,0,0,0.00547,3
5,2011-05-01 05:44:05,2011-05-01 05:45:00,2,4,4,0.01056,0.01036,0.01155,0.000117,0.000368,...,2.1e-05,3e-06,0.06995,0.000215,1.648,0.003659,0,0,0.005974,4


In [118]:
# Creating key column (usage-level key)
#
# Key = last TWO characters of the `first` timestamp string, REVERSED
#       + taskIndex + machineID, joined without a separator.
# Built with vectorised string operations instead of a Python loop over every
# usage row; the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only two
# timestamp characters are used, so different rows can yield the same key.

time_suffix = task_usage['first'].astype(str).str[::-1].str[:2]
task_usage['primary_usage_id'] = (
    time_suffix
    + task_usage['taskIndex'].astype(str)
    + task_usage['machineID'].astype(str)
)
task_usage.head()

Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,...,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage,processID,primary_usage_id,primary_task_id
0,2011-05-01 05:44:05,2011-05-01 05:45:00,0,0,0,0.04895,0.06689,0.07715,0.004684,0.005333,...,0.07971,0.002571,1.222,0.001147,0,0,0.03986,0,5000,50:00
2,2011-05-01 05:44:05,2011-05-01 05:45:00,1,1,1,0.000533,0.04016,0.04266,0.000165,0.000305,...,0.003906,0.001215,3.805,0.0113,0,0,0.000468,1,5011,50:11
3,2011-05-01 05:44:05,2011-05-01 05:45:00,2,2,2,0.03168,0.00647,0.008163,0.000302,0.000536,...,0.08252,0.001429,1.286,0.001875,0,0,0.08044,2,5022,50:22
4,2011-05-01 05:44:05,2011-05-01 05:45:00,2,3,3,0.007507,0.01242,0.01355,0.000123,0.000375,...,0.06531,0.000143,1.378,0.002397,0,0,0.00547,3,5033,50:23
5,2011-05-01 05:44:05,2011-05-01 05:45:00,2,4,4,0.01056,0.01036,0.01155,0.000117,0.000368,...,0.06995,0.000215,1.648,0.003659,0,0,0.005974,4,5044,50:24


In [119]:
# Creating key column (task-level key for usage rows)
#
# Key = last TWO characters of the `first` timestamp string, REVERSED
#       + jobID + taskIndex, joined without a separator.
# Built with vectorised string operations instead of a Python loop over every
# usage row; the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only two
# timestamp characters are used, so different rows can yield the same key.

time_suffix = task_usage['first'].astype(str).str[::-1].str[:2]
task_usage['primary_task_id'] = (
    time_suffix
    + task_usage['jobID'].astype(str)
    + task_usage['taskIndex'].astype(str)
)
task_usage.head()

Unnamed: 0,first,last,jobID,taskIndex,machineID,cpuMeanUsage,canonicalMemUsage,assignedMemUsage,unmappedCacheMemUsage,totalCacheMemUsage,...,cpuMaxUsage,maxDiskTime,cyclesPerInstruction,memAccessPerInstruction,samplePortion,aggType,cpuSampledUsage,processID,primary_usage_id,primary_task_id
0,2011-05-01 05:44:05,2011-05-01 05:45:00,0,0,0,0.04895,0.06689,0.07715,0.004684,0.005333,...,0.07971,0.002571,1.222,0.001147,0,0,0.03986,0,5000,5000
2,2011-05-01 05:44:05,2011-05-01 05:45:00,1,1,1,0.000533,0.04016,0.04266,0.000165,0.000305,...,0.003906,0.001215,3.805,0.0113,0,0,0.000468,1,5011,5011
3,2011-05-01 05:44:05,2011-05-01 05:45:00,2,2,2,0.03168,0.00647,0.008163,0.000302,0.000536,...,0.08252,0.001429,1.286,0.001875,0,0,0.08044,2,5022,5022
4,2011-05-01 05:44:05,2011-05-01 05:45:00,2,3,3,0.007507,0.01242,0.01355,0.000123,0.000375,...,0.06531,0.000143,1.378,0.002397,0,0,0.00547,3,5033,5023
5,2011-05-01 05:44:05,2011-05-01 05:45:00,2,4,4,0.01056,0.01036,0.01155,0.000117,0.000368,...,0.06995,0.000215,1.648,0.003659,0,0,0.005974,4,5044,5024


In [None]:
#
# Categorise jobID, machineID, taskIndex (skipped as this takes a long time to run)
#
# NOTE(review): everything below is a bare string literal, so none of it runs.
# The quoted statements repeat the task_usage factorisation performed in an
# earlier cell of this notebook - consider deleting this cell.
'''
factorised_jobids_usage, original_jobids_usage = pd.factorize(task_usage['jobID'])
factorised_machineid_usage, original_machineid_usage = pd.factorize(task_usage['machineID'])
factorised_taskid_usage, original_taskid_usage = pd.factorize(task_usage['taskIndex'])

task_usage['jobID'] = factorised_jobids_usage
task_usage['machineID'] = factorised_machineid_usage
task_usage['taskIndex'] = factorised_taskid_usage

task_usage.head()
'''

# Machine Events

In [120]:
# Preview the raw machine events; the columns are still labelled by position
machine_events.head()

Unnamed: 0,0,1,2,3,4,5
0,0,5,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493
1,0,6,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493
2,0,7,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493
3,0,10,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493
4,0,13,0,HofLGzk1Or/8Ildj2+Lqv0UGGvY82NLoni8+J/Yy0RU=,0.5,0.2493


In [121]:
# Name the six machine-event columns (in file order)
machine_event_columns = [
    'time',
    'machineID',
    'eventType',
    'platformID',
    'capacityCPU',
    'capacityMem',
]
machine_events.columns = machine_event_columns

# Replace the platformID strings with integer codes (first appearance order)
platform_codes, platform_labels = machine_events['platformID'].factorize()
machine_events['platformID'] = platform_codes

# Convert the microsecond offset in `time` to a timestamp
machine_events['time'] = pd.to_datetime(machine_events['time'], unit='us', origin='2011-05-01')
machine_events.head()

Unnamed: 0,time,machineID,eventType,platformID,capacityCPU,capacityMem
0,2011-05-01,5,0,0,0.5,0.2493
1,2011-05-01,6,0,0,0.5,0.2493
2,2011-05-01,7,0,0,0.5,0.2493
3,2011-05-01,10,0,0,0.5,0.2493
4,2011-05-01,13,0,0,0.5,0.2493


In [122]:
# Creating key column (machine-level key for machine events)
#
# Key = last three characters of the timestamp string, REVERSED
#       + machineID + eventType, joined without a separator.
# Built with vectorised string operations instead of a per-row Python loop;
# the resulting values are identical.
# NOTE(review): the parts are neither padded nor delimited and only three
# timestamp characters are used, so different rows can yield the same key.

time_suffix = machine_events['time'].astype(str).str[::-1].str[:3]
machine_events['primary_machine_id'] = (
    time_suffix
    + machine_events['machineID'].astype(str)
    + machine_events['eventType'].astype(str)
)
machine_events.head()

Unnamed: 0,time,machineID,eventType,platformID,capacityCPU,capacityMem,primary_machine_id
0,2011-05-01,5,0,0,0.5,0.2493,50
1,2011-05-01,6,0,0,0.5,0.2493,60
2,2011-05-01,7,0,0,0.5,0.2493,70
3,2011-05-01,10,0,0,0.5,0.2493,100
4,2011-05-01,13,0,0,0.5,0.2493,130


In [128]:
# Convert every string key column to a 64-bit integer.
# NOTE(review): the keys start with timestamp digits, so a key that begins with
# '0' loses its leading zeros here and two different strings can become the
# same integer.
for frame, key_column in (
    (job_events, 'primary_job_id'),
    (task_events_clean, 'primary_task_id'),
    (task_events_clean, 'primary_machine_id'),
    (task_events_clean, 'primary_job_id'),
    (task_usage, 'primary_usage_id'),
    (task_usage, 'primary_task_id'),
    (machine_events, 'primary_machine_id'),
):
    frame[key_column] = frame[key_column].astype(np.int64)

# Output

In [52]:
# Write the processed tables as CSV files into the current working directory
# (row labels are not written).
job_events.to_csv('job-events.csv', index=False)
task_events.to_csv('task-events.csv', index=False)
# task_events_clean is the only task-events frame that carries the primary_*
# key columns, yet it was never saved. It goes to its own file so the contents
# of task-events.csv stay exactly as before.
# NOTE(review): confirm the file name with whoever consumes these outputs.
task_events_clean.to_csv('task-events-clean.csv', index=False)
task_usage.to_csv('task-usage.csv', index=False)
machine_events.to_csv('machine-events.csv', index=False)