In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Name of the studied system; every input and output file name below is
# derived from it.
project = "Hadoop"

# Folder that receives the fold files written at the end of the notebook.
# NOTE(review): machine-specific absolute Windows path (and the literal ends
# its first piece with a doubled backslash) - kept exactly as-is; adjust when
# running on another machine.
output_dir = "".join([
    r"E:\IDEAWorkspace\autologguards\auto-guard\src\main\py\lrt\\",
    project,
    r"Data\folds5",
])

# Input files, resolved relative to the current working directory.
profile_data_file = "{}NewProfileStatSummary.xlsx".format(project)
metrics_data_file = "{}_Metrics_Data.txt".format(project)
instructions_data_file = "{}_Count_Of_Instructions.txt".format(project)
methodcalls_data_file = "{}_Count_Of_Method_Calls.txt".format(project)
methodfreq_data_file = "{}_Method_Frequency.txt".format(project)

In [3]:
# Profiling summary: the "summary" sheet of the Excel workbook, with its first
# column used as the row index.
profile_data = pd.read_excel(profile_data_file, sheet_name="summary", index_col=0)

# The four remaining inputs are tab-separated text files read the same way
# (first column becomes the row index).
metrics_data, instructions_data, methodcalls_data, methodfreq_data = (
    pd.read_csv(tsv_path, sep="\t", index_col=0)
    for tsv_path in (
        metrics_data_file,
        instructions_data_file,
        methodcalls_data_file,
        methodfreq_data_file,
    )
)

In [4]:
# methodcalls_data

In [5]:
# Place the five tables side by side, aligned on their row index (the
# LoggingID). join="inner" keeps only the index values present in every table.
joined_data = pd.concat(
    (profile_data, metrics_data, instructions_data, methodcalls_data, methodfreq_data),
    axis="columns",
    join="inner",
)

In [6]:
# Inspect which columns the joined table ended up with.
joined_data.columns

Index(['Time(ns)Count', 'Time(ns)Sum', 'Time(ns)Max', 'PotentialMethodCalls',
       'PotentialInstructions', 'Instructions', 'MethodCalls',
       'MethodFrequency'],
      dtype='object')

In [7]:
# Order the rows by Time(ns)Sum, smallest first (per the original note this
# column is the mean of Time(ns)Sum over multiple runs).
sorted_data = joined_data.sort_values(by="Time(ns)Sum", ascending=True)

In [8]:
# Number the rows 1..N in their current order and store that as ReverseRank;
# it is used as the relevance score for LearnToRank.
sorted_data["ReverseRank"] = 1 + np.arange(len(sorted_data))

In [9]:
# Shuffle data
# frac=1 samples every row exactly once, i.e. produces a random permutation.
# The fixed random_state makes that permutation - and therefore the folds that
# are cut from it - identical on every Restart & Run All. Change the seed to
# obtain a different split.
shuffled_data = sorted_data.sample(frac=1, random_state=42)

In [10]:
# Display the shuffled table as a visual check.
shuffled_data

Unnamed: 0,Time(ns)Count,Time(ns)Sum,Time(ns)Max,PotentialMethodCalls,PotentialInstructions,Instructions,MethodCalls,MethodFrequency,ReverseRank
dfc06fa1-f6bd,2081.3,1.07739e+08,5378165,756,6598,17,11,1879,155
b291db57-b14b,1,875974,1141683,690,5810,17,10,55,63
c3b46859-b6a5,1,109962,135925,30152,405188,19,13,21183,28
5085b5b4-92a7,91902.2,5.91128e+07,8926196,657,5423,20,15,26833,144
102289d4-ca7e,3502,1.05276e+09,29560797,292,2829,24,18,4021,186
...,...,...,...,...,...,...,...,...,...
9cb6fb0b-24d9,858.7,7.00798e+06,1448895,698,6004,17,11,928,105
24c9dba5-6722,84967.2,7.63651e+07,2600391,314,3082,22,12,701566,150
f7ccd965-9015,86,1.06828e+06,279802,328,3386,19,13,733668,70
2058911d-6abd,6,29322.2,23417,343,3373,20,15,6,12


In [11]:
# Persist the complete shuffled table (index included) as
# "<project>FullData.csv" in the current working directory.
shuffled_data.to_csv("{}FullData.csv".format(project))

In [12]:
# Number of folds. NOTE: output_dir ends in "folds5"; keep the two in sync.
N_FOLDS = 5

# Split the row *positions* into N_FOLDS nearly equal, contiguous chunks and
# select each chunk with .iloc. Calling np.array_split directly on the
# DataFrame goes through DataFrame.swapaxes, which newer pandas releases
# deprecate; splitting positions gives the same fold sizes and membership
# (the first len % N_FOLDS folds get one extra row) without that dependency.
fold_positions = np.array_split(np.arange(len(shuffled_data)), N_FOLDS)
splited_data = [shuffled_data.iloc[positions] for positions in fold_positions]

In [13]:
# Write every fold to "<output_dir>/fold<i>", one line per row:
#     <ReverseRank> pid:1 1:<f1> 2:<f2> ... 6:<f6> # <LoggingID>
# NOTE(review): the second token is spelled "pid"; SVMrank/LETOR-style files
# spell it "qid". Left unchanged - confirm against the tool that reads these.
fold_line = "{rel} pid:1 1:{PotentialMethodCalls} 2:{PotentialInstructions} 3:{Instructions} 4:{MethodCalls} 5:{Count} 6:{MethodFreq} # {LoggingID}\n"

# Create the target folder up front so open() does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)

for i, split in enumerate(splited_data):
    with open(os.path.join(output_dir, "fold"+str(i)), "w") as fold_file:
        # Walk the columns in parallel instead of using DataFrame.iterrows():
        # iterrows() packs each row into a single Series, which upcasts the
        # integer columns to float when the frame also holds float columns,
        # so labels/counts would be written as "155.0" rather than "155".
        # .tolist() yields native Python scalars with each column's own type.
        records = zip(
            split.index,
            split["ReverseRank"].tolist(),
            split["PotentialMethodCalls"].tolist(),
            split["PotentialInstructions"].tolist(),
            split["Instructions"].tolist(),
            split["MethodCalls"].tolist(),
            split["Time(ns)Count"].tolist(),
            split["MethodFrequency"].tolist(),
        )
        for logging_id, rel, potential_calls, potential_instrs, instrs, calls, count, freq in records:
            fold_file.write(fold_line.format(
                rel=rel,
                PotentialMethodCalls=potential_calls,
                PotentialInstructions=potential_instrs,
                Instructions=instrs,
                MethodCalls=calls,
                Count=count,
                MethodFreq=freq,
                LoggingID=logging_id,
            ))