In [1]:
import os
import numpy as np
import csv
import pandas as pd
import itertools
from random import shuffle

In [2]:
# Directory holding the per-program feature CSVs (input).
data_folder = 'feature_output_new_mark1_2k'
# Directory that receives the parsed .npy arrays (output).
output_folder = 'path_data'
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)
# os.listdir returns every entry in the directory (sub-directories, hidden
# files, ...). parse_data opens each entry and strips a 4-character extension,
# so keep only the .csv feature files.
data_files = [f for f in os.listdir(data_folder) if f.endswith('.csv')]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(data_files)

['gromacs.csv', 'hmmer.csv', 'mcf.csv', 'sphinx3.csv', 'soplex-fix.csv', 'milc.csv', 'gobmk.csv', 'gemsfdtd.csv', 'leslie3d.csv', 'gmm.csv', 'h264.csv', 'povray-fix.csv', 'libquantum.csv', 'gcc.csv', 'stemmer.csv', 'lbm.csv', 'astar.csv', 'namd-fix.csv']


In [3]:
# Maximum number of basic-block vectors stored per path: parse_data skips any
# vectors beyond this limit and leaves shorter paths zero-padded.
# (Original note: the average sequence is about length 30.)
maxSeqLength = 50
# Dimension of each opcode vector; also the size of the last axis of the
# arrays allocated in parse_data.
numDimensions = 100

In [4]:
def _group_paths(lines, max_len):
    """Group raw feature-file lines into one record per path.

    A line containing a space is treated as a path header: its third
    space-separated field is parsed as an integer, and the path is "hot" when
    that value is positive. Subsequent lines containing a comma are that
    path's basic-block vectors; at most ``max_len`` of them are kept.
    Lines with neither a space nor a comma (e.g. blank lines) are ignored, as
    are vector lines that appear before the first header.

    Returns a list of ``(is_hot, vector_lines)`` tuples in file order.
    """
    paths = []
    vectors = None  # vector lines of the path currently being read
    for line in lines:
        if ' ' in line:
            vectors = []
            paths.append((int(line.split(' ')[2]) > 0, vectors))
        elif ',' in line and vectors is not None and len(vectors) < max_len:
            vectors.append(line)
    return paths


def parse_data(data_files, in_dir=None, out_dir=None, max_len=None, num_dims=None):
    """Convert feature files into zero-padded hot/cold path arrays.

    For every file, two arrays of shape ``(num_hot, max_len, num_dims)`` are
    written to ``out_dir`` as ``data_hot_<name>.npy`` and
    ``data_cold_<name>.npy``, where ``<name>`` is the file name without its
    extension and ``num_hot`` is the number of hot paths in the file.

    Paths are shuffled as whole units (header together with its vectors), so
    every vector stays attached to the path it belongs to. The cold array has
    as many rows as the hot one: only the first ``num_hot`` cold paths in
    shuffled order are stored, and if a file has fewer cold than hot paths the
    remaining cold rows stay all-zero.

    A file is skipped when both of its output arrays already exist.

    Parameters
    ----------
    data_files : iterable of str
        File names relative to ``in_dir``.
    in_dir, out_dir : str, optional
        Input / output directories. Default to the notebook-level
        ``data_folder`` / ``output_folder``. ``out_dir`` is created if missing.
    max_len : int, optional
        Maximum number of vectors kept per path. Defaults to ``maxSeqLength``.
    num_dims : int, optional
        Length of each vector. Defaults to ``numDimensions``.

    Raises
    ------
    IndexError, ValueError
        If a header line has fewer than three fields, a field cannot be
        parsed as a number, or a vector does not have ``num_dims`` entries.
    """
    # Fall back to the notebook configuration for anything not passed in.
    if in_dir is None:
        in_dir = data_folder
    if out_dir is None:
        out_dir = output_folder
    if max_len is None:
        max_len = maxSeqLength
    if num_dims is None:
        num_dims = numDimensions

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for data_file in data_files:
        name = os.path.splitext(data_file)[0]
        print("Parsing " + name)

        hot_out = os.path.join(out_dir, 'data_hot_' + name + '.npy')
        cold_out = os.path.join(out_dir, 'data_cold_' + name + '.npy')
        # Require both outputs: a run interrupted between the two saves must
        # not leave the cold array permanently missing.
        if os.path.exists(hot_out) and os.path.exists(cold_out):
            continue

        with open(os.path.join(in_dir, data_file)) as f:
            paths = _group_paths(f, max_len)
        # Shuffle whole paths, not raw lines, so headers keep their vectors.
        shuffle(paths)

        num_total = len(paths)
        num_hot = sum(1 for is_hot, _ in paths if is_hot)

        # Allocate arrays for hot and cold paths (cold is capped at num_hot rows).
        data_hot = np.zeros((num_hot, max_len, num_dims))
        data_cold = np.zeros((num_hot, max_len, num_dims))

        hot_idx = cold_idx = 0
        for is_hot, vector_lines in paths:
            if is_hot:
                target = data_hot[hot_idx]
                hot_idx += 1
            elif cold_idx < num_hot:
                target = data_cold[cold_idx]
                cold_idx += 1
            else:
                # Surplus cold path: dropped to keep the classes balanced.
                continue
            for k, line in enumerate(vector_lines):
                target[k] = [float(val) for val in line.split(',')]

        # Number of Hot and Cold Paths
        print("Num Hot Paths: " + str(num_hot))
        print("Num Cold Paths: " + str(num_total - num_hot))

        # Save Results
        np.save(hot_out, data_hot)
        np.save(cold_out, data_cold)

In [6]:
# Parse every feature file listed in data_files and save its hot/cold path
# arrays under output_folder; files whose output was already saved are skipped.
parse_data(data_files)

Parsing gromacs
Num Hot Paths: 1303
Num Cold Paths: 75612
Parsing hmmer
Num Hot Paths: 433
Num Cold Paths: 35646
Parsing mcf
Num Hot Paths: 603
Num Cold Paths: 3891
Parsing sphinx3
Num Hot Paths: 1549
Num Cold Paths: 74560
Parsing soplex-fix
Num Hot Paths: 271
Num Cold Paths: 17728
Parsing milc
Num Hot Paths: 1055
Num Cold Paths: 23343
Parsing gobmk
Num Hot Paths: 47407
Num Cold Paths: 215409
Parsing gemsfdtd
Num Hot Paths: 2523
Num Cold Paths: 60728
Parsing leslie3d
Num Hot Paths: 738
Num Cold Paths: 22579
Parsing gmm
Num Hot Paths: 90
Num Cold Paths: 3203
Parsing h264
Num Hot Paths: 3105
Num Cold Paths: 190972
Parsing povray-fix
Num Hot Paths: 3260
Num Cold Paths: 231072
Parsing libquantum
Num Hot Paths: 254
Num Cold Paths: 6101
Parsing gcc
Num Hot Paths: 33450
Num Cold Paths: 1104291
Parsing stemmer
Num Hot Paths: 927
Num Cold Paths: 4167
Parsing lbm
Num Hot Paths: 73
Num Cold Paths: 200
Parsing astar
Num Hot Paths: 7384
Num Cold Paths: 14791
Parsing namd-fix
Num Hot Paths: 1749
Num