In [1]:
# Imports

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import os
from os import listdir
from os.path import isfile, join
from random import randrange
import re
import tensorflow as tf

# Loading Data 

In [2]:
# Average sequence is about length 30
maxSeqLength = 50
# Dimension of each opcode vector
numDimensions = 100
# Size of batch for sequence learner
batchSize = 24

In [3]:
data_folder = 'feature_output_new_mark1_2k'
output_folder = 'path_data'
if not(os.path.exists(output_folder)):
    os.mkdir(output_folder)
data_files = [f[:-4] for f in os.listdir(data_folder)]
print data_files

['gromacs', 'hmmer', 'mcf', 'sphinx3', 'soplex-fix', 'milc', 'gobmk', 'gemsfdtd', 'leslie3d', 'gmm', 'h264', 'povray-fix', 'libquantum', 'gcc', 'stemmer', 'lbm', 'astar', 'namd-fix']


In [4]:
hot_data = []
cold_data = []
for data_file in data_files:
    print "Loading " + data_file + " paths"
    data_hot = np.load(output_folder + '/data_hot_' + data_file + '.npy')
    data_cold = np.load(output_folder + '/data_cold_' + data_file + '.npy')
    hot_data.append(data_hot)
    cold_data.append(data_cold)

Loading gromacs paths
Loading hmmer paths
Loading mcf paths
Loading sphinx3 paths
Loading soplex-fix paths
Loading milc paths
Loading gobmk paths
Loading gemsfdtd paths
Loading leslie3d paths
Loading gmm paths
Loading h264 paths
Loading povray-fix paths
Loading libquantum paths
Loading gcc paths
Loading stemmer paths
Loading lbm paths
Loading astar paths
Loading namd-fix paths


# Helper Functions

In [5]:
def getTrainBatch(test_program):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength, numDimensions])
    i = 0
    while(i < batchSize):
        # Choose program that is not being tested
        program = test_program
        while(program == test_program):
            program = randrange(len(data_files))
        num = randrange(len(hot_data[program]))
        if (i % 2 == 0):
            try:
                arr[i] = hot_data[program][num-1:num]
                labels.append([1,0])
            except ValueError:
                continue
        else:
            try:
                arr[i] = cold_data[program][num-1:num]
                labels.append([0,1])
            except ValueError:
                continue
        i += 1
    return arr, labels

def getTestBatch(test_program):
    labels = []
    arr = np.zeros([batchSize, maxSeqLength, numDimensions])
    i = 0
    while(i < batchSize):
        num = randrange(len(hot_data[test_program]))
        if (i % 2 == 0):
            try:
                arr[i] = hot_data[test_program][num-1:num]
                labels.append([1,0])
            except ValueError:
                continue                       
        else:
            try:
                arr[i] = cold_data[test_program][num-1:num]
                labels.append([0,1])
            except ValueError:
                continue
        i += 1
    return arr, labels

# LSTM Model

In [162]:
batchSize = 24
lstmUnits = 256
numClasses = 2
iterations = 150001

In [88]:
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.float32, [batchSize, maxSeqLength, numDimensions])
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
data = tf.assign(data, input_data)

In [89]:
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

In [90]:
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

In [91]:
softmax = tf.nn.softmax(prediction)
predictedLabels = tf.argmax(prediction,1)
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))

In [92]:
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

In [94]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)

# Training

In [160]:
def train_learner(test_program):

    
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    
    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    
    logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)   
    saver = tf.train.Saver()
    
    sess.run(tf.global_variables_initializer())

    for i in range(iterations):
       #Next Batch of reviews
       nextBatch, nextBatchLabels = getTrainBatch(test_program);
       sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})
       #Write summary to Tensorboard
       if (i % 50 == 0):
           summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
           writer.add_summary(summary, i)

       #Save the network every 10,000 training iterations
       if (i % 10000 == 0 and i != 0):
           if not(os.path.exists("gru_models")):
               os.mkdir("gru_models") 
           save_path = saver.save(sess, "lstm_models/" + data_files[test_program] + "_model.ckpt", global_step=i)
           print("saved to %s" % save_path)
    writer.close()

In [161]:
train_learner(4)

saved to gru_models/soplex-fix_model.ckpt-10000
saved to gru_models/soplex-fix_model.ckpt-20000
saved to gru_models/soplex-fix_model.ckpt-30000
saved to gru_models/soplex-fix_model.ckpt-40000
saved to gru_models/soplex-fix_model.ckpt-50000
saved to gru_models/soplex-fix_model.ckpt-60000
saved to gru_models/soplex-fix_model.ckpt-70000
saved to gru_models/soplex-fix_model.ckpt-80000
saved to gru_models/soplex-fix_model.ckpt-90000
saved to gru_models/soplex-fix_model.ckpt-100000
saved to gru_models/soplex-fix_model.ckpt-110000
saved to gru_models/soplex-fix_model.ckpt-120000
saved to gru_models/soplex-fix_model.ckpt-130000
saved to gru_models/soplex-fix_model.ckpt-140000
saved to gru_models/soplex-fix_model.ckpt-150000


KeyboardInterrupt: 

# Results

In [171]:
def test_learner(test_program):
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('lstm_models'))
    
    iterations = 30
    accuracy_average = 0
    precision_average = 0
    recall_average = 0
    for i in range(iterations):
        nextBatch, nextBatchLabels = getTestBatch(test_program);
        accuracy_average += (sess.run(accuracy, {data: nextBatch,
                                         labels: nextBatchLabels}))
        predicted_labels = (sess.run(predictedLabels, {data: nextBatch,
                                     labels: nextBatchLabels}))
        true_positive_total = sum(predicted_labels[0::2])
        false_positive_total = sum(predicted_labels[1::2])
        false_negative_total = batchSize / 2 - sum(predicted_labels[0::2])
        precision_average += 1 - float(true_positive_total) / (true_positive_total + false_positive_total)
        recall_average += 1 - float(true_positive_total) / (true_positive_total + false_negative_total)
    print "Accuracy: " + str(accuracy_average / iterations)
    precision_val = precision_average / iterations
    recall_val = recall_average / iterations
    print precision_val
    print recall_val
    f_score = 2 * (precision_val * recall_val) / (precision_val +  recall_val)
    print "F-Score: " + str(f_score)

In [183]:
test_learner(4)

INFO:tensorflow:Restoring parameters from gru_models/soplex-fix_model.ckpt-150000


INFO:tensorflow:Restoring parameters from gru_models/soplex-fix_model.ckpt-150000


Accuracy: 0.941666660706
0.926330891331
0.919444444444
F-Score: 0.922874821484
