In [1]:
import tensorflow as tf
import glob
import os
import numpy as np

In [2]:
#data preparation
class Data:
    def __init__(self,source="/C_bytecode_analysis/output", seq_len=8):
        self.seq_len=seq_len
        self.home=os.getenv("HOME")
        self.source=source
        self.data=[]
        self.keywords=["FOR","CALL","IF","ELSE","WHILE","DO","SWITCH","FUNCTION"]
        self.number_of_lines=0
        self.data_label_dict={}
        for x in self.keywords:
            self.data_label_dict[x]=[]
        offset=0
        for filename in glob.glob(os.path.join(self.home+self.source,'*.labeled_addresses')):
            #print (filename)
            with open(filename) as f:
                lines = f.readlines()
                for line in lines:
                    line=line.strip().split("\t")
                    if len(line)==3:
                        func_name=line[0]
                        classes=line[1]
                        number_of_lines=int(line[2])
                        label_list=[]
                        offset=len(self.data)
                        #print (func_name, classes, number_of_lines)
                    else:
                        #if len(line)<2:
                        #    continue
                        #address = line[0]
                        bytecode = line[1].strip()
                        #instruction = line[2].strip()
                        #source_line = int(line[3])
                        #source_file = line[4]
                        labels = line[5:]
                        label_list.append(labels)
                        #print (address,bytecode,instruction,source_line,source_file,labels)
                        indexes= (str(bytecode))[2:4]
                        if len(bytecode)<=10:
                            indexes+=str(bytecode)[4:min(len(bytecode),10)]
                            indexes+="g"*(10-len(bytecode))
                        else:
                            indexes+=(str(bytecode))[4:10]
                        new=[]
                        for x in indexes:
                                pom=[0]*16
                                if x!="g":
                                    pom[int(x,16)]=1
                                new.extend(pom)
                        self.data.append(np.array(new))
                        number_of_lines-=1
                        if number_of_lines==0:
                            cur=[]
                            counter={}
                            start_dict={}
                            i=-1
                            for labels in label_list:
                                i+=1
                                for label in labels:
                                    if label not in cur:
                                        cur.append(label)
                                        counter[label]=1
                                        start_dict[label]=i
                                    else:
                                        counter[label]+=1
                            for label in counter.keys():
                                self.data_label_dict[label.split("_")[0]].append((start_dict[label]+offset,counter[label]))
            #break
        self.data=np.array(self.data)
        self.labels_with_length=[]
        for x in self.data_label_dict.keys():
            cl=self.keywords.index(x)
            for y in self.data_label_dict[x]:
                self.labels_with_length.append([cl,y[0],y[1]])
        self.labels_with_length=np.array(self.labels_with_length)
                
    def make_batches(self, size=50):
        perm=np.random.permutation(len(self.labels_with_length))
        self.y_l_batches=[]
        pom=self.labels_with_length[perm]
        for i in range(int(len(perm)/size)):
            self.y_l_batches.append(pom[i*size:i*size+size])
        self.y_l_batches=np.array(self.y_l_batches)
        self.num_batches=len(self.y_l_batches)
        self.current_batch_index=0
        self.size=size
        
    def get_batch(self):
        seq_len=self.seq_len
        y_l_batch=self.y_l_batches[self.current_batch_index]
        pom=[self.data[i[1]:i[1]+i[2]] for i in y_l_batch]
        for i in range(len(pom)):
            if len(pom[i])>=seq_len*2:
                #print (len(pom[i]))
                pom[i]=np.vstack((pom[i][:seq_len],pom[i][-seq_len:]))
                #print (len(pom[i]))
                #print ("----")
            else:
                pom[i]=np.vstack([pom[i],np.array([np.zeros(128) for j in range(seq_len*2-len(pom[i]))])])
        x=np.array(pom)
        pom=np.array(y_l_batch[:,0])
        y=np.eye(8)[pom]
        self.current_batch_index+=1
        if self.current_batch_index==self.num_batches:
            self.make_batches()
        return x,y

In [3]:
class ConvNet:
    def __init__(self,data):
        self.data=data
    @staticmethod
    def weight_variable(shape):
      initial = tf.truncated_normal(shape, stddev=0.1)
      return tf.Variable(initial)
    
    @staticmethod
    def bias_variable(shape):
      initial = tf.constant(0.1, shape=shape)
      return tf.Variable(initial)
    
    @staticmethod
    def conv2d(x, W):
      return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='VALID')

    @staticmethod
    def max_pool_shape(x, shape=[1,1,2,1]):
      return tf.nn.max_pool(x, ksize=shape,
                            strides=shape, padding='VALID')

    def convolution_graph(self,conv_shape=[[5,5,1,16],[5,5,16,32]],fc_shape=[512,8], param_lambda=0.01, seq_len=8):
        
        self.x = tf.placeholder(tf.float32, shape=[None, seq_len*2, 128], name="x")
        self.y_ = tf.placeholder(tf.float32, shape=[None, 8], name="y_")
        self.W_conv=[]
        self.b_conv=[]
        self.h_conv=[]
        self.h_pool=[]
        
        self.x_input= tf.reshape(self.x, [-1, 128, seq_len*2, 1])
        print (self.x_input.shape)
        self.layer_input=[]
        self.layer_input.append(self.x_input)
        #print (self.layer_input[-1].shape)
        
        for i in range(len(conv_shape)):
            self.W_conv.append(self.weight_variable(conv_shape[i]))
            self.b_conv.append(self.bias_variable([conv_shape[i][-1]]))

            self.h_conv.append(tf.nn.relu(self.conv2d(self.layer_input[-1], self.W_conv[-1]) + self.b_conv[-1]))
            self.layer_input.append(self.max_pool_shape(self.h_conv[-1]))
            #print (self.layer_input[-1].shape)
        
        fc_input_shape=[]
        a=1
        for b in self.layer_input[-1].shape[1:]:
            a*=int(b)
        fc_input_shape.append(a)
        
        self.W_fc=[]
        self.b_fc=[]
        self.h_pool_flat=[]
        
        
        for i in range(len(fc_shape[:-1])):
            
            self.W_fc.append(self.weight_variable([fc_input_shape[-1], fc_shape[i]]))
            self.b_fc.append(self.bias_variable([fc_shape[i]]))

            self.h_pool_flat.append(tf.reshape(self.layer_input[-1], [-1, fc_input_shape[-1]]))
            self.layer_input.append(tf.nn.relu(tf.matmul(self.h_pool_flat[-1], self.W_fc[-1]) + self.b_fc[-1]))
            
            fc_input_shape.append(fc_shape[i])
            #self.keep_prob = tf.placeholder(tf.float32)
            #self.h_fc1_drop = tf.nn.dropout(self.h_fc1, self.keep_prob)

        self.W_fc.append(self.weight_variable([fc_input_shape[-1], fc_shape[-1]]))
        self.b_fc.append(self.bias_variable([fc_shape[-1]]))
        self.y_conv = tf.matmul(self.layer_input[-1], self.W_fc[-1]) + self.b_fc[-1]

        self.l2=sum([param_lambda*tf.nn.l2_loss(x) for pom in [self.W_conv,self.W_fc] for x in pom])

        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=self.y_, logits=self.y_conv))+self.l2
        self.train_step = tf.train.AdamOptimizer(1e-4).minimize(self.cross_entropy)
        self.correct_prediction = tf.equal(tf.argmax(self.y_conv, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))

    def train(self):
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            self.data.make_batches()
            for i in range(100*self.data.num_batches):
                batch = self.data.get_batch()
                if i%self.data.num_batches==0:
                    acc= self.accuracy.eval(feed_dict={self.x: batch[0], self.y_: batch[1]})
                    print (acc)
                sess.run(self.train_step,feed_dict={self.x: batch[0], self.y_: batch[1]})

In [4]:
for x in data.data_label_dict.keys():
    print (x,len(data.data_label_dict[x]))
print (data.data.shape)

NameError: name 'data' is not defined

In [5]:
convo_network=ConvNet(Data())
convo_network.convolution_graph(conv_shape=[[128,5,1,8]],fc_shape=[10,8], param_lambda=0.01)
convo_network.train()

(?, 128, 16, 1)
0.2
0.66
0.74
0.7
0.88


KeyboardInterrupt: 