# Deep Learning Arch #2
## Embedding->CNN->LSTM->Dense

In [1]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import os
import csv 
import sys
import re
import scipy
import numpy as np

# Raise the csv module's per-field size cap to sys.maxsize so very large fields
# can be parsed. The call returns the limit that was in effect before the
# change, which is the value shown as this cell's output.
csv.field_size_limit(sys.maxsize)

Using TensorFlow backend.


131072

In [2]:
def process_content(file_path):
    """Read a source file and return its text with marker-comment lines removed.

    A line is dropped when, after optional leading whitespace, it starts with
    ``//`` followed by optional whitespace and the literal ``isComment``.
    Every other line is kept unchanged (including its trailing newline) and
    the kept lines are joined with a single space.

    Args:
        file_path: Path of the text file to read. The file is opened in text
            mode with the platform's default encoding.

    Returns:
        str: The kept lines concatenated with ``' '`` between them; an empty
        string when no line is kept.
    """
    # Raw string on purpose: "\s" and "\/" are invalid escape sequences in a
    # regular string literal and trigger a warning on recent Python versions.
    # Inside a regex "/" needs no escaping, so the pattern matches the same lines.
    comment_marker = re.compile(r"\s*//\s*isComment")
    with open(file_path, "r") as file:
        # Iterate the handle directly instead of materialising every line first.
        source_code = [line for line in file if not comment_marker.match(line)]
    return ' '.join(source_code)

In [3]:
def find_vocab_size(trainset):
    """Count whitespace-separated tokens and find the longest document.

    Args:
        trainset: Iterable of strings; each one is tokenised with
            ``str.split()`` (runs of whitespace act as a single separator).

    Returns:
        tuple: ``(vocab, max_length)`` where ``vocab`` is a dict mapping each
        distinct token to the number of times it occurs across all documents,
        and ``max_length`` is the largest token count of any single document
        (``0`` when ``trainset`` is empty).
    """
    vocab = {}
    max_length = 0
    for instance in trainset:
        tokens = instance.split()
        max_length = max(max_length, len(tokens))

        for token in tokens:
            # A token's first occurrence counts as 1, so the stored value is
            # the true frequency rather than "occurrences minus one".
            vocab[token] = vocab.get(token, 0) + 1

    return vocab, max_length

In [4]:
# Load the instance table. NOTE: unpickling can execute arbitrary code, so this
# file must come from a trusted source.
df = pd.read_pickle('../../data/instances.pkl')
# Distinct target values found in the data (order is not defined for a set).
labels = list(set(df['target'].values))

print("Preparing lists...")
# Read the two columns directly instead of using df.iterrows(), which builds a
# Series object for every row. Each stored path is resolved one directory up
# from the working directory before the file is read and comment-filtered.
X = [process_content("../" + source_path) for source_path in df["source_code"]]
Y = df["target"].tolist()

Preparing lists...


In [5]:
# Map each class label to an integer id, then expand the ids to one-hot rows.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
y_binary = to_categorical(encoded_Y)

# Hold out 25% of the documents for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_binary,
    train_size=0.75,
    random_state=42,
)



In [None]:
# Vocabulary statistics are computed over the full corpus (train and test
# documents together), using whitespace tokenisation.
vocab, max_length = find_vocab_size(X)

# Size of the index space handed to one_hot and the Embedding layer.
# The extra 100 is presumably headroom on top of the observed vocabulary —
# TODO confirm the intent of this constant.
vocab_size = len(vocab) + 100

# NOTE(review): one_hot applies Keras' own text filters/tokeniser, which may
# produce a different token count per document than the str.split() count
# behind max_length; pad_sequences truncates anything longer than maxlen —
# confirm that no document is being cut.
# NOTE(review): if one_hot relies on Python's built-in hash(), word indices are
# only stable within one kernel session — confirm before persisting the model.
encoded_docs_train = [one_hot(d, vocab_size) for d in X_train]
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')

encoded_docs_test = [one_hot(d, vocab_size) for d in X_test]
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

# Embedding -> Dropout -> Conv1D -> MaxPooling1D -> LSTM -> Dense(softmax)
model2 = Sequential()
model2.add(Embedding(vocab_size, 128, input_length=max_length))
model2.add(Dropout(0.2))
model2.add(Conv1D(64, 5, activation='relu'))
model2.add(MaxPooling1D(pool_size=4))
model2.add(LSTM(100))
# One output unit per class: the width of the one-hot label matrix.
model2.add(Dense(y_binary.shape[1], activation='softmax'))
# compile the model
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print('################# Model 2 ###############')
# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a trailing "None".
model2.summary()

# Train for a single epoch, with 20% of the training data held out for validation.
model2.fit(padded_docs_train, y_train, epochs=1, verbose=1, validation_split=0.2)
# evaluate the model
loss, accuracy = model2.evaluate(padded_docs_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

################# Model 2 ###############
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 28968, 128)        21359616  
_________________________________________________________________
dropout_2 (Dropout)          (None, 28968, 128)        0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 28964, 64)         41024     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 7241, 64)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_2 (Dense)              (None, 27)                2727      
Total params: 21,469,367
Trainable params: 21,469,367
Non-trainable params: 0
______________________