# Deep Learning Architecture #1
## Embedding->Dense

In [None]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import os
import csv 
import sys
import re
import scipy
import numpy as np

# Raise the csv module's maximum allowed field size to sys.maxsize so that
# very large fields are accepted by the csv parser.
csv.field_size_limit(sys.maxsize)

In [None]:
def process_content(file_path):
    """Read a source file and return its text without ``// isComment`` lines.

    Every line that starts (after optional whitespace) with ``//``, optional
    whitespace and the literal ``isComment`` is dropped. The remaining lines
    keep their trailing newlines and are joined with a single space.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The filtered file content as one string ("" for an empty file).
    """
    # Raw string: "\s" and "\/" inside a normal literal are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    comment_marker = re.compile(r"\s*//\s*isComment")
    with open(file_path, "r") as file:
        # Iterate the file lazily instead of materialising it via readlines().
        source_code = [line for line in file if not comment_marker.match(line)]
    return ' '.join(source_code)

In [None]:
def find_vocab_size(trainset):
    """Collect token frequencies and the longest document length.

    Each instance is tokenised with ``str.split()`` (whitespace separated).

    Args:
        trainset: Iterable of strings, one per document.

    Returns:
        A ``(vocab, max_length)`` tuple where ``vocab`` maps every token to
        the number of times it occurs across all instances and ``max_length``
        is the largest token count of any single instance (0 if there are no
        instances).
    """
    vocab = {}
    max_length = 0
    for instance in trainset:
        tokens = instance.split()
        max_length = max(max_length, len(tokens))

        for token in tokens:
            # The first occurrence counts as 1 (it was previously stored as
            # 0, which made every frequency one too low).
            vocab[token] = vocab.get(token, 0) + 1

    return vocab, max_length

In [None]:
# Load the pickled instances. Each row provides a "source_code" value, which
# process_content opens as a file path, and a "target" class label.
df = pd.read_pickle('../data/instances.pkl')

# unique() preserves first-appearance order, so the label -> index mapping is
# reproducible between runs (iteration order of a set of strings is not).
labels = list(df['target'].unique())
label_to_index = {label: position for position, label in enumerate(labels)}
num_classes = len(labels)

print("Preparing lists...")
documents = []
targets = []
for _, row in df.iterrows():
    documents.append(process_content(row["source_code"]))
    targets.append(label_to_index[row["target"]])

# Hold out 20% of the instances for the final evaluation. Previously the
# test lists were never filled, so the model was evaluated on nothing.
shuffled = np.random.RandomState(42).permutation(len(documents))
n_test = int(len(documents) * 0.2)
test_idx, train_idx = shuffled[:n_test], shuffled[n_test:]

X_train = [documents[i] for i in train_idx]
X_test = [documents[i] for i in test_idx]  # split into train and test
# categorical_crossentropy expects one-hot encoded targets.
Y_train = to_categorical([targets[i] for i in train_idx], num_classes=num_classes)
Y_test = to_categorical([targets[i] for i in test_idx], num_classes=num_classes)

# The vocabulary is derived from the training documents only.
vocab, max_length = find_vocab_size(X_train)

# NOTE(review): one_hot applies its own tokenisation and hashing, so encoded
# documents can differ in length from the whitespace-split max_length and
# distinct tokens can collide -- confirm this is acceptable.
vocab_size = len(vocab) + 100
encoded_docs = [one_hot(d, vocab_size) for d in X_train]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

encoded_docs_test = [one_hot(d, vocab_size) for d in X_test]
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

model1 = Sequential()
model1.add(Embedding(vocab_size, 128, input_length=max_length))
model1.add(Flatten())
model1.add(Dense(32, activation='relu'))
# One softmax unit per class: a single softmax unit always outputs 1.0 and
# therefore cannot learn anything.
model1.add(Dense(num_classes, activation='softmax'))
# compile the model
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summarize the model
print('################# Model 1 ###############')
# summary() prints by itself; wrapping it in print() also emitted "None".
model1.summary()

model1.fit(padded_docs, Y_train, epochs=5, verbose=1, validation_split=0.2)
# evaluate the model
if len(X_test) > 0:
    loss, accuracy = model1.evaluate(padded_docs_test, Y_test, verbose=0)
    print('Accuracy: %f' % (accuracy*100))
else:
    print('Not enough instances to hold out a test set; skipping evaluation.')

In [None]:
# Model.save() requires a target path; calling it without one raises a
# TypeError. Persist the trained model next to the notebook.
model1.save('model1.h5')