# Deep Learning Arch #2
## Embedding->CNN->LSTM->Dense

In [None]:
from keras.layers import Embedding, Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import os
import csv 
import sys
import re
import scipy
import numpy as np

# Raise the csv module's per-field size cap as far as the platform allows
# (presumably needed because the data files carry very long fields — confirm).
# sys.maxsize does not fit in a C long on platforms where long is 32-bit
# (e.g. 64-bit Windows) and raises OverflowError there, so halve the value
# until the csv module accepts it.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

In [None]:
def process_content(content_list):
    """Flatten a serialized list of source lines into one string.

    Args:
        content_list: A string holding a Python list literal whose items
            are the individual source-line strings.

    Returns:
        The lines that are not marked as comments, joined by single spaces,
        with every newline character replaced by the token " newLine ".

    Raises:
        ValueError, SyntaxError: If content_list is not a valid Python
            literal.
    """
    # Function-scope import so this definition does not rely on the
    # file-level import cell being edited.
    import ast

    # literal_eval only accepts Python literals; plain eval() would execute
    # any expression embedded in the data file.
    lines = ast.literal_eval(content_list)

    # Lines starting with optional whitespace, "//", optional whitespace and
    # "isComment" are dropped. Raw string avoids invalid-escape warnings.
    comment_marker = re.compile(r"\s*//\s*isComment")

    return ' '.join(
        line.replace("\n", " newLine ")
        for line in lines
        if not comment_marker.match(line)
    )

In [None]:
def find_vocab_size(trainset):
    """Count token frequencies and find the longest instance of a corpus.

    Args:
        trainset: Iterable of strings; each one is tokenized with
            str.split() (whitespace-separated tokens).

    Returns:
        A tuple (vocab, max_length). vocab maps every token to the number
        of times it occurs across all instances; max_length is the largest
        token count of any single instance (0 when trainset is empty).
    """
    vocab = {}
    max_length = 0
    for instance in trainset:
        tokens = instance.split()
        max_length = max(max_length, len(tokens))

        for token in tokens:
            # The first sighting counts as 1 (it was previously stored as 0,
            # which made every frequency one too low).
            vocab[token] = vocab.get(token, 0) + 1

    return vocab, max_length

In [None]:
# Train and evaluate one Embedding -> Conv1D -> LSTM -> Dense model per label,
# each on its own train/test CSV pair.
labels = ["tmm", "lc", "dc", "lpl", "lm"]
for label in labels:
    print("===== {} ==============".format(label))
    print("Reading data...")
    df = pd.read_csv('../data/df/train_{}.csv'.format(label), engine="python")
    df_test = pd.read_csv('../data/df/test_{}.csv'.format(label), engine="python")

    print("Preparing lists...")
    # "content" holds the serialized source lines, "smells" the target value.
    X_train = [process_content(content) for content in df["content"]]
    Y_train = df["smells"].tolist()

    X_test = [process_content(content) for content in df_test["content"]]
    Y_test = df_test["smells"].tolist()

    # Vocabulary and maximum sequence length come from the training set only.
    vocab, max_length = find_vocab_size(X_train)

    # one_hot hashes words into the range given by vocab_size; the +100
    # headroom presumably reduces hash collisions — TODO confirm.
    vocab_size = len(vocab) + 100
    encoded_docs = [one_hot(d, vocab_size) for d in X_train]
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    # Test documents are encoded and padded with the training-set parameters.
    encoded_docs_test = [one_hot(d, vocab_size) for d in X_test]
    padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')

    model2 = Sequential()
    model2.add(Embedding(vocab_size, 128, input_length=max_length))
    model2.add(Conv1D(64, 5, activation='relu'))
    model2.add(MaxPooling1D(pool_size=4))
    model2.add(LSTM(100))
    # NOTE(review): 11 softmax units are combined with binary_crossentropy
    # while the targets come from the single "smells" column. Confirm the
    # label encoding; the output layer and loss may need to be aligned with it.
    model2.add(Dense(11, activation='softmax'))
    # compile the model
    model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # summarize the model
    print('################# Model 2 ###############')
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model2.summary()

    # Pass the targets as NumPy arrays rather than Python lists.
    model2.fit(padded_docs, np.asarray(Y_train), epochs=5, verbose=1, validation_split=0.2)
    # evaluate the model
    loss, accuracy = model2.evaluate(padded_docs_test, np.asarray(Y_test), verbose=0)
    print('Accuracy: %f' % (accuracy*100))