# nlp-transform-snippets

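Creates snippets out of large text files, encodes them as one-hot character vectors, and trains a convolutional model that predicts the programming language of each snippet.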

In [None]:
!pip3 install wget==3.2

In [None]:
import wget
import logging
import numpy as np
import os
import re
import shutil
import sys
import tarfile
import time

In [None]:
# File name of the training data archive; it is looked up inside data_dir.
# The environment key must match the variable name exactly (no whitespace).
input_filename = os.environ.get('input_filename', 'data.zip')

# File name of the resulting model zip archive (written into data_dir).
output_model_zip = os.environ.get('output_model_zip', 'model.zip')

# Temporary data storage for local execution. Paths below are built by plain
# string concatenation, so the value has to end with a path separator.
data_dir = os.environ.get('data_dir', '../../data/')

In [None]:
# Turn every ``name=value`` command-line argument into the text of a Python
# assignment, ``name="value"``. An argument qualifies when it *starts* with
# something shaped like ``name=value``: re.match only anchors at the
# beginning, so trailing characters outside the character classes still pass.
parameters = [
    arg.replace('=', '="') + '"'
    for arg in sys.argv
    if '=' in arg and re.match(r'[A-Za-z0-9_]*=[.\/A-Za-z0-9]*', arg)
]

for parameter in parameters:
    logging.warning('Parameter: ' + parameter)
    # NOTE(security): exec() runs text taken verbatim from sys.argv. Because
    # the filter above is only a prefix check, a value containing a double
    # quote can terminate the string literal and execute arbitrary code.
    # Only run this with trusted command lines.
    exec(parameter)

In [None]:
# Unpack the training archive into a fresh sub-folder of data_dir whose name
# is the current timestamp.
source_folder = str(time.time())
shutil.unpack_archive(
    data_dir + input_filename,
    extract_dir=data_dir + source_folder,
)

In [None]:
# TODO generalize
# Supported characters: lower-case ASCII letters, decimal digits and ASCII
# punctuation. A character's position in `alphabet` is the index that is set
# in its one-hot vector.
letter = 'abcdefghijklmnopqrstuvwxyz'
digits = '0123456789'
others = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
alphabet = ''.join((letter, digits, others))
print('alphabet size:', len(alphabet))

# Padding vector: as wide as the alphabet, all zeroes.
pad_vector = [0] * len(alphabet)

# One-hot vector for every supported character, computed once up front.
supported_chars_map = {
    ch: [1 if pos == idx else 0 for pos in range(len(alphabet))]
    for idx, ch in enumerate(alphabet)
}


In [None]:
def get_source_snippets(file_name, breakup=True):
    """Read a text file, lower-case it and return it as a list of snippets.

    Args:
        file_name: Path of the file to read.
        breakup: When true and the file has more than 50 lines, the text is
            split into three consecutive parts of about equal line count.

    Returns:
        A list of three strings when the text was split, otherwise a list
        whose single element is the whole lower-cased text.
    """
    # Decode explicitly as UTF-8 and substitute undecodable bytes, so the
    # result does not depend on the platform's default encoding and a single
    # stray byte cannot abort the whole run.
    with open(file_name, mode='r', encoding='utf-8', errors='replace') as file:
        text = file.read().lower()

    if not breakup:
        return [text]

    lines = text.split('\n')
    if len(lines) <= 50:
        return [text]

    third = len(lines) // 3
    # The last part also takes the remainder lines of the integer division.
    cuts = [(0, third), (third, 2 * third), (2 * third, len(lines))]
    return ['\n'.join(lines[start:stop]) for start, stop in cuts]

In [None]:
def turn_sample_to_vector(sample, sample_vectors_size=1024,
                          normalize_whitespace=True):
    """Encode a text sample as a fixed-length sequence of character vectors.

    Args:
        sample: The text to encode.
        sample_vectors_size: Number of vectors in the result; longer samples
            are truncated, shorter ones are padded with `pad_vector`.
        normalize_whitespace: When true, every run of white-space is
            collapsed to a single space before encoding.

    Returns:
        A numpy array built from `sample_vectors_size` vectors, each taken
        from `supported_chars_map` or equal to `pad_vector`.
    """
    if normalize_whitespace:
        # \s already covers '\n', '\r' and '\t', so one substitution maps
        # all white-space to a space and compacts runs in a single pass.
        sample = re.sub(r'\s+', ' ', sample)

    # Encode the characters; anything missing from supported_chars_map is
    # skipped. Then truncate to the fixed length.
    sample_vectors = [
        supported_chars_map[ch] for ch in sample if ch in supported_chars_map
    ][:sample_vectors_size]

    # Pad with all-zero vectors up to the fixed length.
    missing = sample_vectors_size - len(sample_vectors)
    if missing > 0:
        sample_vectors.extend([pad_vector] * missing)

    return np.array(sample_vectors)

In [None]:
def turn_file_to_vectors(file_name, sample_vectors_size=1024, normalize_whitespace=True, breakup=True):
    """Load a file as snippets and encode each snippet as a vector array.

    The file is read via get_source_snippets (honouring `breakup`) and every
    snippet is passed to turn_sample_to_vector; the encoded snippets are
    returned as a list in the same order.
    """
    encoded = []
    for snippet in get_source_snippets(file_name, breakup):
        encoded.append(
            turn_sample_to_vector(snippet, sample_vectors_size, normalize_whitespace)
        )
    return encoded

In [None]:
def get_input_and_labels(root_folder, sample_vectors_size=1024, breakup=True):
    """Build the sample and label arrays from one sub-folder per language.

    For every entry of the module-level `langs` list, the regular files in
    ``root_folder/<lang>`` are encoded with turn_file_to_vectors and paired
    with a one-hot class label whose set index is the language's position in
    `langs`.

    Args:
        root_folder: Directory that contains one sub-folder per language.
        sample_vectors_size: Forwarded to turn_file_to_vectors.
        breakup: Forwarded to turn_file_to_vectors.

    Returns:
        A tuple ``(X, Y)`` of int8 numpy arrays: the encoded samples and the
        class label belonging to each sample.
    """
    X = []
    Y = []
    for i, lang in enumerate(langs):
        print('Processing language:', lang)
        # One-hot class label vector:
        class_label = [0] * num_classes
        class_label[i] = 1
        # For all files in language folder:
        folder = os.path.join(root_folder, lang)
        for fn in os.listdir(folder):
            if fn.startswith("."):
                continue  # Skip hidden files and Jupyterlab cache directories
            file_name = os.path.join(folder, fn)
            if not os.path.isfile(file_name):
                continue  # Sub-directories cannot be opened as text files
            sample_vectors = turn_file_to_vectors(
                file_name,
                sample_vectors_size=sample_vectors_size,
                breakup=breakup,
            )
            for fv in sample_vectors:
                X.append(fv)                 # the sample feature vector
                Y.append(class_label)        # the class ground-truth

    return np.array(X, dtype=np.int8), np.array(Y, dtype=np.int8)

In [None]:
# TODO generalize
# Class names of the classifier. Each entry is also the name of the folder
# that get_input_and_labels reads for that class, and its position in the
# list is the index set in the one-hot class label.
langs = [
    "C", "C#", "C++", "D", "Haskell",
    "Java", "JavaScript", "PHP", "Python", "Rust",
]

num_classes = len(langs)

In [None]:
# Encode everything found below the 'train' folder of the unpacked archive.
x, y = get_input_and_labels(root_folder=data_dir + source_folder + '/train')  # TODO use data folder

# Shuffle data: draw one random ordering and apply it to both arrays so that
# every sample stays paired with its label.
shuffle_indices = np.random.permutation(len(y))
x_shuffled, y_shuffled = x[shuffle_indices], y[shuffle_indices]

print('samples shape', x_shuffled.shape)
print('class labels shape:', y_shuffled.shape)

In [None]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dense, Dropout, Flatten, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Concatenate

# Model Hyperparameters
kernel_sizes = (3, 9, 19)
pooling_sizes = (3, 9, 19)
num_filters = 128
dropout_prob = 0.5
hidden_dims = 128

# Input: one sample is a sequence of 1024 character vectors (the default
# sample_vectors_size of the encoding functions), each as wide as the
# alphabet. Deriving the width from `alphabet` keeps the model in step with
# the encoding if the character set changes.
stage_in = Input(shape=(1024, len(alphabet)))

# One convolution -> max-pooling -> flatten branch per kernel size; kernel
# and pooling sizes are paired position by position.
convs = []
for kernel_size, pool_size in zip(kernel_sizes, pooling_sizes):
    conv = Conv1D(filters=num_filters,
                  kernel_size=kernel_size,
                  padding='valid',
                  activation='relu',
                  strides=1)(stage_in)
    pool = MaxPooling1D(pool_size=pool_size)(conv)
    flatten = Flatten()(pool)
    convs.append(flatten)

# Concatenate is only applied when there is more than one branch to merge.
if len(kernel_sizes) > 1:
    out = Concatenate()(convs)
else:
    out = convs[0]

stages = Model(inputs=stage_in, outputs=out)

# Classifier head on top of the merged convolution features: one hidden
# layer, dropout, and a softmax over the language classes.
model = Sequential([
    stages,
    Dense(hidden_dims, activation='relu'),
    Dropout(dropout_prob),
    Dense(num_classes, activation='softmax')
])

model.summary()

# Note: also need pydot and GraphViz installed for this.
#from tensorflow.keras.utils import plot_model                               
#plot_model(model, show_shapes=True, expand_nested=True)

In [None]:
# Training configuration
batch_size = 64
num_epochs = 20
val_split = 0.1    # passed to fit() as validation_split

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the shuffled data; the returned object is kept in `history`.
history = model.fit(
    x_shuffled,
    y_shuffled,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=val_split,
    verbose=1,
)

In [None]:
# Save the trained model below data_dir under a fresh timestamp-named path.
model_folder = str(time.time())
model.save(data_dir + model_folder)

# make_archive appends '.zip' itself, so hand it the name without that
# suffix. Only a trailing '.zip' is stripped; an earlier '.zip' inside the
# name must not cut the name short.
archive_base = (
    output_model_zip[:-len('.zip')]
    if output_model_zip.endswith('.zip')
    else output_model_zip
)
shutil.make_archive(data_dir + archive_base, 'zip', data_dir + model_folder)