In [28]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
url = "http://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset = tf.keras.utils.get_file("stack_overflow_16k", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'stack_overflow_16k')

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


In [5]:
batch_size = 32
seed = 218

train_dataset = preprocessing.text_dataset_from_directory(
    '/content/train', 
    label_mode="categorical", 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed
  )

val_dataset = preprocessing.text_dataset_from_directory(
    '/content/train', 
    label_mode="categorical", 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed
  )



Found 8000 files belonging to 4 classes.
Using 6400 files for training.
Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [6]:
# def custom_standardization(input_data):
#   lowercase = tf.strings.lower(input_data)
#   stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
#   return tf.strings.regex_replace(stripped_html,
#                                   '[%s]' % re.escape(string.punctuation),
#                                   '')

vectorize_layer = TextVectorization(
    max_tokens=10000,  
)


train_text = train_dataset.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [7]:
def vectorize_text(text, label):
  return vectorize_layer(text), label

In [8]:
train_dataset = train_dataset.map(vectorize_text)
val_dataset = val_dataset.map(vectorize_text)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [9]:
vocab_len = len(vectorize_layer.get_vocabulary())+1
model = tf.keras.Sequential([
  layers.Embedding(vocab_len, 16),
  layers.Conv1D(filters=64, kernel_size=5,
                strides=1, padding="causal",
                activation="relu"),
  layers.Bidirectional(layers.GRU(64)),
  layers.Dense(30, activation="relu"),
  layers.Dense(4, activation="softmax")
])

# model = tf.keras.Sequential([
#   layers.Embedding(vocab_len, 16, mask_zero=True),
#   layers.Bidirectional(layers.LSTM(64)),
#   layers.Dropout(0.2),
#   layers.Dense(1, activation="sigmoid")])

model.compile(loss="binary_crossentropy",
              optimizer='adam',
              metrics=["accuracy"])

epochs = 10
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
test_dataset = preprocessing.text_dataset_from_directory(
    '/content/test', 
    label_mode="categorical", 
    batch_size=batch_size, 
    seed=seed
  )

Found 8000 files belonging to 4 classes.


In [13]:
prod_model = tf.keras.Sequential([
    vectorize_layer,
    model
])

prod_model.compile(loss="binary_crossentropy",
              optimizer='adam',
              metrics=["accuracy"])

In [14]:
prod_model.evaluate(test_dataset)



[0.47904953360557556, 0.7605000138282776]

In [49]:
#Do model prediction with TextVectorization
pred = prod_model.predict(['''
  This is the easiest to use
'''])

#Get index pred with max probability and find the language at the index
lang_options = ["c#", "java", "javascript", "python"]
lang_options[np.argmax(pred[0])]

'python'