In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Location of the tagged, per-language JSON datasets used by the sections below.
folder = './datasets/tagged/'
path_en = f'{folder}integrated_en.json'
path_es = f'{folder}integrated_es.json'
path_fr = f'{folder}integrated_fr.json'

In [3]:
def readDataset(file, n=-1):
    """Load a JSON-lines file into a flat pandas DataFrame.

    Each non-blank line of ``file`` is parsed with ``json.loads`` and the
    resulting records are flattened with ``pd.json_normalize``.

    Args:
        file: path of the JSON-lines file to read.
        n: maximum number of records to load. ``-1`` (the default) or any
            other negative value means "no limit".

    Returns:
        A DataFrame with one row per loaded record (empty when nothing
        was loaded).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    limited = n is not None and n >= 0
    records = []
    # The context manager closes the handle even if parsing fails; the
    # explicit encoding keeps accented text readable on platforms whose
    # default encoding is not UTF-8.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Check the limit before appending so exactly n records are kept.
            if limited and len(records) >= n:
                break
            # Blank lines (e.g. a trailing newline) are skipped, not parsed.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return pd.json_normalize(records)

def splitData(data):
    """Split the ``text`` and ``tag`` columns into train and test parts.

    Args:
        data: object indexable by ``'text'`` and ``'tag'`` whose columns
            expose ``.values``.

    Returns:
        A 4-tuple ``(sentences_train, sentences_test, y_train, y_test)``
        produced by ``train_test_split`` with a 20% test share and a fixed
        ``random_state`` of 1000.
    """
    texts = data['text'].values
    tags = data['tag'].values
    parts = train_test_split(texts, tags, test_size=0.20, random_state=1000)
    return tuple(parts)

def adaptVocab(text_dataset):
    """Adapt a ``TextVectorization`` layer to a dataset and report its vocabulary size.

    Args:
        text_dataset: dataset exposing ``.batch``; it is batched in groups
            of 32 before being handed to ``adapt``.

    Returns:
        ``(vectorize_layer, vocab_size)``: the adapted layer and the length
        of the list returned by its ``get_vocabulary()``. The size is also
        printed.
    """
    layer_options = dict(
        ngrams=None,
        max_tokens=None,
        vocabulary=None,
        output_mode='int',
        output_sequence_length=None,
        pad_to_max_tokens=True,
    )
    vectorizer = TextVectorization(**layer_options)
    vectorizer.adapt(text_dataset.batch(32))

    n_tokens = len(vectorizer.get_vocabulary())
    print('Vocab size:', n_tokens)
    return vectorizer, n_tokens

def getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test):
    """Turn sentences into matrices and labels into binarized targets.

    Args:
        vocab_size: passed to ``Tokenizer`` as ``num_words``.
        sentences_train: texts used to fit the tokenizer and to build ``x_train``.
        sentences_test: texts converted into ``x_test`` with the fitted tokenizer.
        y_train: labels used to fit the ``LabelBinarizer`` and then transformed.
        y_test: labels transformed with the binarizer fitted on ``y_train``.

    Returns:
        ``(x_train, y_train, x_test, y_test, tokenizer, encoder)``.
    """
    # Both the tokenizer and the label encoder are fitted on training data only.
    word_tokenizer = Tokenizer(num_words=vocab_size)
    word_tokenizer.fit_on_texts(sentences_train)
    train_matrix, test_matrix = [
        word_tokenizer.texts_to_matrix(texts)
        for texts in (sentences_train, sentences_test)
    ]

    label_encoder = LabelBinarizer()
    label_encoder.fit(y_train)
    train_targets = label_encoder.transform(y_train)
    test_targets = label_encoder.transform(y_test)

    return (train_matrix, train_targets, test_matrix, test_targets,
            word_tokenizer, label_encoder)

def getModel(num_labels, input_dim=None, hidden_units=512, dropout_rate=0.3):
    """Build, summarize and compile a dense softmax classifier.

    The network is Dense -> ReLU -> Dropout, repeated twice, followed by a
    Dense layer of ``num_labels`` units with a softmax activation.

    Args:
        num_labels: number of units in the output layer.
        input_dim: width of each input vector. When omitted, the
            notebook-level ``vocab_size`` variable is used, which is what
            this function previously read implicitly. Pass it explicitly
            to avoid depending on cell execution order.
        hidden_units: number of units in each of the two hidden layers.
        dropout_rate: rate passed to both ``Dropout`` layers.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a
        side effect.

    Raises:
        NameError: if ``input_dim`` is omitted and no global ``vocab_size``
            has been defined yet.
    """
    if input_dim is None:
        # Backward-compatible fallback to the global set by the data-prep cell.
        input_dim = vocab_size

    model = Sequential()
    model.add(Dense(hidden_units, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model




English

In [4]:

# Load the English dataset and count its distinct tags.
data = readDataset(path_en)
num_labels = len(data["tag"].unique())

# Split into train and test sentences/labels.
sentences_train, sentences_test, y_train, y_test = splitData(data)

# Derive the vocabulary size from the training sentences only.
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Convert sentences to matrices and labels to binarized targets.
(x_train, y_train, x_test, y_test,
 tokenizer, encoder) = getTokenizer(
    vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 22779


In [5]:
# Build a fresh classifier for the English data and train it.
model = getModel(num_labels)
num_epochs = 10
batch_size = 128
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    validation_split=0.2,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               11663360  
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

In [6]:
# Classify one hand-written English sentence and print the predicted tag.
prediction = model.predict(
    tokenizer.texts_to_matrix(
        ["The pfizer vaccine with ARNm is the best of them "]
    )
)
predict_class = np.argmax(prediction, axis=-1)
print(encoder.classes_[predict_class[0]])

vaccines


Spanish

In [7]:
# Load the Spanish dataset and count its distinct tags.
data = readDataset(path_es)
num_labels = len(data["tag"].unique())

# Split into train and test sentences/labels.
sentences_train, sentences_test, y_train, y_test = splitData(data)

# Derive the vocabulary size from the training sentences only.
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Convert sentences to matrices and labels to binarized targets.
(x_train, y_train, x_test, y_test,
 tokenizer, encoder) = getTokenizer(
    vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 24550


In [8]:
# Build a fresh classifier for the Spanish data and train it.
model = getModel(num_labels)
num_epochs = 10
batch_size = 128
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    validation_split=0.2,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               12570112  
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_4 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 6)                

In [9]:
# Classify one hand-written Spanish sentence and print the predicted tag.
prediction = model.predict(
    tokenizer.texts_to_matrix(
        ["Las vacunas llegarán este jueves a Francia para empezar con el proceso de vacunación"]
    )
)
predict_class = np.argmax(prediction, axis=-1)
print(encoder.classes_[predict_class[0]])

vaccination


French

In [10]:
# Load the French dataset and count its distinct tags.
data = readDataset(path_fr)
num_labels = len(data["tag"].unique())

# Split into train and test sentences/labels.
sentences_train, sentences_test, y_train, y_test = splitData(data)

# Derive the vocabulary size from the training sentences only.
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Convert sentences to matrices and labels to binarized targets.
(x_train, y_train, x_test, y_test,
 tokenizer, encoder) = getTokenizer(
    vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 18508


In [11]:
# Build a fresh classifier for the French data and train it.
model = getModel(num_labels)
num_epochs = 10
batch_size = 128
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=2,
    validation_split=0.2,
)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 512)               9476608   
_________________________________________________________________
activation_6 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_7 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                

In [13]:
# Classify one hand-written sentence and print the predicted tag.
# NOTE(review): this sample is English, but the model and tokenizer in scope
# were fitted on the path_fr dataset - confirm whether a French sentence was
# intended here.
prediction = model.predict(
    tokenizer.texts_to_matrix(
        ["School called me saying that my kid was contact traced, a classmate got covid, so he has to stay home. His sibling also, because, well, they are siblings\nI love them both, but...all the cooking! Why do they have to eat more than once a day??! They don't make good pets."]
    )
)
predict_class = np.argmax(prediction, axis=-1)
print(encoder.classes_[predict_class[0]])

NONE
