In [15]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

In [16]:
# Location of the tagged, per-language datasets (one file per language).
folder = './datasets/tagged/'
path_en = f'{folder}integrated_en.json'
path_es = f'{folder}integrated_es.json'
path_fr = f'{folder}integrated_fr.json'

In [17]:
def readDataset(file, n=-1):
    """Load a JSON-lines file into a flattened pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON document and the
    resulting records are flattened with ``pd.json_normalize``.

    Parameters
    ----------
    file : str or path-like
        Path of the JSON-lines file to read.
    n : int, default -1
        Maximum number of records to load. Any negative value (the default
        ``-1`` included) means "read the whole file".

    Returns
    -------
    pandas.DataFrame
        One row per parsed record.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # The context manager guarantees the handle is closed, even when a line
    # fails to parse. The encoding is explicit so that accented text is not
    # decoded with a platform-dependent default.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Check the limit *before* appending so exactly n records are kept.
            if n >= 0 and len(records) >= n:
                break
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            records.append(json.loads(line))
    return pd.json_normalize(records)

def splitData(data):
    """Split a tagged DataFrame into train and test texts and labels.

    Reads the ``'text'`` and ``'tag'`` columns of ``data`` and holds out 20%
    of the rows for testing. The random seed is fixed, so the split is the
    same on every run.

    Returns
    -------
    tuple
        ``(sentences_train, sentences_test, y_train, y_test)``.
    """
    texts = data['text'].values
    tags = data['tag'].values
    train_texts, test_texts, train_tags, test_tags = train_test_split(
        texts,
        tags,
        test_size=0.20,
        random_state=1000,
    )
    return train_texts, test_texts, train_tags, test_tags

def adaptVocab(text_dataset):
    """Fit a ``TextVectorization`` layer on a text dataset and report its vocabulary size.

    The dataset is consumed in batches of 32. The vocabulary size is printed
    and returned together with the adapted layer.

    Returns
    -------
    tuple
        ``(vectorize_layer, vocab_size)``.
    """
    # NOTE(review): pad_to_max_tokens=True is combined with max_tokens=None;
    # confirm the installed TensorFlow version accepts that combination.
    layer_options = dict(
        max_tokens=None,
        ngrams=None,
        vocabulary=None,
        output_mode='int',
        output_sequence_length=None,
        pad_to_max_tokens=True,
    )
    vectorizer = TextVectorization(**layer_options)
    vectorizer.adapt(text_dataset.batch(32))

    vocabulary = vectorizer.get_vocabulary()
    vocab_size = len(vocabulary)
    print('Vocab size:', vocab_size)
    return vectorizer, vocab_size

def getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test):
    """Turn raw sentences and tags into matrices ready for training.

    A Keras ``Tokenizer`` limited to ``vocab_size`` words is fitted on the
    training sentences only, then used to convert both splits with
    ``texts_to_matrix``. The labels are binarized with a ``LabelBinarizer``
    fitted on the training labels only.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, tokenizer, encoder)``.
    """
    # Text side: fit on the training split, then encode both splits.
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(sentences_train)
    features_train = tokenizer.texts_to_matrix(sentences_train)
    features_test = tokenizer.texts_to_matrix(sentences_test)

    # Label side: the encoder only ever sees the training labels when fitting.
    encoder = LabelBinarizer()
    encoder.fit(y_train)
    labels_train = encoder.transform(y_train)
    labels_test = encoder.transform(y_test)

    return features_train, labels_train, features_test, labels_test, tokenizer, encoder

def getModel(num_labels, input_dim=None):
    """Build and compile the dense bag-of-words classifier.

    The network is two hidden blocks of Dense(512) -> ReLU -> Dropout(0.3),
    followed by a Dense(num_labels) softmax output. It is compiled with the
    Adam optimizer, categorical cross-entropy loss and the accuracy metric.
    The model summary is printed as a side effect.

    Parameters
    ----------
    num_labels : int
        Number of output classes (width of the softmax layer).
    input_dim : int, optional
        Width of each input vector. When omitted, the notebook-level
        ``vocab_size`` variable is used, which keeps existing
        ``getModel(num_labels)`` calls working. Passing it explicitly avoids
        depending on whichever cell last assigned ``vocab_size``.

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    if input_dim is None:
        # Backward-compatible fallback to the global set by the data-prep cell.
        input_dim = vocab_size

    model = Sequential()
    model.add(Dense(512, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


def get_metrics_by_class(model, x, y):
    """Print a per-class classification report for ``model`` on ``(x, y)``.

    Both the model's predictions and the rows of ``y`` are reduced to class
    indices with an argmax over axis 1 before being compared.
    """
    true_ids = np.argmax(y, axis=1)
    scores = model.predict(x, batch_size=64, verbose=1)
    predicted_ids = np.argmax(scores, axis=1)
    report = classification_report(true_ids, predicted_ids)
    print(report)




English

In [18]:

# Load the English dataset and count its distinct tags.
data = readDataset(path_en)
num_labels = data["tag"].nunique(dropna=False)

# Hold out a test split, then size the vocabulary from the training sentences only.
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Encode the sentences and labels into the matrices the model consumes.
(x_train, y_train, x_test, y_test, tokenizer, encoder) = getTokenizer(
    vocab_size,
    sentences_train,
    sentences_test,
    y_train,
    y_test,
)

Vocab size: 22779


In [19]:
# Training settings for the English classifier.
num_epochs = 10
batch_size = 128

# Build a fresh model and train it, keeping 20% of the training data for validation.
model = getModel(num_labels)
history = model.fit(
    x_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=2,
)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 512)               11663360  
_________________________________________________________________
activation_9 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_10 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 6)                

In [20]:
# Spot check: classify one hand-written sentence and print the predicted tag.
sample_matrix = tokenizer.texts_to_matrix(
    ["The pfizer vaccine with ARNm is the best of them "]
)
prediction = model.predict(sample_matrix)
predict_class = np.argmax(prediction, axis=-1)
predicted_tag = encoder.classes_[predict_class[0]]
print(predicted_tag)

vaccines


In [22]:
# Per-class precision/recall/F1 on the held-out test split.
get_metrics_by_class(model=model, x=x_test, y=y_test)

              precision    recall  f1-score   support

           0       0.64      0.58      0.61       374
           1       0.89      0.67      0.76        24
           2       0.84      0.69      0.76        75
           3       0.50      0.55      0.52       122
           4       0.71      0.84      0.77       545
           5       0.67      0.50      0.57       233

    accuracy                           0.67      1373
   macro avg       0.71      0.64      0.66      1373
weighted avg       0.67      0.67      0.67      1373



Spanish

In [23]:
# Load the Spanish dataset and count its distinct tags.
data = readDataset(path_es)
num_labels = data["tag"].nunique(dropna=False)

# Hold out a test split, then size the vocabulary from the training sentences only.
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Encode the sentences and labels into the matrices the model consumes.
(x_train, y_train, x_test, y_test, tokenizer, encoder) = getTokenizer(
    vocab_size,
    sentences_train,
    sentences_test,
    y_train,
    y_test,
)

Vocab size: 24550


In [24]:
# Training settings for the Spanish classifier.
num_epochs = 10
batch_size = 128

# Build a fresh model and train it, keeping 20% of the training data for validation.
model = getModel(num_labels)
history = model.fit(
    x_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=2,
)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 512)               12570112  
_________________________________________________________________
activation_12 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_13 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 6)                

In [25]:
# Spot check: classify one hand-written sentence and print the predicted tag.
sample_matrix = tokenizer.texts_to_matrix(
    ["Las vacunas llegarán este jueves a Francia para empezar con el proceso de vacunación"]
)
prediction = model.predict(sample_matrix)
predict_class = np.argmax(prediction, axis=-1)
predicted_tag = encoder.classes_[predict_class[0]]
print(predicted_tag)

vaccination


In [26]:
# Per-class precision/recall/F1 on the held-out test split.
get_metrics_by_class(model=model, x=x_test, y=y_test)

              precision    recall  f1-score   support

           0       0.60      0.68      0.64       490
           1       0.77      0.77      0.77        30
           2       0.50      0.18      0.27        22
           3       0.80      0.78      0.79        36
           4       0.62      0.63      0.62       460
           5       0.52      0.42      0.46       261

    accuracy                           0.60      1299
   macro avg       0.63      0.57      0.59      1299
weighted avg       0.60      0.60      0.60      1299



French

In [27]:
# Load the French dataset and count its distinct tags.
data = readDataset(path_fr)
num_labels = data["tag"].nunique(dropna=False)

# Hold out a test split, then size the vocabulary from the training sentences only.
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices(sentences_train)
vectorize_layer, vocab_size = adaptVocab(text_dataset)

# Encode the sentences and labels into the matrices the model consumes.
(x_train, y_train, x_test, y_test, tokenizer, encoder) = getTokenizer(
    vocab_size,
    sentences_train,
    sentences_test,
    y_train,
    y_test,
)

Vocab size: 18508


In [28]:
# Training settings for the French classifier.
num_epochs = 10
batch_size = 128

# Build a fresh model and train it, keeping 20% of the training data for validation.
model = getModel(num_labels)
history = model.fit(
    x_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=2,
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 512)               9476608   
_________________________________________________________________
activation_15 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 512)               262656    
_________________________________________________________________
activation_16 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 6)                

In [29]:
# Per-class precision/recall/F1 on the held-out test split.
get_metrics_by_class(model=model, x=x_test, y=y_test)

              precision    recall  f1-score   support

           0       0.62      0.45      0.52       165
           1       1.00      0.86      0.92         7
           2       0.86      0.33      0.48        18
           3       0.67      0.71      0.69        55
           4       0.58      0.67      0.62       323
           5       0.67      0.67      0.67       359

    accuracy                           0.63       927
   macro avg       0.73      0.62      0.65       927
weighted avg       0.63      0.63      0.63       927



In [30]:
# Spot check: classify one hand-written sentence and print the predicted tag.
# NOTE(review): the sample below is written in English, while this cell sits in
# the French section of the notebook — confirm that testing the model on
# out-of-language text is intentional.
sample_matrix = tokenizer.texts_to_matrix(
    ["School called me saying that my kid was contact traced, a classmate got covid, so he has to stay home. His sibling also, because, well, they are siblings\nI love them both, but...all the cooking! Why do they have to eat more than once a day??! They don't make good pets."]
)
prediction = model.predict(sample_matrix)
predict_class = np.argmax(prediction, axis=-1)
predicted_tag = encoder.classes_[predict_class[0]]
print(predicted_tag)

NONE
