In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report

In [2]:
def readDataset(file, n=-1):
    file = open(file, 'r')
    data = []
    for i, line in enumerate(file):
        data.append(json.loads(line))
        if n != -1 and i == n:
            break
    return pd.json_normalize(data)

def splitData(data):
    sentences = data['text'].values
    y = data['tag'].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000)
    return sentences_train, sentences_test, y_train, y_test

def adaptVocab(text_dataset):
    vectorize_layer = TextVectorization(
    ngrams=None, max_tokens=None, vocabulary=None,
    output_mode='int', output_sequence_length=None, pad_to_max_tokens=True, 
)
    vectorize_layer.adapt(text_dataset.batch(32))
    vocab_size = len(vectorize_layer.get_vocabulary())
    print('Vocab size:', vocab_size)
    return vectorize_layer, vocab_size

def getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test):
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(sentences_train)


    x_train = tokenizer.texts_to_matrix(sentences_train)
    x_test = tokenizer.texts_to_matrix(sentences_test)

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    return x_train, y_train, x_test, y_test, tokenizer, encoder

def getModel(num_labels):
    model = Sequential()
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

    return model

def get_metrics_by_class(model, x, y):
    y_pred = model.predict(x, batch_size=64, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    y_label = np.argmax(y, axis=1)
    #print(confusion_matrix(y_pred_bool, y_label))
    print(classification_report(y_label, y_pred_bool))




English

In [3]:
english_path = './datasets/en.jsonl'
data = readDataset(english_path)
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
x_train, y_train, x_test, y_test, tokenizer, encoder = getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 20150


In [4]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               10317312  
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

In [5]:
# model.get_classes()
prediction = model.predict(tokenizer.texts_to_matrix(["The pfizer vaccine with ARNm is the best of them "]))

In [6]:
predict_class = np.argmax(prediction, axis=-1)
print(encoder.classes_[predict_class[0]])


vaccines


In [7]:
get_metrics_by_class(model, x_test, y_test)

              precision    recall  f1-score   support

           0       0.48      0.49      0.48       247
           1       0.00      0.00      0.00         2
           2       0.80      0.68      0.74        72
           3       0.49      0.47      0.48       129
           4       0.74      0.79      0.76       525
           5       0.58      0.52      0.55       199

    accuracy                           0.64      1174
   macro avg       0.52      0.49      0.50      1174
weighted avg       0.63      0.64      0.64      1174

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Tagging

In [8]:
dfAll = readDataset('./datasets/en_hash.json', n=1000)
print(dfAll.shape)

(1001, 18)


In [9]:
def predictClass(text):
    prediction = model.predict(tokenizer.texts_to_matrix([text]))
    predict_class = np.argmax(prediction, axis=-1)
    return encoder.classes_[predict_class[0]]

def predictAll(dfAll):
    predictions = []
    for i, text in enumerate(dfAll['text'].values):
        modelPrediction = predictClass(text)
        predictions.append(modelPrediction)
        if i % 1000 == 0:
            endChar = '\n' if i % 10000 == 0 else ' '
            print(i, end=endChar)
    return predictions


In [10]:
predictions = predictAll(dfAll)

0
1000 

In [11]:
len(predictions)

1001

In [12]:
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

In [13]:
subsets[['text', 'simpleTag']][:100]

Unnamed: 0,text,simpleTag
0,Ways to Know It's Time for New Office Space #C...,vaccination
1,A Utah pharmacist will not serve prison time f...,NONE
2,Wasn't one of those principles sending the unc...,NONE
3,ugggggggggg come on people https://t.co/BIcFCB...,vaccination
4,@dougquan @TorontoStar I dont care what millio...,vaccines
...,...,...
95,Went to the gym for the first time since the p...,NONE
96,@JuliaMorales Sadly we can’t keep them from br...,school-reopening
97,@Travisdhanraj What about York Region and Toro...,school-reopening
98,what makes Walking Dead so beloved to so many....,NONE


In [14]:
subsets['simpleTag'].value_counts()

NONE                440
vaccination         345
vaccines            136
school-reopening     57
mental-health        23
Name: simpleTag, dtype: int64

In [15]:
subsets.to_json('./datasets/1000simpleTagged_en.json', orient='records', lines=True)

Español

In [16]:
spanish_path = './datasets/en.jsonl'
data = readDataset(spanish_path)
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
x_train, y_train, x_test, y_test, tokenizer, encoder = getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 20150


In [17]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 512)               10317312  
_________________________________________________________________
activation_3 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_4 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 6)                

In [18]:
get_metrics_by_class(model, x_test, y_test)

              precision    recall  f1-score   support

           0       0.48      0.55      0.51       247
           1       0.00      0.00      0.00         2
           2       0.75      0.67      0.71        72
           3       0.51      0.36      0.42       129
           4       0.76      0.81      0.78       525
           5       0.62      0.52      0.57       199

    accuracy                           0.65      1174
   macro avg       0.52      0.49      0.50      1174
weighted avg       0.65      0.65      0.64      1174

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
dfAll = readDataset('./datasets/es_hash.json', n=1000)
print(dfAll.shape)

predictions = predictAll(dfAll)
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

(1001, 17)
0
1000 

In [20]:
subsets['simpleTag'].value_counts()

vaccination      678
NONE             229
vaccines          78
mental-health     16
Name: simpleTag, dtype: int64

In [21]:
subsets.to_json('./datasets/1000simpleTagged_es.json', orient='records', lines=True)

Frances

In [22]:
french_path = './datasets/fr.jsonl'
data = readDataset(french_path)

In [23]:
data['tag'].unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence', 'vaccine',
       'school reopening', 'none', 'mental health', 'vaccination ',
       'mental health ', 'mental-health '], dtype=object)

- vaccines
- vaccination
- mental-health
- school-reopening
- household-violence
- NONE

In [24]:
newData = data['tag'].apply(lambda x: x.replace("vaccines", "vaccine").replace("vaccine", "vaccines"))
newData = newData.apply(lambda x: x.strip())
newData = newData.apply(lambda x: x.replace("mental health", "mental-health").replace("none", "NONE"))
newData = newData.apply(lambda x: x.replace("school reopening", "school-reopening"))

In [25]:
newData.unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence'], dtype=object)

In [26]:
data['tag'] = newData
data['tag'].unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence'], dtype=object)

Save cleaned Data


In [27]:
data.to_json('./datasets/tagged/fr.jsonl', orient='records', lines=True)

In [28]:
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
# y_train
# data
x_train, y_train, x_test, y_test, tokenizer, encoder = getTokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 16361


In [29]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 512)               8377344   
_________________________________________________________________
activation_6 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_7 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 6)                

In [30]:
get_metrics_by_class(model, x_test, y_test)

              precision    recall  f1-score   support

           0       0.43      0.26      0.33       103
           1       0.00      0.00      0.00         2
           2       0.67      0.33      0.44        12
           3       0.79      0.76      0.77        49
           4       0.61      0.61      0.61       296
           5       0.65      0.75      0.70       338

    accuracy                           0.63       800
   macro avg       0.52      0.45      0.48       800
weighted avg       0.62      0.63      0.62       800

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
dfAll = readDataset('./datasets/fr_hash.json', n=1000)
print(dfAll.shape)

predictions = predictAll(dfAll)
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

(1001, 18)
0
1000 

In [32]:
subsets['simpleTag'].value_counts()

vaccines            453
vaccination         326
NONE                195
school-reopening     21
mental-health         6
Name: simpleTag, dtype: int64

In [33]:
subsets.to_json('./datasets/1000simpleTagged_fr.json', orient='records', lines=True)