In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
from tensorflow.data import Dataset
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from transformers import pipeline
import json
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

In [2]:
def readDataset(file, n=-1):
    file = open(file, 'r')
    data = []
    for i, line in enumerate(file):
        data.append(json.loads(line))
        if n != -1 and i == n:
            break
    return pd.json_normalize(data)

def splitData(data):
    sentences = data['text'].values
    y = data['tag'].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=1000)
    return sentences_train, sentences_test, y_train, y_test

def adaptVocab(text_dataset):
    vectorize_layer = TextVectorization(
    ngrams=None, max_tokens=None, vocabulary=None,
    output_mode='int', output_sequence_length=None, pad_to_max_tokens=True, 
)
    vectorize_layer.adapt(text_dataset.batch(32))
    vocab_size = len(vectorize_layer.get_vocabulary())
    print('Vocab size:', vocab_size)
    return vectorize_layer, vocab_size

def tokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test):
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(sentences_train)


    x_train = tokenizer.texts_to_matrix(sentences_train)
    x_test = tokenizer.texts_to_matrix(sentences_test)

    encoder = LabelBinarizer()
    encoder.fit(y_train)
    y_train = encoder.transform(y_train)
    y_test = encoder.transform(y_test)

    return x_train, y_train, x_test, y_test, tokenizer, encoder

def getModel(num_labels):
    model = Sequential()
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_labels))
    model.add(Activation('softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

    return model




English

In [3]:
english_path = './datasets/en.jsonl'
data = readDataset(english_path)
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
x_train, y_train, x_test, y_test, tokenizer, encoder = tokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 20150


In [4]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               10317312  
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

In [5]:
# model.get_classes()
prediction = model.predict(tokenizer.texts_to_matrix(["The pfizer vaccine with ARNm is the best of them "]))

In [6]:
predict_class = np.argmax(prediction, axis=-1)
print(encoder.classes_[predict_class[0]])


vaccines


Tagging

In [7]:
dfAll = readDataset('./datasets/en_hash.json', n=1000)
print(dfAll.shape)

(1001, 18)


In [3]:
def predictClass(text):
    prediction = model.predict(tokenizer.texts_to_matrix([text]))
    predict_class = np.argmax(prediction, axis=-1)
    return encoder.classes_[predict_class[0]]

def predictAll(dfAll):
    predictions = []
    for i, text in enumerate(dfAll['text'].values):
        modelPrediction = predictClass(text)
        predictions.append(modelPrediction)
        if i % 1000 == 0:
            endChar = '\n' if i % 10000 == 0 else ' '
            print(i, end=endChar)
    return predictions


In [9]:
predictions = predictAll(dfAll)

0
1000 

In [14]:
len(predictions)

1001

In [17]:
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

In [21]:
subsets[['text', 'simpleTag']][:100]

Unnamed: 0,text,simpleTag
0,Ways to Know It's Time for New Office Space #C...,vaccination
1,A Utah pharmacist will not serve prison time f...,NONE
2,Wasn't one of those principles sending the unc...,NONE
3,ugggggggggg come on people https://t.co/BIcFCB...,vaccination
4,@dougquan @TorontoStar I dont care what millio...,vaccines
...,...,...
95,Went to the gym for the first time since the p...,NONE
96,@JuliaMorales Sadly we can’t keep them from br...,school-reopening
97,@Travisdhanraj What about York Region and Toro...,school-reopening
98,what makes Walking Dead so beloved to so many....,NONE


In [30]:
subsets['simpleTag'].value_counts()

NONE                459
vaccination         280
vaccines            200
mental-health        31
school-reopening     31
Name: simpleTag, dtype: int64

In [34]:
subsets.to_json('./datasets/1000simpleTagged_en.json', orient='records', lines=True)

Español

In [3]:
spanish_path = './datasets/en.jsonl'
data = readDataset(spanish_path)
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
x_train, y_train, x_test, y_test, tokenizer, encoder = tokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 20150


In [4]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               10317312  
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

In [7]:
dfAll = readDataset('./datasets/es_hash.json', n=1000)
print(dfAll.shape)

predictions = predictAll(dfAll)
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

(1001, 17)
0
1000 

In [8]:
subsets['simpleTag'].value_counts()

vaccination      586
vaccines         228
NONE             176
mental-health     11
Name: simpleTag, dtype: int64

In [9]:
subsets.to_json('./datasets/1000simpleTagged_es.json', orient='records', lines=True)

Frances

In [3]:
french_path = './datasets/fr.jsonl'
data = readDataset(french_path)

In [4]:
data['tag'].unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence', 'vaccine',
       'school reopening', 'none', 'mental health', 'vaccination ',
       'mental health ', 'mental-health '], dtype=object)

- vaccines
- vaccination
- mental-health
- school-reopening
- household-violence
- NONE

In [5]:
newData = data['tag'].apply(lambda x: x.replace("vaccines", "vaccine").replace("vaccine", "vaccines"))
newData = newData.apply(lambda x: x.strip())
newData = newData.apply(lambda x: x.replace("mental health", "mental-health").replace("none", "NONE"))
newData = newData.apply(lambda x: x.replace("school reopening", "school-reopening"))

In [6]:
newData.unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence'], dtype=object)

In [7]:
data['tag'] = newData
data['tag'].unique()

array(['vaccination', 'vaccines', 'mental-health', 'NONE',
       'school-reopening', 'household-violence'], dtype=object)

Save cleaned Data


In [8]:
data.to_json('./datasets/tagged/fr.jsonl', orient='records', lines=True)

In [9]:
num_labels = len(data["tag"].unique())
sentences_train, sentences_test, y_train, y_test = splitData(data)
text_dataset = Dataset.from_tensor_slices((sentences_train))
vectorize_layer, vocab_size = adaptVocab(text_dataset)
# y_train
# data
x_train, y_train, x_test, y_test, tokenizer, encoder = tokenizer(vocab_size, sentences_train, sentences_test, y_train, y_test)

Vocab size: 16361


In [10]:
model = getModel(num_labels)
num_epochs =10
batch_size = 128
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               8377344   
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 3

In [11]:
dfAll = readDataset('./datasets/fr_hash.json', n=1000)
print(dfAll.shape)

predictions = predictAll(dfAll)
subsets = pd.DataFrame(dfAll.iloc[:1001, : ])
subsets['simpleTag'] = predictions

(1001, 18)
0
1000 

In [12]:
subsets['simpleTag'].value_counts()

vaccines            459
vaccination         368
NONE                143
school-reopening     21
mental-health        10
Name: simpleTag, dtype: int64

In [13]:
subsets.to_json('./datasets/1000simpleTagged_fr.json', orient='records', lines=True)