In [1]:
from transformers import TFBertModel,  BertConfig, BertTokenizerFast

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [2]:
# Topic whose "<topic>_stance" / "<topic>_argument" columns are modelled.
# Other values mentioned in this notebook: "vaccines", "masks".
CLASS_NAME: str = "quarantine"

In [3]:
import os

# Restrict this process to the GPU with index 4.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})

In [4]:
# Hugging Face checkpoint that supplies the config, tokenizer and weights
model_name = 'DeepPavlov/rubert-base-cased-sentence'

# Fetch the checkpoint's config; the keyword override switches off the
# per-layer hidden-state outputs
config = BertConfig.from_pretrained(model_name, output_hidden_states=False)

# Fast tokenizer that belongs to the same checkpoint
tokenizer = BertTokenizerFast.from_pretrained(model_name, config=config)

# TensorFlow model initialised from the checkpoint's PyTorch weights (from_pt=True)
transformer_model = TFBertModel.from_pretrained(model_name, config=config, from_pt=True)

# First layer of the wrapper model (the BERT main layer); it is reused
# further down as a layer inside a custom Keras model
bert = transformer_model.layers[0]

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Load the training split (tab-separated)
data = pd.read_csv('train.tsv', sep='\t')

# Keep only the text and the two target columns of the selected topic.
# .copy() makes this an independent frame, so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning.
data = data[['text', f'{CLASS_NAME}_stance', f'{CLASS_NAME}_argument']].copy()

for task in ('stance', 'argument'):
    target_col = f'{CLASS_NAME}_{task}'

    # A missing label would receive categorical code -1 below, and the later
    # one-hot encoding would silently treat it as the last class.
    if data[target_col].isna().any():
        raise ValueError(f"Column '{target_col}' of train.tsv contains missing labels")

    # Keep the categorical view: its categories fix the class order
    data[f'{task}_label'] = pd.Categorical(data[target_col])

    # Replace the raw label with its integer class index
    data[target_col] = data[f'{task}_label'].cat.codes

In [55]:
# Load the validation split (tab-separated)
data_test = pd.read_csv('./val_all.tsv', sep='\t')

# Keep only the text and the two target columns of the selected topic.
# .copy() makes this an independent frame, so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning.
data_test = data_test[['text', f'{CLASS_NAME}_stance', f'{CLASS_NAME}_argument']].copy()

for task in ('stance', 'argument'):
    target_col = f'{CLASS_NAME}_{task}'

    # Encode with the categories of the training frame so that class index k
    # denotes the same label here as in the training targets. Deriving the
    # categories from this file alone would renumber the classes whenever a
    # label is absent from the validation split.
    data_test[f'{task}_label'] = pd.Categorical(
        data_test[target_col],
        categories=data[f'{task}_label'].cat.categories)

    # Values that are missing, or that never occur in training, become NaN
    # (code -1) and cannot be scored against the model output.
    if data_test[f'{task}_label'].isna().any():
        raise ValueError(
            f"Column '{target_col}' of val_all.tsv contains missing labels "
            f"or labels that do not occur in the training data")

    # Replace the raw label with its integer class index
    data_test[target_col] = data_test[f'{task}_label'].cat.codes

In [31]:
# One-hot encode the validation targets. The width is pinned to the number of
# training classes (the same count the model's output layers are built from),
# so the matrices still line up with the model output when the highest class
# index does not occur in the validation split.
test_y_stance = to_categorical(
    data_test[f'{CLASS_NAME}_stance'],
    num_classes=len(data['stance_label'].cat.categories))
test_y_argument = to_categorical(
    data_test[f'{CLASS_NAME}_argument'],
    num_classes=len(data['argument_label'].cat.categories))

# Tokenize the validation texts (takes some time). Every sequence is truncated
# or padded to exactly 256 tokens, the fixed length of the model input.
test_x = tokenizer(
    text=data_test['text'].to_list(),
    add_special_tokens=True,
    max_length=256,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [22]:
def _logit_head(head_name, n_classes, features):
    """Attach a Dense layer with `n_classes` units and no activation to `features`."""
    head = Dense(
        units=n_classes,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name=head_name)
    return head(features)


# The only model input: token ids with a fixed length of 256
# NOTE(review): no attention_mask input is defined, so none is passed to BERT;
# padding positions are presumably treated like real tokens — confirm.
input_ids = Input(shape=(256,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}

# Run the BERT main layer and keep element [1] of its output
# (the pooler output according to the model summary)
bert_model = bert(inputs)[1]

# The dropout layer is called with training=False
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# One classification head per task, sized by the number of label categories
stance = _logit_head('stance', len(data['stance_label'].cat.categories), pooled_output)
argument = _logit_head('argument', len(data['argument_label'].cat.categories), pooled_output)
outputs = {'stance': stance, 'argument': argument}

# Assemble the two-headed model
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')

# Print the layer overview
model.summary()

Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  177853440   ['input_ids[0][0]']              
                                thPooling(last_hidd                                               
                                en_state=(None, 256                                               
                                , 768),                                                           
                                 pooler_output=(Non                                               
                                e, 768),                                 

In [23]:
# Adam optimizer with learning-rate decay and gradient-norm clipping
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Both heads emit raw scores (their Dense layers have no activation),
# hence from_logits=True. Keys match the model's output names.
loss = {'stance': CategoricalCrossentropy(from_logits = True), 'argument': CategoricalCrossentropy(from_logits = True)}
metric = {'stance': CategoricalAccuracy('accuracy'), 'argument': CategoricalAccuracy('accuracy')}

# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = metric)

# One-hot encode the training targets
y_stance = to_categorical(data[f'{CLASS_NAME}_stance'])
y_argument = to_categorical(data[f'{CLASS_NAME}_argument'])

# Tokenize the training texts (takes some time).
# padding='max_length' pads every sequence to exactly 256 tokens, which is the
# fixed length declared by the model's Input(shape=(256,)) and also what the
# validation tokenization uses. The former padding=True only padded to the
# longest text, producing a narrower tensor whenever no training text
# reaches 256 tokens.
x = tokenizer(
    text=data['text'].to_list(),
    add_special_tokens=True,
    max_length=256,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

# Fit the model.
# Only input_ids is fed: the model defines no attention_mask input, so
# x['attention_mask'] is computed above but unused.
# NOTE(review): validation runs on just the first 8 validation rows, so the
# reported val metrics are very noisy — confirm this is intended.
history = model.fit(
    x={'input_ids': x['input_ids']},
    y={'stance': y_stance, 'argument': y_argument},
    validation_data=({'input_ids': test_x['input_ids'][:8]}, {'stance': test_y_stance[:8], 'argument': test_y_argument[:8]}),
    batch_size=8,
    epochs=20)

Epoch 1/20
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/

In [56]:
# Score the tokenized validation texts; the cells below index the result
# by head name ('stance' / 'argument')
val_results = model.predict({'input_ids': test_x['input_ids']})

In [57]:
# Predicted class index per row = position of the largest score of each head
for head in ('stance', 'argument'):
    data_test[f'{CLASS_NAME}_{head}_predict'] = val_results[head].argmax(axis=-1)

In [58]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the stance head on the validation set
stance_true = data_test[f'{CLASS_NAME}_stance'].values.tolist()
stance_pred = val_results['stance'].argmax(axis=-1)
print(classification_report(stance_true, stance_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       977
           1       0.00      0.00      0.00        39
           2       0.65      0.98      0.78       290
           3       0.00      0.00      0.00       125

    accuracy                           0.88      1431
   macro avg       0.41      0.50      0.44      1431
weighted avg       0.80      0.88      0.84      1431



In [59]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the argument head on the validation set
argument_true = data_test[f'{CLASS_NAME}_argument'].values.tolist()
argument_pred = val_results['argument'].argmax(axis=-1)
print(classification_report(argument_true, argument_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       977
           1       0.00      0.00      0.00        39
           2       0.81      0.96      0.88       369
           3       0.00      0.00      0.00        46

    accuracy                           0.93      1431
   macro avg       0.45      0.49      0.47      1431
weighted avg       0.88      0.93      0.90      1431



In [60]:
# Shift the predicted stance class indices down by one for the export.
# The value is recomputed from the raw model output instead of being
# decremented in place, so re-running this cell cannot shift it twice.
# NOTE(review): assumes the original labels start at -1 (class index k
# corresponds to label k - 1) — confirm against the data.
data_test[f'{CLASS_NAME}_stance_predict'] = val_results['stance'].argmax(axis=-1) - 1

In [61]:
# Shift the predicted argument class indices down by one for the export.
# The value is recomputed from the raw model output instead of being
# decremented in place, so re-running this cell cannot shift it twice.
# NOTE(review): assumes the original labels start at -1 (class index k
# corresponds to label k - 1) — confirm against the data.
data_test[f'{CLASS_NAME}_argument_predict'] = val_results['argument'].argmax(axis=-1) - 1

In [62]:
# Write the text plus both predicted label columns to a TSV file (no index column)
export_columns = ['text', f'{CLASS_NAME}_stance_predict', f'{CLASS_NAME}_argument_predict']
data_test[export_columns].to_csv(f"val_predict_{CLASS_NAME}.tsv", sep='\t', index=None)

In [64]:
# Reload the exported predictions and drop the "_predict" suffix from the
# label columns. The column names are derived from CLASS_NAME, like the file
# name: with hard-coded "quarantine_*" keys the rename silently did nothing
# for any other topic.
df1 = pd.read_csv(f"val_predict_{CLASS_NAME}.tsv", sep='\t')
df1 = df1.rename(columns={
    f"{CLASS_NAME}_stance_predict": f"{CLASS_NAME}_stance",
    f"{CLASS_NAME}_argument_predict": f"{CLASS_NAME}_argument",
})

### Code for concatenation of all files with results for masks, vaccines and quarantine:

In [65]:
# Disabled cell: loads the prediction files of the other two topics and renames
# their *_predict columns, mirroring what is done for df1 above.
# NOTE(review): the two .tsv files are presumably written by running this
# notebook with CLASS_NAME set to "masks" / "vaccines" — confirm before enabling.
# df2 = pd.read_csv("val_predict_masks.tsv", sep='\t')
# df2.rename(columns={"masks_stance_predict": "masks_stance", 
#                        "masks_argument_predict": "masks_argument"}, inplace=True)

# df3 = pd.read_csv("val_predict_vaccines.tsv", sep='\t')
# df3.rename(columns={"vaccines_stance_predict": "vaccines_stance", 
#                        "vaccines_argument_predict": "vaccines_argument",}, inplace=True)

In [66]:
# Disabled cell: joins the three per-topic frames on the "text" column and
# writes the combined predictions to val_predict_concat.tsv.
# NOTE(review): merging on "text" assumes each text occurs once per file;
# duplicated texts would multiply rows — confirm before enabling.
# result = pd.merge(df1, df2, on="text")
# result = pd.merge(result, df3, on="text")
# result.to_csv("val_predict_concat.tsv", sep='\t', index=None)

In [34]:
# !zip val_predict_concat.zip val_predict_concat.tsv

updating: val_predict_concat.tsv (deflated 73%)
