# Two-Stage COVID-19 Misinformation Classifier


In [None]:
# Install package
!pip install transformers gdown --quiet

[K     |████████████████████████████████| 4.4 MB 35.5 MB/s 
[K     |████████████████████████████████| 596 kB 62.5 MB/s 
[K     |████████████████████████████████| 101 kB 11.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 59.9 MB/s 
[?25h

In [None]:
# Download trained models
!gdown 1n0R-lv3RHOS64IdAyMAeFUrUmW_1VcI_
!unzip two-stage-misinformation-classifier-models.zip

Downloading...
From: https://drive.google.com/uc?id=1n0R-lv3RHOS64IdAyMAeFUrUmW_1VcI_
To: /content/two-stage-misinformation-classifier-models.zip
100% 3.58G/3.58G [00:16<00:00, 211MB/s]
Archive:  two-stage-misinformation-classifier-models.zip
   creating: content/models/
   creating: content/models/bi_lstm_misinfo.model/
  inflating: content/models/bi_lstm_misinfo.model/keras_metadata.pb  
   creating: content/models/bi_lstm_misinfo.model/variables/
  inflating: content/models/bi_lstm_misinfo.model/variables/variables.data-00000-of-00001  
  inflating: content/models/bi_lstm_misinfo.model/variables/variables.index  
  inflating: content/models/bi_lstm_misinfo.model/saved_model.pb  
  inflating: content/models/checkpoint  
   creating: content/models/.ipynb_checkpoints/
  inflating: content/models/weights_relevan_bert.index  
  inflating: content/models/weights_relevan_bert.data-00000-of-00001  


In [None]:
# Dependencies

import numpy as np
import pandas as pd

# TensorFlow
import tensorflow as tf

# BERT
from transformers import BertTokenizer, TFBertForSequenceClassification, TFAutoModel, InputExample, InputFeatures

In [None]:
class TwoStageMisinformationClassifier():
  """Two-stage tweet classifier: relevance first, then misinformation.

  Stage 1 runs a BERT sequence classifier (weights loaded from
  ``<model_dir>/weights_relevan_bert``) and takes the argmax of its logits;
  class 0 is reported as ``'irrelevant'``.
  Stage 2 feeds the ``last_hidden_state`` of the plain pretrained BERT model to
  the saved Keras model ``<model_dir>/bi_lstm_misinfo.model`` and thresholds its
  output at 0.5 (``'misinformation'`` when >= 0.5, otherwise ``'true'``).
  """

  # Default number of examples per batch fed to the models; kept at class level
  # so it is available even on instances created without calling __init__.
  batch_size = 32

  def __init__(self, model_name="indobenchmark/indobert-large-p1",
               model_dir='content/models', max_length=80, batch_size=32):
    """Load both stages and the tokenizer.

    Args:
      model_name: Hugging Face checkpoint used for the relevance classifier,
        the raw BERT encoder and the tokenizer.
      model_dir: Directory containing ``weights_relevan_bert`` and
        ``bi_lstm_misinfo.model``.
      max_length: Token length every tweet is padded/truncated to.
      batch_size: Number of tweets per prediction batch.
    """
    self.relevance_classifier = TFBertForSequenceClassification.from_pretrained(model_name)
    # The classification head is not part of the pretrained checkpoint; the
    # fine-tuned weights are restored from the downloaded files.
    self.relevance_classifier.load_weights(f'{model_dir}/weights_relevan_bert')

    self.bert_model = TFAutoModel.from_pretrained(model_name)
    self.tokenizer = BertTokenizer.from_pretrained(model_name)

    self.misinfo_classifier = tf.keras.models.load_model(f'{model_dir}/bi_lstm_misinfo.model')

    self.max_length = max_length
    self.batch_size = batch_size

  def convert_data_to_examples(self, doc_list):
    """Wrap each text of ``doc_list`` in an ``InputExample``.

    Args:
      doc_list: Iterable of texts.

    Returns:
      A ``pd.Series`` of ``InputExample`` objects, one per text, each with a
      placeholder label of 0 (labels are not used for inference).
    """
    inputExamples = [InputExample(guid=None,  # Globally unique ID for bookkeeping, unused in this case
                                  text_a=text,
                                  text_b=None,
                                  label=0) for text in doc_list]

    return pd.Series(inputExamples)

  def convert_examples_to_tf_dataset(self, examples, tokenizer, max_length):
    """Tokenize ``examples`` and expose them as an unbatched ``tf.data.Dataset``.

    Args:
      examples: Iterable of ``InputExample`` objects; ``text_a`` and ``label``
        are read from each.
      tokenizer: Tokenizer providing ``encode_plus``.
      max_length: Length every sequence is padded and truncated to.

    Returns:
      A dataset yielding ``(features, label)`` pairs, where ``features`` is a
      dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    features = []  # -> will hold InputFeatures to be converted later

    for e in examples:
      input_dict = tokenizer.encode_plus(
          e.text_a,
          add_special_tokens=True,
          max_length=max_length,
          return_token_type_ids=True,
          return_attention_mask=True,
          # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
          padding='max_length',
          truncation=True
      )

      input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
          input_dict["token_type_ids"], input_dict['attention_mask'])

      features.append(
          InputFeatures(
              input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
          )
      )

    def gen():
      for f in features:
        yield (
            {
                "input_ids": f.input_ids,
                "attention_mask": f.attention_mask,
                "token_type_ids": f.token_type_ids,
            },
            f.label,
        )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

  def predict(self, tweets):
    """Classify tweets as ``'irrelevant'``, ``'true'`` or ``'misinformation'``.

    Args:
      tweets: Iterable of texts. A single ``str`` is treated as one tweet
        rather than being iterated character by character.

    Returns:
      A list with one label per tweet, in input order. An empty input yields
      an empty list without running any model.
    """
    if isinstance(tweets, str):
      tweets = [tweets]
    else:
      # Materialize once so emptiness can be checked for any iterable
      # (generators, pandas Series, numpy arrays, ...).
      tweets = list(tweets)

    # Keras `predict` cannot handle a dataset that yields no batches.
    if not tweets:
      return []

    input_examples = self.convert_data_to_examples(tweets)
    input_data = self.convert_examples_to_tf_dataset(list(input_examples), self.tokenizer, self.max_length)
    input_data = input_data.batch(self.batch_size)

    # Stage 1: relevance, argmax over the classifier logits of each tweet.
    relevance_preds = self.relevance_classifier.predict(input_data)
    relevance_preds_encoded = [np.argmax(i) for i in relevance_preds.get('logits')]

    # Stage 2: misinformation score computed from the raw BERT hidden states.
    # Assumes the saved model emits a single score per tweet -- TODO confirm.
    bert_raw_output = self.bert_model.predict(input_data)
    misinfo_preds = self.misinfo_classifier.predict(bert_raw_output.get('last_hidden_state'))
    misinfo_preds_encoded = [1 if i >= 0.5 else 0 for i in misinfo_preds]

    final_output = []
    for is_relevant, is_misinfo in zip(relevance_preds_encoded, misinfo_preds_encoded):
      if not is_relevant:
        # The misinformation verdict is only reported for relevant tweets.
        final_output.append('irrelevant')
      elif is_misinfo:
        final_output.append('misinformation')
      else:
        final_output.append('true')

    return final_output

In [None]:
# Build the classifier once; the constructor loads the pretrained BERT models,
# the tokenizer and the saved weights from content/models.
tsmc = TwoStageMisinformationClassifier()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-large-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some layers from the model checkpoint at indobenchmark/indobert-large-p1 were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertMo

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

In [None]:
# Sample Indonesian tweets for a quick end-to-end check of both stages.
tweets = [
    'presiden menggunakan vaksin sinovac',       # "the president uses the Sinovac vaccine"
    'pemerintah perlu menurunkan harga minyak',  # "the government needs to lower oil prices"
    'obat covid mengandung microchip',           # "covid medicine contains a microchip"
]

tsmc.predict(tweets)



['true', 'irrelevant', 'misinformation']