# Pipline for generically classifying COVID-19 tweets

In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from transformers import pipeline
import tensorflow as tf

2022-03-22 16:43:35.952241: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-22 16:43:35.952372: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
#List of Categories (can be changed)
labels = ['positive',
        'negative',
        'neutral']

#Some form of training data that will be used to finetune model
df = pd.read_csv('FLGovTweetsLIWC.csv')

## Taking a look at our data and formatting it (if needed)

In [3]:
df.head()
#For the zero-shot classification all we need is the text for now...

Unnamed: 0.1,Unnamed: 0,time,text,counts,percentages
0,0,2020-05-14 20:49:43+00:00,"emergency officials, first responders, hospita...","{'Social': 3, 'Affect': 1, 'Power': 3, 'Affili...","{'Social': 0.23076923076923078, 'Affect': 0.07..."
1,1,2020-05-14 21:05:09+00:00,public health is everyone’s responsibility. ma...,"{'Social': 4, 'Affect': 2, 'Power': 0, 'Affili...","{'Social': 0.4444444444444444, 'Affect': 0.222..."
2,2,2020-05-14 21:08:57+00:00,"taryn kryzda talks about covid-19, hurricane p...","{'Social': 4, 'Affect': 2, 'Power': 1, 'Affili...","{'Social': 0.4444444444444444, 'Affect': 0.222..."
3,3,2020-05-14 22:00:44+00:00,it’s vital to practice good handwashing practi...,"{'Social': 5, 'Affect': 3, 'Power': 0, 'Affili...","{'Social': 0.3125, 'Affect': 0.1875, 'Power': ..."
4,4,2020-05-14 22:31:41+00:00,"st, lucie county #covid19 update for may 14, 2...","{'Social': 0, 'Affect': 0, 'Power': 0, 'Affili...",{}


## Our first pipeline will be for zeroshot classification

In [4]:
classifier = pipeline('zero-shot-classification')

No model was supplied, defaulted to roberta-large-mnli (https://huggingface.co/roberta-large-mnli)
2022-03-22 16:44:35.514719: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-22 16:44:35.514809: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-22 16:44:35.514850: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (d48784423f1c): /proc/driver/nvidia/version does not exist
2022-03-22 16:44:35.515227: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All

In [5]:
zero_shot_classified = [classifier(i, labels) for i in df['text'][:5]]

In [6]:
zero_shot_classified

[{'sequence': 'emergency officials, first responders, hospitals, and public health officials have partnered to step up efforts to coordinate with nursing homes and long-term care facilities as covid-19 continues to claim the lives of older residents. details: https://t.co/ljgh125kwz https://t.co/zeh5i160t6',
  'labels': ['negative', 'neutral', 'positive'],
  'scores': [0.9177665114402771, 0.06806164979934692, 0.014171851798892021]},
 {'sequence': 'public health is everyone’s responsibility. make sure to: -wash your hands often with soap and water for at least 20 seconds 🧼 -clean and disinfect frequently touched objects and surfaces frequently🧽 -stay home when you are sick🏠 https://t.co/pbrjxtfgt8 https://t.co/3npapf1kme',
  'labels': ['positive', 'neutral', 'negative'],
  'scores': [0.36294662952423096, 0.36210837960243225, 0.2749450206756592]},
 {'sequence': 'taryn kryzda talks about covid-19, hurricane preparedness, "our emergency management agency is ready and we want you to be read

* An obvious downside to zero-shot is it's not as accurate and it takes forever!! We will have to do more testing with this... 

## Basic Pipeline to fine-tune and use huggingface models

In [7]:
#Importing the Data 
#training_data = pd.read_csv('file_name.csv')

Preprocessing the Data

In [11]:
#This code was adapted from the huggingface docs https://huggingface.co/docs/transformers/training#prepare-a-dataset
from transformers import AutoTokenizer, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-covid19-base-uncased", normalization=True)
model = TFAutoModel.from_pretrained("vinai/bertweet-covid19-base-uncased", num_labels=7)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


#tokenized_datasets = dataset.map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some layers from the model checkpoint at vinai/bertweet-covid19-base-uncased were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-covid19-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="twtter_trainer")

In [13]:
from datasets import load_metric

metric = load_metric("accuracy")

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [16]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=X_train,
    eval_dataset=Y_train,
    compute_metrics=compute_metrics,
)

NameError: name 'X_train' is not defined

In [None]:
trainer.train()