# **Pre-BERT Classification**

In [None]:
!pip install pandas numpy scikit-learn transformers torch



In [None]:
import pandas as pd

file_path = '/content/oxygen_related_COVID_tweets.xlsx'
data = pd.read_excel(file_path)

print(data.head())
print(data.info())


   Unnamed: 0                                               text  label
0           0  @jackmathers4 @JaniceDean How about the vaccin...    0.0
1           1  @fordnation Imagine the most populous province...   -1.0
2           2  @arunbajpairajan Ye bhi nhi pta hota ki iopec ...   -1.0
3           3  @CalapooiaRiver @patteepoo @Ilovezaatar @DC_Dr...    0.0
4           4      @NotHoodlum Or in a wheelchair....with Oxygen    0.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79799 entries, 0 to 79798
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  79799 non-null  int64  
 1   text        79798 non-null  object 
 2   label       315 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.8+ MB
None


In [None]:
# Separate labeled and unlabeled data
labeled_data = data[data['label'].notna()]
unlabeled_data = data[data['label'].isna()]

print(f"Labeled data: {labeled_data.shape}")
print(f"Unlabeled data: {unlabeled_data.shape}")

Labeled data: (315, 3)
Unlabeled data: (79484, 3)


In [None]:
import re

def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower().strip()  # Convert to lowercase
    return text

labeled_data['clean_text'] = labeled_data['text'].apply(clean_text)
labeled_data['clean_text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_data['clean_text'] = labeled_data['text'].apply(clean_text)


Unnamed: 0,clean_text
0,how about the vaccines ventilators at least pr...
1,imagine the most populous province wasnt all c...
2,ye bhi nhi pta hota ki iopec countries ka loan...
3,yeahno oxygen can get through your mask right ...
4,or in a wheelchairwith oxygen
...,...
413,name sanjay gupta number 8303201364 requirem...
414,sir a colleagues parents 79 amp 74 are both im...
415,sir jab logo ko oxygen chaiye ventilator chahi...
416,many in this covid crisis are left with only o...


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
X_train, X_val, y_train, y_val = train_test_split(
    labeled_data['clean_text'], labeled_data['label'], test_size=0.33, random_state=42
)

In [None]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [None]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_val_tfidf)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

        -1.0       0.89      0.44      0.59        36
         0.0       0.57      0.96      0.71        45
         1.0       0.90      0.39      0.55        23

    accuracy                           0.65       104
   macro avg       0.78      0.60      0.62       104
weighted avg       0.75      0.65      0.63       104



In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

y_pred_svm = svm_model.predict(X_val_tfidf)
print(classification_report(y_val, y_pred_svm))


              precision    recall  f1-score   support

        -1.0       0.88      0.61      0.72        36
         0.0       0.63      0.96      0.76        45
         1.0       0.91      0.43      0.59        23

    accuracy                           0.72       104
   macro avg       0.81      0.67      0.69       104
weighted avg       0.78      0.72      0.71       104



# **BERT Fine tuning**

In [None]:
!pip install transformers sentence-transformers torch pandas



In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset

In [None]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [None]:
data_path = '/content/oxygen_related_COVID_tweets.xlsx'
df = pd.read_excel(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,@jackmathers4 @JaniceDean How about the vaccin...,0.0
1,1,@fordnation Imagine the most populous province...,-1.0
2,2,@arunbajpairajan Ye bhi nhi pta hota ki iopec ...,-1.0
3,3,@CalapooiaRiver @patteepoo @Ilovezaatar @DC_Dr...,0.0
4,4,@NotHoodlum Or in a wheelchair....with Oxygen,0.0


In [None]:
df_labeled = df[df['label'].notnull()].reset_index(drop=True)
df_labeled['label'] = df_labeled['label'].astype(int)

In [None]:
df_labeled['label'] = df_labeled['label'].map({-1: 0, 0: 1, 1: 2})
df_labeled.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,@jackmathers4 @JaniceDean How about the vaccin...,1
1,1,@fordnation Imagine the most populous province...,0
2,2,@arunbajpairajan Ye bhi nhi pta hota ki iopec ...,0
3,3,@CalapooiaRiver @patteepoo @Ilovezaatar @DC_Dr...,1
4,4,@NotHoodlum Or in a wheelchair....with Oxygen,1


In [None]:
import re

def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.lower().strip()  # Convert to lowercase
    return text

df_labeled['clean_text'] = df_labeled['text'].apply(clean_text)
df_labeled['clean_text']

Unnamed: 0,clean_text
0,how about the vaccines ventilators at least pr...
1,imagine the most populous province wasnt all c...
2,ye bhi nhi pta hota ki iopec countries ka loan...
3,yeahno oxygen can get through your mask right ...
4,or in a wheelchairwith oxygen
...,...
310,name sanjay gupta number 8303201364 requirem...
311,sir a colleagues parents 79 amp 74 are both im...
312,sir jab logo ko oxygen chaiye ventilator chahi...
313,many in this covid crisis are left with only o...


In [None]:
train_df, val_df = train_test_split(df_labeled, test_size=0.2, random_state=42)

In [None]:
# roberta-base
from transformers import RobertaForSequenceClassification, RobertaTokenizer
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name,
                                                         num_labels=3,
                                                         hidden_dropout_prob=0.2,
                                                        #  attention_probs_dropout_prob=0.2
                                                         )

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# model_name = 'distilbert-base-uncased-finetuned-sst-2-english';
# tokenizer = DistilBertTokenizer.from_pretrained(model_name)
# model = DistilBertForSequenceClassification.from_pretrained(model_name,
#                                                             num_labels=3,
#                                                             ignore_mismatched_sizes=True,
#                                                             output_attentions = False,
#                                                             output_hidden_states = False
#                                                             )

In [None]:
# model_name = 'bert-base-uncased'
# tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

In [None]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
class TweetDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        tweet = str(self.dataframe.iloc[index]['text'])
        label = self.dataframe.iloc[index]['label']
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [None]:
max_len = 128
train_dataset = TweetDataset(train_df, tokenizer, max_len)
val_dataset = TweetDataset(val_df, tokenizer, max_len)

In [None]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    max_grad_norm=1.0,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)



In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0474,1.083636,0.365079,0.195275,0.133283,0.365079
2,0.8838,0.850894,0.507937,0.47478,0.617989,0.507937
3,0.6828,0.821033,0.603175,0.599376,0.606668,0.603175
4,0.4303,0.811471,0.650794,0.654712,0.703377,0.650794
5,0.1538,1.023752,0.68254,0.676959,0.80833,0.68254
6,0.1616,1.124694,0.730159,0.725882,0.827008,0.730159
7,0.2385,1.150618,0.714286,0.711189,0.820106,0.714286
8,0.32,1.048684,0.746032,0.749649,0.805586,0.746032
9,0.0565,1.124242,0.793651,0.796355,0.851874,0.793651
10,0.109,1.129928,0.793651,0.796355,0.851874,0.793651


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=320, training_loss=0.43542139772325755, metrics={'train_runtime': 89.3482, 'train_samples_per_second': 28.204, 'train_steps_per_second': 3.581, 'total_flos': 165761453168640.0, 'train_loss': 0.43542139772325755, 'epoch': 10.0})

In [None]:
eval_results = trainer.evaluate()
print(f"Validation Accuracy: {eval_results['eval_accuracy']}")
print(f"F1 Score: {eval_results['eval_f1']}")

Validation Accuracy: 0.7936507936507936
F1 Score: 0.7963546859981494


In [None]:
eval_results

{'eval_loss': 1.1299282312393188,
 'eval_accuracy': 0.7936507936507936,
 'eval_f1': 0.7963546859981494,
 'eval_precision': 0.8518740829665199,
 'eval_recall': 0.7936507936507936,
 'eval_runtime': 0.453,
 'eval_samples_per_second': 139.082,
 'eval_steps_per_second': 17.661,
 'epoch': 10.0}