In [44]:
!pip install transformers
!pip install tweet-preprocessor
!pip install datasets
!pip install bert_score



In [45]:
from transformers import AutoTokenizer
import pandas as pd
import sys
sys.path.insert(0, "/content/drive/MyDrive/Colab Notebooks/")

In [46]:
from TextProcessor import *

input = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/twitter_parsed_dataset.csv")
input_text = input['Text'].to_numpy()
input_y = input['oh_label'].to_numpy()

tp = TextProcessor(remove_punctuation=False,
remove_stop_word=False,
min_word_size=1,
special_token_method=SpecialTokenMethod.REMOVE)

parsed_tweets = tp.transform(input_text[0:100])
print(parsed_tweets)


['-pron- read -pron- in context . no change in meaning . the history of islamic slavery .', 'now -pron- idiot claim that people who try to stop -pron- from become a terrorist make -pron- a terrorist . islamically brain dead .', 'call -pron- sexist , but when -pron- go to an auto place , -pron- would rather talk to a guy', 'wrong , isis follow the example of mohammed and the quran exactly .', 'no no no no no no', ": saudi preacher who ' rape and torture ' -pron- five -year - old daughter to death be release after", 'nooo not sexist but most woman be bad driver', "go to make some pancake ..... don't hve any strawberry .... but -pron- hve banana ......", ': how dare -pron- have feeling be a fantastic way to dehumanize someone .', ': there be something wrong when a girl win wayne rooney street striker', '-pron- be not on the autoblocker , and i do not run blockbot . block manually for now .', 'a good muslim be good despite -pron- bad religion , not because of -pron- .', '-pron- help everyo

In [47]:
import random
from sklearn.model_selection import train_test_split
trainset = [[x, input_y[i]]
                 for i, x in enumerate(parsed_tweets)]
random.shuffle(trainset)

X = pd.DataFrame(trainset).iloc[:, 0].to_numpy()
y = pd.DataFrame(trainset).iloc[:, 1].to_numpy()

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train_raw)

[': egypt threaten military strike on yemen houthis ( iran - back ) if -pron- threaten strategic bab al - manab shipping lane htt'
 'male follower , -pron- be time to collect -pron- trash .'
 'this fucking potato be blow my mind . duck fat . -pron- guy . figuratively dying of bliss .'
 '-pron- should be attack everyone that follow a religious cult of hated and murder like islam .'
 'woo can not wait to see what happen ! ! !'
 'if -pron- want to understand the lie of muslims live in peace with jews , read ibn warraq'
 'for real ? -pron- be not sexist , but man be superior .'
 'a lady be currently drive this bus ... funnily enough -pron- be hit the curb twice'
 'islam do not answer anything . -pron- pretend to answer with illogical and delusional superstition .'
 ': what i learn from'
 'i get that -pron- be probably go to defend -pron- friend regardless , but what -pron- do be shitty , and then -pron- follow -pron- w / more shit .'
 ': what do -pron- think of the two team face sudden dea

In [48]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=True)

In [50]:
import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from datasets import load_metric
import numpy as np

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, tokenizer):
        self.tokenizer = tokenizer
        self.encodings = [self.tokenize_tweet(tweet) for tweet in encodings]
        self.labels = labels

    def __getitem__(self, idx):
        text = self.encodings[idx]
        item = {key: torch.tensor(val) for key, val in text.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

    def tokenize_tweet(self, tweet_text):
        return self.tokenizer(tweet_text, truncation=True, padding=True)
    
class BertDataModule():
    def __init__(self,x_tr,y_tr,x_test,y_test,tokenizer, batch_size=16):
        super().__init__()
        self.tr_text = x_tr
        self.tr_label = y_tr
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size

        self.setup()

    def setup(self):
        self.train_dataset = Dataset(encodings=self.tr_text,  labels=self.tr_label,tokenizer=self.tokenizer)
        #self.val_dataset= Dataset(encodings=self.val_text, labels=self.val_label,tokenizer=self.tokenizer)
        self.test_dataset =Dataset(encodings=self.test_text, labels=self.test_label,tokenizer=self.tokenizer)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size= self.batch_size, shuffle = True , num_workers=4)

    def val_dataloader(self):
        return DataLoader (self.val_dataset,batch_size= 16)

    def test_dataloader(self):
        return DataLoader (self.test_dataset,batch_size= 16)

class SBMBertClassifier:
    def __init__(self, n_epochs=3, lr=2e-5, batch_size=16):
        config = AutoConfig.from_pretrained('bert-base-cased', num_labels=1)
        self.model = AutoModelForSequenceClassification.from_config(config)
        self.bertscore = load_metric("bertscore")
        self.lr = lr
        self.n_epochs = n_epochs
        self.batch_size = batch_size

    def compute_metrics(self, eval_pred):
        acc_metric = load_metric("accuracy")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        #self.bertscore.add_batch(predictions=predictions, references=labels)
        return acc_metric.compute(predictions=predictions, references=labels)

    def train(self, datamodule):
        args = TrainingArguments(
            "sbm",
            evaluation_strategy = "epoch",
            learning_rate=self.lr,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            num_train_epochs=self.n_epochs,
            weight_decay=0.01,
            load_best_model_at_end=True,
        )
        trainer = Trainer(
            self.model,
            args,
            train_dataset=datamodule.train_dataset,
            eval_dataset=datamodule.test_dataset,
            tokenizer=datamodule.tokenizer,
            compute_metrics=self.compute_metrics)
        trainer.train()

    def score():
        metric = load_metric("bertscore")
        for batch in dataset:
            inputs, references = batch
            predictions = model(inputs)
            metric.add_batch(predictions=predictions, references=references)
        score = metric.compute()
        print(score)

    def validate(self):
        self.trainer.test()

    def test(self):
        self.trainer.test()

    def predict(self, text):
        # TODO ??? No predict in pl.trainer
        return self.model(text)



In [51]:
datamodule = BertDataModule(X_train_raw, y_train, X_test_raw, y_test, tokenizer)
cls = SBMBertClassifier()
cls.train(datamodule)

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.150196,0.818182
2,No log,0.540801,0.818182
3,No log,0.383013,0.818182


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
