In [1]:
import os
import shutil
import torch
import numpy as np
import pandas as pd
import evaluate
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer


tf.get_logger().setLevel('ERROR')

In [2]:
torch.cuda.is_available()

True

In [3]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

# Move model to GPU if available
device = torch.device("cuda")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
metric = evaluate.load("accuracy")

In [5]:
def tokenization(data):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    result = tokenizer(str(data["Text"]),truncation=True,   
                       max_length=512, return_overflowing_tokens=True)

    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in data.items():
        result[key] = [values[i] for i in sample_map]
    return result
def pad_attention_mask(dataset):
    for i in range(len(dataset)):
        attention_mask = dataset[i]['attention_mask']
        if len(attention_mask) < 512:
            # Pad with zeros to reach length 512
            padding_length = 512 - len(attention_mask)
            dataset[i]['attention_mask'] = attention_mask + [0] * padding_length
            # Also pad input_ids accordingly
            dataset[i]['input_ids'] = dataset[i]['input_ids'] + [0] * padding_length
    return dataset

def process_data(dataset):
    processed_data = []
    for item in dataset:
        # Assuming your dataset items are dictionaries like {'train': {'attention_mask': [...], ...}}
        flat_item = item['train']
        processed_data.append(flat_item)
    return pad_attention_mask(processed_data)


In [6]:
# temp = pd.read_csv('../data/csv/processed_toxic_classification_dataset_full.csv')
# temp = temp.dropna()
# temp.to_csv('../data/csv/processed_toxic_classification_dataset_full_two.csv', index=False)

In [7]:
data = load_dataset("csv", data_files="../data/csv/processed_toxic_classification_dataset_full.csv")

In [8]:
data['train']

Dataset({
    features: ['Text', 'Toxic'],
    num_rows: 31909
})

In [9]:
data = data.map(tokenization,batched=True)

data = data.remove_columns(["Text"])
data = data.rename_column("Toxic", "labels")

In [10]:
training_args = TrainingArguments(
    output_dir="../data/models/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    remove_unused_columns=False,  # Prevent removal of dataset columns
)



In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
pandas_data = pd.DataFrame(data)
pandas_data

Unnamed: 0,train
0,"{'labels': 0, 'input_ids': [101, 1031, 1005, 2..."
1,"{'labels': 0, 'input_ids': [101, 3087, 3327, 2..."
2,"{'labels': 0, 'input_ids': [101, 5432, 9268, 6..."
3,"{'labels': 0, 'input_ids': [101, 2393, 3246, 2..."
4,"{'labels': 0, 'input_ids': [101, 3436, 2147, 1..."
...,...
3050,"{'labels': 0, 'input_ids': [101, 1010, 1005, 7..."
3051,"{'labels': 0, 'input_ids': [101, 10992, 3115, ..."
3052,"{'labels': 0, 'input_ids': [101, 21759, 9148, ..."
3053,"{'labels': 0, 'input_ids': [101, 13642, 2276, ..."


In [13]:
train_dataset, val_dataset = train_test_split(pandas_data, test_size=0.2, random_state=42)
train_dataset

Unnamed: 0,train
962,"{'labels': 0, 'input_ids': [101, 4790, 8385, 2..."
1978,"{'labels': 0, 'input_ids': [101, 6672, 4487, 5..."
351,"{'labels': 0, 'input_ids': [101, 2071, 5993, 7..."
1037,"{'labels': 0, 'input_ids': [101, 1041, 12731, ..."
1497,"{'labels': 0, 'input_ids': [101, 2063, 2288, 5..."
...,...
1638,"{'labels': 0, 'input_ids': [101, 11912, 5929, ..."
1095,"{'labels': 0, 'input_ids': [101, 2451, 2015, 1..."
1130,"{'labels': 0, 'input_ids': [101, 22851, 11246,..."
1294,"{'labels': 0, 'input_ids': [101, 17470, 1038, ..."


In [14]:
train_dataset = Dataset.from_pandas(pd.DataFrame(train_dataset))
val_dataset = Dataset.from_pandas(pd.DataFrame(val_dataset))

In [15]:
train_dataset = process_data(train_dataset.remove_columns(["__index_level_0__"]))
val_dataset = process_data(val_dataset.remove_columns(["__index_level_0__"]))

In [16]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'labels': torch.stack([torch.tensor(item['labels']) for item in batch]) if 'labels' in batch[0] else None
    }

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn
)


In [18]:
data

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3055
    })
})

In [None]:
trainer.train()

In [None]:
eval_results = trainer.evaluate()
train_results = trainer.evaluate(train_dataset)
accuracy = eval_results.get("eval_accuracy", None)
print(f"Evaluation results: {eval_results}")
print(f"Training Evaluation results: {train_results}")
print(f"Validation Accuracy: {accuracy}")

In [None]:
def predict(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    
    # Run the model for prediction
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Get the predicted class
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

In [None]:
texts = [
    "This is an amazing product!",
    "I hate this so much. It's the worst experience ever!",
    "Let's meet up for coffee tomorrow.",
    "You are absolutely incompetent."
]

for text in texts:
    prediction = predict(text)
    # label = "Toxic" if prediction == 1 else "Non-Toxic"
    # print(f"Text: {text}\nPrediction: {label}\n")
    print(prediction)