In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline, Trainer, TrainingArguments, AutoModelForSequenceClassification
import evaluate 
import numpy as np
import torch

dataset = load_dataset("csv", data_files="ExtractedTweets.csv", split="train")
dataset = dataset.train_test_split(test_size=0.2)
print(dataset)
#print(dataset["train"][100])

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)


DatasetDict({
    train: Dataset({
        features: ['Party', 'Handle', 'text'],
        num_rows: 69168
    })
    test: Dataset({
        features: ['Party', 'Handle', 'text'],
        num_rows: 17292
    })
})


Map:   0%|          | 0/69168 [00:00<?, ? examples/s]

Map:   0%|          | 0/17292 [00:00<?, ? examples/s]

In [2]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [7]:
id2label = {0: "Democrat", 1: "Republican"}
label2id = {"Democrat": 0, "Republican": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-cased", num_labels=2, id2label=id2label, label2id=label2id
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/25938 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [None]:
# Evaluate the model on the test set
eval_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(eval_results)
