In [1]:
!pip install transformers[torch] datasets scikit-learn
!pip install accelerate -U
!pip install --upgrade transformers
!pip install evaluate

import numpy as np
import pandas as pd
import torch
import transformers
import re
import evaluate

from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [2]:
test_ds = pd.read_csv('/content/test.csv')
train_ds = pd.read_csv('/content/train.csv')

print(train_ds.head())
print(test_ds.head())

   ID                                              TITLE  \
0   1  Detecting the impact of public transit on the ...   
1   2  Is Proxima Centauri b habitable? -- A study of...   
2   3  Verifying Security Protocols using Dynamic Str...   
3   4            Scenic: Language-Based Scene Generation   
4   5  Near-Optimal Discrete Optimization for Experim...   

                                            ABSTRACT  label  
0    In many developing countries, public transit...      0  
1    We address the important question of whether...      1  
2    Current formal approaches have been successf...      0  
3    Synthetic data has proved increasingly usefu...      0  
4    The experimental design problem concerns the...      0  
   ID                                              TITLE  \
0   1  An analytic resolution to the competition betw...   
1   2  Attention-based Natural Language Person Retrieval   
2   3  Asymptotics of multivariate contingency tables...   
3   4  Discriminant of the 

In [3]:
print(train_ds.columns)
print(test_ds.columns)

Index(['ID', 'TITLE', 'ABSTRACT', 'label'], dtype='object')
Index(['ID', 'TITLE', 'ABSTRACT'], dtype='object')


In [4]:
train_ds['text'] = "[CLS] " + train_ds['TITLE'] + " [SEP] " + train_ds['ABSTRACT'] + " [SEP]"
test_ds['text'] = "[CLS] " + test_ds['TITLE'] + " [SEP] " + test_ds['ABSTRACT'] + " [SEP]"

# Drop the original columns if no longer needed
'''
train_ds = train_ds.drop(columns=['TITLE', 'ABSTRACT'])
test_ds = test_ds.drop(columns=['TITLE', 'ABSTRACT'])
'''

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_ds['label'] = label_encoder.fit_transform(train_ds['label'])
#test_ds['label'] = label_encoder.transform(test_ds['label'])

train_dataset = Dataset.from_pandas(train_ds)
test_dataset = Dataset.from_pandas(test_ds)

In [5]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=384)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


num_labels = 4
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Map:   0%|          | 0/15472 [00:00<?, ? examples/s]

Map:   0%|          | 0/4844 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "precision": precision["precision"],
        "recall": recall["recall"],
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [10]:

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.5742,No log
2,0.3742,No log
3,0.2706,No log


TrainOutput(global_step=2901, training_loss=0.3901057277864853, metrics={'train_runtime': 1520.2132, 'train_samples_per_second': 30.533, 'train_steps_per_second': 1.908, 'total_flos': 9159586536701952.0, 'train_loss': 0.3901057277864853, 'epoch': 3.0})

In [11]:
# Evaluate the model
evaluation_results = trainer.evaluate()

# Print the evaluation results
print(evaluation_results)

# Predictions
predictions = trainer.predict(tokenized_test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

{'eval_runtime': 48.9452, 'eval_samples_per_second': 98.968, 'eval_steps_per_second': 6.191, 'epoch': 3.0}


In [14]:
output_ds = pd.DataFrame({
    'ID': test_ds['ID'],
    'label': preds
})

output_ds.to_csv('predictions.csv', index=False)