In [8]:
import torch
import jsonlines
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
import logging

def dummy_data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])
    return batch

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = np.mean(preds == p.label_ids.reshape(-1))
    
    precision, recall, fscore, _ = score(p.label_ids.reshape(-1), preds, average='binary')
    
    # Compute AUC
    if p.label_ids.shape[1] > 1:  # Check for multi-class classification
        auc_test = roc_auc_score(p.label_ids, p.predictions, multi_class='ovr')
    else:
        auc_test = roc_auc_score(p.label_ids, p.predictions[:, 1])
    
    return {
        "eval_loss": p.predictions.mean().item(),
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": fscore,
        "eval_auc": auc_test,
    }


def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')  # Modified logging format
    logger = logging.getLogger(__name__)
    
    logger.info('Loading Roberta tokenizer...')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    labels = []
    all_articles = []
    with jsonlines.open('GPT.jsonl', 'r') as input_articles:
        for article in input_articles:
            all_articles.append(article['text'])
            labels.append(article['label'])

    encoded_article = tokenizer.batch_encode_plus(
        all_articles,
        truncation=True,
        add_special_tokens=True,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_article['input_ids']
    attention_masks = encoded_article['attention_mask']
    
    labels = np.asarray(labels)
    labels = np.expand_dims(np.where((labels == 'machine'), 1, 0), 1)
    labels = torch.from_numpy(labels)

    dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)

    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )

    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir='./logs',
        evaluation_strategy="steps",
        do_predict=True,
        eval_steps=500,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=dummy_data_collector,
        compute_metrics=compute_metrics,
    )

    predictions = trainer.predict(dataset)
    
    # Extracting necessary values from PredictionOutput object
    loss = np.mean(predictions.predictions)
    acc = np.mean(np.argmax(predictions.predictions, axis=1) == predictions.label_ids.reshape(-1))
    
    precision, recall, fscore, _ = score(predictions.label_ids.reshape(-1), 
                                         np.argmax(predictions.predictions, axis=1), 
                                         average='binary')
    
    if predictions.label_ids.shape[1] > 1:
        auc_test = roc_auc_score(predictions.label_ids, predictions.predictions, multi_class='ovr')
    else:
        auc_test = roc_auc_score(predictions.label_ids, predictions.predictions[:, 1])
    
    eval_results = {
        "eval_loss": loss,
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": fscore,
        "eval_auc": auc_test,
    }
    
    # Print the results in the desired format
    print("Evaluation results:")
    for key, value in eval_results.items():
        print(f"{key}: {value}")

if __name__ == "__main__":
    main()




INFO:__main__:Loading Roberta tokenizer...
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluation results:
eval_loss: -0.03537864238023758
eval_accuracy: 0.4935
eval_precision: 0.4966512107161257
eval_recall: 0.964
eval_f1: 0.6555593335600136
eval_auc: 0.35198999999999997


In [5]:
import torch
import jsonlines
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaConfig, AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
import logging

def dummy_data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])
    return batch

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = np.mean(preds == p.label_ids.reshape(-1))
    
    precision, recall, fscore, _ = score(p.label_ids.reshape(-1), preds, average='binary', zero_division=0)
    
    # Check if both classes are present in labels
    unique_labels = np.unique(p.label_ids)
    if len(unique_labels) == 2:
        if p.label_ids.shape[1] > 1:  # Check for multi-class classification
            auc_test = roc_auc_score(p.label_ids, p.predictions, multi_class='ovr')
        else:
            auc_test = roc_auc_score(p.label_ids, p.predictions[:, 1])
    else:
        auc_test = 0.0  # Set AUC to 0 if only one class is present
    
    return {
        "eval_loss": p.predictions.mean().item(),
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": fscore,
        "eval_auc": auc_test,
    }

def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    logger = logging.getLogger(__name__)
    
    logger.info('Loading Roberta tokenizer...')
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    labels = []
    all_articles = []
    with jsonlines.open('GPT.jsonl', 'r') as input_articles:
        for article in input_articles:
            all_articles.append(article['text'])
            labels.append(article['label'])

    encoded_article = tokenizer.batch_encode_plus(
        all_articles,
        truncation=True,
        add_special_tokens=True,
        padding='longest',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_article['input_ids']
    attention_masks = encoded_article['attention_mask']
    
    labels = np.asarray(labels)
    labels = np.expand_dims(np.where((labels == 'machine'), 1, 0), 1)
    labels = torch.from_numpy(labels)

    # Print unique labels and counts
    unique_labels, counts = np.unique(labels.numpy(), return_counts=True)
    logger.info(f"Unique labels: {unique_labels}, Counts: {counts}")

    # Splitting dataset into training and evaluation sets
    train_size = int(0.8 * len(input_ids))
    eval_size = len(input_ids) - train_size

    train_dataset = torch.utils.data.TensorDataset(input_ids[:train_size], 
                                                   attention_masks[:train_size], 
                                                   labels[:train_size])

    eval_dataset = torch.utils.data.TensorDataset(input_ids[train_size:], 
                                                  attention_masks[train_size:], 
                                                  labels[train_size:])

    config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    model = RobertaForSequenceClassification(config)
    
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=0,
                                                num_training_steps=len(train_dataset) * 3)
    
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        logging_dir='./logs',
        evaluation_strategy="steps",
        do_predict=True,
        eval_steps=500,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=dummy_data_collector,
        compute_metrics=compute_metrics,
    )

    for epoch in range(training_args.num_train_epochs):
        logger.info(f"Starting epoch {epoch + 1}")
        trainer.train()
        
        # Debugging: Print some predictions during training
        predictions = trainer.predict(eval_dataset)
        logger.info(f"Sample predictions: {predictions.predictions[:5]}")
        
        eval_results = trainer.evaluate()
        
        print(f"Epoch {epoch + 1} evaluation results:")
        for key, value in eval_results.items():
            print(f"{key}: {value}")

        scheduler.step()

if __name__ == "__main__":
    main()


Loading Roberta tokenizer...
Unique labels: [0 1], Counts: [1000 1000]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Starting epoch 1


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
500,0.6877,0.945108,0.0,0.0,0.0,0.0,0.0


Sample predictions: [[-0.48676005  0.09830824]
 [-0.48674285  0.09826421]
 [-0.48674405  0.09827331]
 [-0.4867698   0.09832444]
 [-0.48673174  0.098283  ]]


Starting epoch 2


Epoch 1 evaluation results:
eval_loss: 1.027869462966919
eval_accuracy: 0.0
eval_precision: 0.0
eval_recall: 0.0
eval_f1: 0.0
eval_auc: 0.0
eval_runtime: 8.0449
eval_samples_per_second: 49.721
eval_steps_per_second: 6.215
epoch: 3.0


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
500,0.681,0.940912,0.0,0.0,0.0,0.0,0.0


Sample predictions: [[-0.4073565  -0.0441561 ]
 [-0.40677866 -0.04466706]
 [-0.40595853 -0.045229  ]
 [-0.40803176 -0.04358913]
 [-0.40634042 -0.04496273]]


Starting epoch 3


Epoch 2 evaluation results:
eval_loss: 0.8909348845481873
eval_accuracy: 0.0
eval_precision: 0.0
eval_recall: 0.0
eval_f1: 0.0
eval_auc: 0.0
eval_runtime: 8.0625
eval_samples_per_second: 49.612
eval_steps_per_second: 6.202
epoch: 3.0


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
500,0.0039,5.8e-05,1.0,0.0,0.0,0.0,0.0


Sample predictions: [[ 5.071643  -4.7022247]
 [ 5.071638  -4.702224 ]
 [ 5.071645  -4.7022233]
 [ 5.0716424 -4.7022243]
 [ 5.0716443 -4.702227 ]]


Epoch 3 evaluation results:
eval_loss: 5.6861204939195886e-05
eval_accuracy: 1.0
eval_precision: 0.0
eval_recall: 0.0
eval_f1: 0.0
eval_auc: 0.0
eval_runtime: 8.0546
eval_samples_per_second: 49.661
eval_steps_per_second: 6.208
epoch: 3.0
