In [1]:
import torch
import jsonlines
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
import numpy as np
import tempfile

if torch.cuda.is_available():
    device = torch.device("cuda:1")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


def dummy_data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])
    return batch


def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, fscore, support = score(p.label_ids.reshape(-1), preds)

    auc_test = roc_auc_score(p.label_ids.reshape(-1), p.predictions[:, 1])
    return {
        "acc": np.mean(preds == p.label_ids.reshape(-1)),
        "auc_test": auc_test,
        "precision_human": precision[0],
        "recall_human": recall[0],
        "fscore_human": fscore[0],
        "support_human": float(support[0]),
        "precision_machine": precision[1],
        "recall_machine": recall[1],
        "fscore_machine": fscore[1],
        "support_machine": float(support[1])
    }


def main():
    prediction_output = "./EVAL_processed_webtext_eval_tokens_topp_096_4k.jsonl"
    test_dir = "./GPT.jsonl"
    train_batch_size = 2
    val_batch_size = 32
    num_train_epochs = 1

    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

    labels = []
    all_articles_test = []

    with jsonlines.open(test_dir, 'r') as input_articles:
        for article in input_articles:
            all_articles_test.append(article['text'])
            labels.append(article['label'])

    labels = np.asarray(labels)
    labels = np.expand_dims(np.where((labels == 'machine'), 1, 0), 1)
    labels_test = torch.from_numpy(labels)

    input_ids_test = []
    attention_masks_test = []

    for article in all_articles_test:
        encoded_dict = tokenizer(article, return_tensors="pt", pad_to_max_length=True, truncation=True, max_length=512)
        input_ids_test.append(encoded_dict['input_ids'])
        attention_masks_test.append(encoded_dict['attention_mask'])

    input_ids_test = torch.cat(input_ids_test, dim=0)
    attention_masks_test = torch.cat(attention_masks_test, dim=0)

    test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)

    model = BertForSequenceClassification.from_pretrained(
        'bert-large-cased',
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False
    )

   # Create a temporary directory for output_dir
    with tempfile.TemporaryDirectory() as tmp_dir:
        training_args = TrainingArguments(
            output_dir=tmp_dir,  # Use temporary directory
            overwrite_output_dir=True,
            do_predict=True,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=val_batch_size,
            num_train_epochs=num_train_epochs
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=dummy_data_collector,
            compute_metrics=compute_metrics
        )

        preds_man = trainer.predict(test_dataset)
    preds_man = trainer.predict(test_dataset)
    
    # Print evaluation metrics
    metrics = compute_metrics(preds_man)
    for key, value in metrics.items():
        print(f"{key}: {value}")


if __name__ == "__main__":
    main()


There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-12GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


acc: 0.5235
auc_test: 0.6215189999999999
precision_human: 0.5137507314218841
recall_human: 0.878
fscore_human: 0.6482096714654855
support_human: 1000.0
precision_machine: 0.5807560137457045
recall_machine: 0.169
fscore_machine: 0.2618125484120836
support_machine: 1000.0


In [1]:
import os
import torch
import jsonlines
import random
import numpy as np
import logging
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_recall_fscore_support

# Check for GPU availability
if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    print('We will use the GPU:', torch.cuda.get_device_name(device))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Data collector for dummy batching
def dummy_data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])
    return batch

# Compute metrics for evaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='binary'
    )
    accuracy = np.mean(preds == p.label_ids)
    
    return {
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_fscore": fscore,
        "eval_accuracy": accuracy,
    }


def main():
    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

    # Read gpt.jsonl file
    all_articles = []
    with jsonlines.open('GPT.jsonl', 'r') as input_articles:
        for article in input_articles:
            all_articles.append(article)
    
    # Shuffle and split data
    random.shuffle(all_articles)
    train_articles = all_articles[:int(0.7 * len(all_articles))]
    val_articles = all_articles[int(0.7 * len(all_articles)):int(0.85 * len(all_articles))]
    test_articles = all_articles[int(0.85 * len(all_articles)):]

    # Preprocess data
    def preprocess(articles):
        input_ids = []
        attention_masks = []
        labels = []
        for article in articles:
            encoded_dict = tokenizer(article['text'], return_tensors="pt", padding='max_length', truncation=True, max_length=512)
            input_ids.append(encoded_dict['input_ids'].squeeze())
            attention_masks.append(encoded_dict['attention_mask'].squeeze())
            labels.append(1 if article['label'] == 'machine' else 0)
        return torch.stack(input_ids), torch.stack(attention_masks), torch.tensor(labels)
    
    train_input_ids, train_attention_masks, train_labels = preprocess(train_articles)
    val_input_ids, val_attention_masks, val_labels = preprocess(val_articles)
    test_input_ids, test_attention_masks, test_labels = preprocess(test_articles)

    # Create datasets
    train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
    val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
    test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

    # Load model
    model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=2)
    model.to(device)

      # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,                # Reduced to 2 epochs
        per_device_train_batch_size=4,  
        per_device_eval_batch_size=4,   
        gradient_accumulation_steps=2,  
        save_steps=100,                   # Reduced save steps
        evaluation_strategy="steps",
        logging_dir='./logs',
        logging_steps=20,                  # Reduced logging steps
        fp16=True,                       
    )



    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=dummy_data_collector
    )

    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate(eval_dataset=test_dataset)

    # Print evaluation metrics
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}" if isinstance(value, float) else f"{key}: {value}")

if __name__ == "__main__":
    main()


There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-12GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Precision,Recall,Fscore,Accuracy
20,0.7214,0.542839,0.659898,0.921986,0.769231,0.74
40,0.5006,0.469555,0.702564,0.971631,0.815476,0.793333
60,0.4191,0.417316,0.704082,0.978723,0.818991,0.796667
80,0.3466,0.459609,0.833333,0.780142,0.805861,0.823333
100,0.3736,0.721681,1.0,0.198582,0.331361,0.623333
120,0.5463,0.436562,0.851562,0.77305,0.810409,0.83
140,0.4026,0.359715,0.758427,0.957447,0.846395,0.836667
160,0.3088,0.342691,0.857143,0.808511,0.832117,0.846667


eval_precision: 0.9173
eval_recall: 0.7974
eval_fscore: 0.8531
eval_accuracy: 0.8600
eval_loss: 0.3137
eval_runtime: 21.6850
eval_samples_per_second: 13.8340
eval_steps_per_second: 3.4590
epoch: 1.0000


In [1]:
import os
import torch
import jsonlines
import random
import numpy as np
import logging
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import precision_recall_fscore_support

# Check for GPU availability
if torch.cuda.is_available():
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
    print('We will use the GPU:', torch.cuda.get_device_name(device))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Data collector for dummy batching
def dummy_data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])
    return batch

# Compute metrics for evaluation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='binary'
    )
    accuracy = np.mean(preds == p.label_ids)
    
    return {
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_fscore": fscore,
        "eval_accuracy": accuracy,
    }

def beam_search(logits, beam_size=3):
    num_labels = logits.size(1)
    if beam_size > num_labels:
        beam_size = num_labels
    
    # Flatten the logits to have shape [batch_size * seq_len, num_labels]
    logits = logits.view(-1, logits.size(-1))

    topk = torch.topk(logits, beam_size, dim=1)
    topk_indices = topk.indices
    topk_scores = topk.values

    beams = [([], 1.0)]  # List of (sequence, score)

    for i in range(logits.size(0)):
        new_beams = []
        for beam_seq, beam_score in beams:
            for next_label in topk_indices[i]:
                new_seq = beam_seq + [next_label.item()]
                new_score = beam_score * topk_scores[i][next_label].item()
                new_beams.append((new_seq, new_score))
        
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_size]
    
    return max(beams, key=lambda x: x[1])[0]


def evaluate_with_beam_search(model, eval_dataset, beam_size=3):
    model.eval()
    
    all_preds = []
    all_labels = []
    
    for input_ids, attention_mask, labels in DataLoader(eval_dataset, batch_size=4):
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=attention_mask.to(device))[0]
            probs = torch.softmax(logits, dim=-1)
            preds = beam_search(probs, beam_size)
            
            all_preds.extend(preds)
            all_labels.extend(labels.tolist())
    
    return all_preds, all_labels


def main():
    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

    # Read GPT.jsonl file
    all_articles = []
    with jsonlines.open('GPT.jsonl', 'r') as input_articles:
        for article in input_articles:
            all_articles.append(article)
    
    # Shuffle and split data
    random.shuffle(all_articles)
    train_articles = all_articles[:int(0.7 * len(all_articles))]
    val_articles = all_articles[int(0.7 * len(all_articles)):int(0.85 * len(all_articles))]
    test_articles = all_articles[int(0.85 * len(all_articles)):]

    # Preprocess data
    def preprocess(articles):
        input_ids = []
        attention_masks = []
        labels = []
        for article in articles:
            encoded_dict = tokenizer(article['text'], return_tensors="pt", padding='max_length', truncation=True, max_length=512)
            input_ids.append(encoded_dict['input_ids'].squeeze())
            attention_masks.append(encoded_dict['attention_mask'].squeeze())
            labels.append(1 if article['label'] == 'machine' else 0)
        return torch.stack(input_ids), torch.stack(attention_masks), torch.tensor(labels)
    
    train_input_ids, train_attention_masks, train_labels = preprocess(train_articles)
    val_input_ids, val_attention_masks, val_labels = preprocess(val_articles)
    test_input_ids, test_attention_masks, test_labels = preprocess(test_articles)

    # Create datasets
    train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
    val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
    test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

    # Load model
    model = BertForSequenceClassification.from_pretrained('bert-large-cased', num_labels=2)
    model.to(device)

    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        save_steps=100,
        evaluation_strategy="steps",
        logging_dir='./logs',
        logging_steps=20,
        fp16=True,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        data_collator=dummy_data_collector
    )

    # Train and evaluate
    trainer.train()
    eval_results = trainer.evaluate(eval_dataset=test_dataset)

    # Print evaluation metrics
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}" if isinstance(value, float) else f"{key}: {value}")

    # Evaluate with beam search
    all_preds, all_labels = evaluate_with_beam_search(model, test_dataset)
    
    # Calculate metrics
    precision, recall, fscore, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    accuracy = np.mean(np.array(all_preds) == np.array(all_labels))
    
    # Print evaluation metrics with beam search
    print("Evaluation with Beam Search:")
    print(f"eval_precision: {precision:.4f}")
    print(f"eval_recall: {recall:.4f}")
    print(f"eval_fscore: {fscore:.4f}")
    print(f"eval_accuracy: {accuracy:.4f}")

if __name__ == "__main__":
    main()


There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-12GB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Precision,Recall,Fscore,Accuracy
20,0.7733,0.69826,0.483333,1.0,0.651685,0.483333
40,0.7357,0.69333,0.483333,1.0,0.651685,0.483333
60,0.7137,0.694025,0.0,0.0,0.0,0.516667
80,0.7159,0.696108,0.483333,1.0,0.651685,0.483333
100,0.6969,0.709687,0.483333,1.0,0.651685,0.483333
120,0.7301,0.693511,0.483333,1.0,0.651685,0.483333
140,0.6642,0.697026,0.483333,1.0,0.651685,0.483333
160,0.7389,0.692604,0.0,0.0,0.0,0.516667
180,0.6911,0.696704,0.0,0.0,0.0,0.516667
200,0.6911,0.699725,0.0,0.0,0.0,0.516667


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


eval_precision: 0.5100
eval_recall: 1.0000
eval_fscore: 0.6755
eval_accuracy: 0.5100
eval_loss: 0.6960
eval_runtime: 21.6529
eval_samples_per_second: 13.8550
eval_steps_per_second: 6.9270
epoch: 1.0000
Evaluation with Beam Search:
eval_precision: 0.0000
eval_recall: 0.0000
eval_fscore: 0.0000
eval_accuracy: 0.4900


  _warn_prf(average, modifier, msg_start, len(result))
