In [13]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Read the data from GPT.jsonl
with open('GPT.jsonl', 'r') as file:
    lines = file.readlines()

data = [json.loads(line) for line in lines]

# Create a DataFrame
df = pd.DataFrame(data)

# Convert labels to integers
df['label'] = df['label'].map({'machine': 0, 'human': 1})

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(data):
    return tokenizer(data['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Set up the model and training arguments
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define the evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions),
        'recall': recall_score(labels, predictions),
        'f1': f1_score(labels, predictions),
    }

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print("Evaluation results:")
for key, value in results.items():
    print(f"{key}: {value}")


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


Evaluation results:
eval_loss: 0.38436031341552734
eval_accuracy: 0.8475
eval_precision: 0.8888888888888888
eval_recall: 0.7960199004975125
eval_f1: 0.8398950131233597
eval_runtime: 0.9555
eval_samples_per_second: 418.634
eval_steps_per_second: 7.326
epoch: 3.0


In [16]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Read the data from GPT.jsonl
with open('GPT.jsonl', 'r') as file:
    lines = file.readlines()

data = [json.loads(line) for line in lines]

# Create a DataFrame
df = pd.DataFrame(data)

# Convert labels to integers
df['label'] = df['label'].map({'machine': 0, 'human': 1})

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(data):
    return tokenizer(data['text'], padding='max_length', truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Set up the model and training arguments
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
)


def beam_search(logits, beam_size=3):
    """
    Beam search to find the top beam_size labels.
    """
    # Convert logits to PyTorch tensor
    logits_tensor = torch.tensor(logits)
    
    # Ensure the number of labels is greater than or equal to the beam size
    if logits_tensor.size(-1) < beam_size:
        beam_size = logits_tensor.size(-1)
    
    # Apply topk to logits tensor
    topk = logits_tensor.topk(beam_size, dim=-1)
    topk_indices = topk.indices.tolist()
    topk_values = topk.values.tolist()
    
    return topk_indices, topk_values



# Define the evaluation metrics
def compute_metrics_with_beam(eval_pred):
    logits, labels = eval_pred
    
    # Apply beam search to logits
    beam_indices, beam_values = beam_search(logits)
    
    # Convert beam indices to predictions
    predictions = [indices[0] for indices in beam_indices]
    
    return {
        'accuracy': accuracy_score(labels, predictions),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro'),
        'f1': f1_score(labels, predictions, average='macro'),
    }

# Set up the Trainer with beam search metrics
trainer_beam = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics_with_beam,
)

# Train the model
trainer_beam.train()

# Evaluate the model with beam search
results_beam = trainer_beam.evaluate()

print("Evaluation results with beam search:")
for key, value in results_beam.items():
    print(f"{key}: {value}")


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
500,0.7431
1000,0.7159
1500,0.5871
2000,0.3633


Evaluation results with beam search:
eval_loss: 0.9556524753570557
eval_accuracy: 0.8175
eval_precision: 0.8184210526315789
eval_recall: 0.8176329408235206
eval_f1: 0.8174075625785555
eval_runtime: 1.6908
eval_samples_per_second: 236.576
eval_steps_per_second: 118.288
epoch: 3.0
