In [None]:
!pip install transformers datasets

import pandas as pd
from sklearn.model_selection import train_test_split as split_data
from transformers import BertTokenizer as CustomBertTokenizer, BertForSequenceClassification as CustomBertForSequenceClassification, Trainer as CustomTrainer, TrainingArguments as CustomTrainingArguments
from datasets import Dataset as CustomDataset, load_metric as custom_load_metric
import torch as custom_torch
import numpy as np

# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): `os` is not used in any cell visible here — confirm before removing.
import os
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:


# Read the requirements CSV from Drive, keep only the text and author columns,
# encode the author as a binary class id (0 for 'Human', 1 for anything else)
# and expose it under 'labels', the column name the Trainer looks for.
dataset_df = (
    pd.read_csv('/content/drive/My Drive/Dataset.csv')
    .loc[:, ['Requirement', 'Author']]
    .assign(Author=lambda frame: frame['Author'].ne('Human').astype('int64'))
    .rename(columns={'Author': 'labels'})
)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
training_df, testing_df = split_data(dataset_df, random_state=42, test_size=0.3)

# Wrap each pandas split in a Hugging Face Dataset (train first, then test).
training_custom_dataset, testing_custom_dataset = (
    CustomDataset.from_pandas(split_df) for split_df in (training_df, testing_df)
)

# Tokenizer that belongs to the 'bert-base-uncased' checkpoint.
custom_tokenizer = CustomBertTokenizer.from_pretrained('bert-base-uncased')


def custom_tokenize_function(examples):
    """Tokenize the 'Requirement' entries of a batch of examples.

    Args:
        examples: Mapping with a 'Requirement' key holding the text(s) to encode
            (a batch of rows when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for those texts. Sequences are truncated
        (``truncation=True``) and padded with ``padding='max_length'``; no
        explicit ``max_length`` is passed, so the tokenizer's own limit applies.
    """
    requirement_texts = examples['Requirement']
    return custom_tokenizer(requirement_texts, truncation=True, padding='max_length')

# Tokenize both splits in batches (training split first, then the test split).
training_custom_dataset, testing_custom_dataset = (
    split_dataset.map(custom_tokenize_function, batched=True)
    for split_dataset in (training_custom_dataset, testing_custom_dataset)
)

# Hand only the model inputs and the labels to PyTorch, as tensors.
model_input_columns = ('input_ids', 'attention_mask', 'labels')
for split_dataset in (training_custom_dataset, testing_custom_dataset):
    split_dataset.set_format(type='torch', columns=list(model_input_columns))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/280 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
# Load the pre-trained 'bert-base-uncased' weights into a sequence-classification
# model with two output classes, matching the 0/1 label ids created during encoding.
custom_model = CustomBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the "accuracy" metric object; compute_metrics calls its .compute() method.
metric = custom_load_metric("accuracy")

# Training arguments.
# Only `eval_strategy` is passed: `evaluation_strategy` is its deprecated alias,
# and supplying both with different values made the evaluation schedule
# ambiguous (the alias silently won on versions that still accept it, and the
# call is rejected once the alias is removed).
# The schedule is spelled out explicitly as "every 10 steps", which is the
# cadence the step-based setting produced (eval_steps falls back to
# logging_steps when unset).
custom_training_args = CustomTrainingArguments(
    output_dir='./results',          # checkpoints and trainer state
    eval_strategy="steps",           # evaluate on a step schedule ...
    eval_steps=10,                   # ... every 10 optimizer steps
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log the training loss every 10 steps
)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A two-element iterable ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns when given the arg-max of the
        logits over the last axis as predictions and ``labels`` as references.
    """
    class_scores, gold_labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

# Assemble the Trainer: the model and its training configuration, the metric
# callback, and the two tokenized splits (the test split doubles as the
# evaluation set).
custom_trainer = CustomTrainer(
    args=custom_training_args,
    model=custom_model,
    compute_metrics=compute_metrics,
    train_dataset=training_custom_dataset,
    eval_dataset=testing_custom_dataset,
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the model using the datasets and arguments the Trainer was built with.
custom_trainer.train()

# Run a final evaluation pass and keep the returned metrics dictionary.
# Presumably this scores the eval_dataset handed to the Trainer (the test split).
evaluation_results = custom_trainer.evaluate()

# Report the accuracy entry of the evaluation metrics.
print(f"Test Accuracy: {evaluation_results['eval_accuracy']}")


Step,Training Loss,Validation Loss,Accuracy
10,0.5964,0.506244,0.85
20,0.4745,0.403295,0.866667
30,0.3129,0.34598,0.875
40,0.2859,0.330463,0.875
50,0.2417,0.324934,0.875
60,0.2271,0.33145,0.875
70,0.1826,0.332781,0.875


Test Accuracy: 0.875


In [None]:
import torch

# Run the fine-tuned model over the test split and pick, for every row, the
# class with the highest score. The scores are wrapped in a tensor first so
# that `predicted_labels` is a torch tensor.
predictions = custom_trainer.predict(testing_custom_dataset)
predicted_labels = torch.Tensor(predictions.predictions).argmax(dim=1)

# Attach a readable author name to a copy of the test frame
# (class id 0 -> 'Human', anything else -> 'ChatGPT').
sample_testing_df = testing_df.copy()
sample_testing_df['Predicted Author'] = [
    'Human' if label_id == 0 else 'ChatGPT'
    for label_id in predicted_labels.tolist()
]

# Preview the first five rows: text, true label id, predicted author.
sample_testing_df.loc[:, ['Requirement', 'labels', 'Predicted Author']].head()

Unnamed: 0,Requirement,labels,Predicted Author
209,Access must be provided to monitor and evaluat...,1,ChatGPT
280,Access to customized music therapy sessions fo...,1,Human
33,Having a wide device compatibility for the app...,0,Human
210,The system must ensure the security of user da...,1,ChatGPT
93,Providing multiple language support suitable f...,0,Human


In [None]:
# Same preview as the named-author table, but with the raw predicted class ids
# so they can be compared directly with the numeric 'labels' column.
sample_testing_df = testing_df.assign(**{'Predicted Author': predicted_labels.numpy()})
sample_testing_df.loc[:, ['Requirement', 'labels', 'Predicted Author']].head()

Unnamed: 0,Requirement,labels,Predicted Author
209,Access must be provided to monitor and evaluat...,1,1
280,Access to customized music therapy sessions fo...,1,0
33,Having a wide device compatibility for the app...,0,0
210,The system must ensure the security of user da...,1,1
93,Providing multiple language support suitable f...,0,0
