<a href="https://colab.research.google.com/github/dasunhq/gptsniffer-implementations/blob/main/GPTSniffer_Impl_AIGCodeSet_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers==4.27.0 datasets accelerate -U



In [2]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# 1. Load the dataset from Hugging Face
DATASET_NAME = "basakdemirok/AIGCodeSet"
raw_datasets = load_dataset(DATASET_NAME)

# 2. Define the pre-trained CodeBERT model
MODEL_CHECKPOINT = "microsoft/codebert-base"

# 3. Load the CodeBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/all_data_with_ada_embeddings_will_b(…):   0%|          | 0.00/265M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7583 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7583 [00:00<?, ? examples/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [3]:
def tokenize_function(examples):
    return tokenizer(
        examples["code"],
        padding="max_length",
        truncation=True
    )

# Apply the tokenization to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Prepare data for training: Rename 'label' column and remove unused columns
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.remove_columns(["code"])
tokenized_datasets.set_format("torch")

# Split the dataset for training and evaluation
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

Map:   0%|          | 0/7583 [00:00<?, ? examples/s]

Map:   0%|          | 0/7583 [00:00<?, ? examples/s]

In [6]:
# 1. Load the model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2 # Binary classification (Human vs. AI)
)

# 2. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./gptsnipper_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./gptsnipper_logs",
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Changed from save_steps=1000
    load_best_model_at_end=True,
    report_to="none"
)

# 3. Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,

)

# 4. Start Fine-Tuning
print("Starting GPTSniffer Fine-Tuning...")
trainer.train()
print("Fine-Tuning Complete. The resulting model is your GPTSniffer.")

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

Starting GPTSniffer Fine-Tuning...




Epoch,Training Loss,Validation Loss
1,0.5841,0.534294
2,0.5328,0.51585
3,0.4849,0.479776


Fine-Tuning Complete. The resulting model is your GPTSniffer.


In [7]:
# 1. Evaluate the final model
print("\n--- Final Evaluation ---")
metrics = trainer.evaluate()
print(metrics)

# 2. Save the final GPTSniffer model
model.save_pretrained("./gptsnipper_final_model")
tokenizer.save_pretrained("./gptsnipper_final_model")


--- Final Evaluation ---


{'eval_loss': 0.4797757565975189, 'eval_runtime': 229.6612, 'eval_samples_per_second': 33.018, 'eval_steps_per_second': 4.128, 'epoch': 3.0}


('./gptsnipper_final_model/tokenizer_config.json',
 './gptsnipper_final_model/special_tokens_map.json',
 './gptsnipper_final_model/vocab.json',
 './gptsnipper_final_model/merges.txt',
 './gptsnipper_final_model/added_tokens.json',
 './gptsnipper_final_model/tokenizer.json')

In [9]:
from transformers import pipeline

# Load the saved model and tokenizer
model_path = "./gptsnipper_final_model"
tokenizer_path = "./gptsnipper_final_model"

# Create a pipeline for sequence classification
classifier = pipeline(
    "sentiment-analysis",
    model=model_path,
    tokenizer=tokenizer_path,
    device=0 if torch.cuda.is_available() else -1,
    framework="pt" # Specify PyTorch framework
)

In [11]:
# Prepare some example code snippets
code_1 = """
num = int(input("Enter number:"))

if(num%2 == 0):
	print(num, " is an even number.")
else:
	print(num, " is an odd number.")
"""

code_2 = """
def check_even_odd():
    try:
        number = int(input("Enter a number: "))
        if number % 2 == 0:
            print(f"{number} is an even number.")
        else:
            print(f"{number} is an odd number.")
    except ValueError:
        print("Please enter a valid integer.")

# Run the function
if __name__ == "__main__":
    check_even_odd()
"""

# Classify the example code snippets
results_code1 = classifier(code_1)
results_code2 = classifier(code_2)

# LABEL_0 - Human-written
# LABEL_1 - AI-generated

print(results_code1)
print(results_code2)

[{'label': 'LABEL_1', 'score': 0.9841429591178894}]
[{'label': 'LABEL_0', 'score': 0.8882263898849487}]


In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import torch
from transformers import pipeline

# Define the path to your saved model in Google Drive
model_path_drive = "/content/drive/My Drive/gptsnipper_model" # Replace with the actual path

# Create a pipeline for sequence classification, loading from Drive
classifier_from_drive = pipeline(
    "sentiment-analysis",
    model=model_path_drive,
    tokenizer=model_path_drive,
    device=0 if torch.cuda.is_available() else -1,
    framework="pt"
)


In [14]:
import shutil
import os

source_directory = "./gptsnipper_final_model"
destination_directory = "/content/drive/My Drive/gptsnipper_model"
# Create the destination directory if it doesn't exist
if not os.path.exists(destination_directory):
    os.makedirs(destination_directory)

# Copy the contents of the source directory to the destination directory
shutil.copytree(source_directory, destination_directory, dirs_exist_ok=True)

print(f"Model and tokenizer saved to: {destination_directory}")

Model and tokenizer saved to: /content/drive/My Drive/gptsnipper_model
