# Imports

In [154]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BartTokenizer, BartForConditionalGeneration,  BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
import torch.nn.functional as F
import json
import textwrap
from sklearn.metrics import accuracy_score
from collections import Counter
import numpy as np
from sklearn.linear_model import LogisticRegression

In [155]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [156]:
articles = []

with open('articles.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)  

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

def summarize_text(text, max_len=80):
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=max_len, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



In [None]:
summaries = [summarize_text(article['text']) for article in articles]

with open("summaries.txt", "w", encoding='utf-8') as file:
    for summary in summaries:
        file.write(f"{summary}\n")

print("summaries.txt generated")"""

In [4]:
def generate_article(summary_sentences, min_words=200, max_words=1000, model_name='gpt2-medium'):
    # Load pre-trained model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Encode the summary sentences
    input_text = " ".join(summary_sentences)
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_words,
        min_length=min_words,
        num_beams=5,   # Controls the quality of the generated text
        no_repeat_ngram_size=2,  # Avoids repetition
        early_stopping=True,
        temperature=0.7,  # Controls randomness, lower is more focused
        top_k=50,  # Limits the sampling pool
        top_p=0.95  # Nucleus sampling
    )

    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Return the generated article
    return generated_text


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The importance of climate change is increasing globally. Governments and organizations are focusing more on reducing carbon emissions. Renewable energy sources are becoming more cost-effective and widely used.

The United Nations Framework Convention on Climate Change (UNFCCC) is an international agreement that aims to limit global warming to 2 degrees Celsius (3.6 degrees Fahrenheit) above pre-industrial levels by the end of this century. The agreement was signed in Paris in December 2015, and has been ratified by more than 190 countries. It sets a goal of limiting global temperature rise to 1.5 degrees C (2.7 degrees F) by 2100, with a target of keeping global average temperatures from rising above that level for at least the next 150 years. This goal is based on the assumption that emissions of carbon dioxide, methane, nitrous oxide, hydrofluorocarbons (HFCs), and other greenhouse gases will continue to decline over the course of the 21st century, as they have in the past.


In [8]:
summary_sentences = ["President Donald Trump lost.", "But Trumpism did not.","It won in the parts of the country and with the voters whom Trump catered to. Joe Biden defeated Trump to win the presidency, and is on pace to win up to 306 electoral votes.",
                     "In a typical election year, such a victory would mean Biden would have carried other Democrats along with him."
]

article1 = generate_article(summary_sentences)
print(article1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


President Donald Trump lost. But Trumpism did not. It won in the parts of the country and with the voters whom Trump catered to. Joe Biden defeated Trump to win the presidency, and is on pace to win up to 306 electoral votes. In a typical election year, such a victory would mean Biden would have carried other Democrats along with him.

This is not to say that Trump's victory was a foregone conclusion. There are plenty of reasons to be skeptical of his victory, including the fact that he won the popular vote but lost the Electoral College. And there is no reason to believe that Democrats will be able to recapture the House of Representatives in 2018, which would be the first time since the Civil War that the party has won back the White House since Reconstruction. Still, it is important to remember that this election was not a referendum on Trump, or even on the Republican Party, but rather a contest between two very different visions for the future of American politics. The Democratic 

In [171]:
summary_sentences = ["Jumana Azam was working 16-hour days responding to an influx of coronavirus patients.", 
                     "During the worst of it, the 34-year-old respiratory therapist was facing multiple deaths a day.",
"Her hours have decreased since then, but another major event in Azam's life ended this year."]

article2 = generate_article(summary_sentences)
print(article2)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Jumana Azam was working 16-hour days responding to an influx of coronavirus patients. During the worst of it, the 34-year-old respiratory therapist was facing multiple deaths a day. Her hours have decreased since then, but another major event in Azam's life ended this year.

Azam, who was born in Pakistan, was diagnosed with acute respiratory syndrome, or ARDS, when she was 4 years old. The condition is characterized by fever, cough, and shortness of breath, which can last for days or even weeks. It's caused by a virus that infects the airways, causing inflammation and inflammation of the lungs, leading to pneumonia, bronchitis, emphysema, sinusitis and even death. In the United States, more than 1,000 people die each year from the virus, according to the Centers for Disease Control and Prevention (CDC). The virus is spread through close contact with an infected person, such as coughing, sneezing or touching a sick person's face or mouth. A person can also contract the disease from ano

In [187]:
summary_sentences = ["One of the children died soon after being rescued, while a fourth child was still trapped.",
                    "The Friday afternoon quake that struck Turkey’s Aegean coast and north of the Greek island of Samos registered a magnitude of 6.6.",
                    "It toppled buildings in Izmir and triggered a small tsunami in the Seferihisar district."]

article3 = generate_article(summary_sentences)
print(article3)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


One of the children died soon after being rescued, while a fourth child was still trapped. The Friday afternoon quake that struck Turkey’s Aegean coast and north of the Greek island of Samos registered a magnitude of 6.6. It toppled buildings in Izmir and triggered a small tsunami in the Seferihisar district.

Turkey's Prime Minister Ahmet Davutoğlu said the death toll was expected to rise as rescuers continued to search for survivors. "We are trying to find as many people as we can," he said, according to the state-run Anadolu news agency. He added that rescue teams had found the body of a woman who had been trapped in a collapsed building, but did not say whether she was among the dead. A rescue official told Reuters that the woman was believed to be in her late 20s or early 30s and was wearing a headscarf when she fell into the rubble. She was taken to a hospital for treatment, the official said.


In [189]:
summary_sentences = ["Nations are scrambling to ramp up vaccination campaigns in hopes of stemming the tide of infections.",
                     "Pfizer-BioNTech and Moderna vaccines, both based on new mRNA technology, have been approved for emergency use in multiple countries",
"Other vaccines, such as the Oxford-AstraZeneca vaccine, are in the final stages of approval.",
"distribution networks have been tested by the need to store and transport vaccines at extremely low temperatures.",
 "In many low-income countries, there are concerns about access to cold-chain infrastructure.",
                     "In some countries, misinformation and distrust of governments and pharmaceutical companies have led to doubts about the safety of the vaccines."]

article4 = generate_article(summary_sentences)
print(article4)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


One of the children died soon after being rescued, while a fourth child was still trapped. The Friday afternoon quake that struck Turkey’s Aegean coast and north of the Greek island of Samos registered a magnitude of 6.6. It toppled buildings in Izmir and triggered a small tsunami in the Seferihisar district.

Turkey's Prime Minister Ahmet Davutoğlu said the death toll was expected to rise as rescuers continued to search for survivors. "We are trying to find as many people as we can," he said, according to the state-run Anadolu news agency. He added that rescue teams had found the body of a woman who had been trapped in a collapsed building, but did not say whether she was among the dead. A rescue official told Reuters that the woman was believed to be in her late 20s or early 30s and was wearing a headscarf when she fell into the rubble. She was taken to a hospital for treatment, the official said.


In [191]:
summary_sentences = ["California is enduring its worst wildfire season on record.",
                     "More than 3.1 million acres have been scorched and dozens of major blazes are still active.",
"The devastation has already claimed at least 25 lives, destroyed thousands of homes, and displaced hundreds of thousands of residents.",
                     "\"This is a crisis unlike anything we’ve seen,\" said Cal Fire Chief Thom Porter."]

article5 = generate_article(summary_sentences)
print(article5)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


California is enduring its worst wildfire season on record. More than 3.1 million acres have been scorched and dozens of major blazes are still active. The devastation has already claimed at least 25 lives, destroyed thousands of homes, and displaced hundreds of thousands of residents. "This is a crisis unlike anything we’ve seen," said Cal Fire Chief Thom Porter.

The California Department of Forestry and Fire Protection (Cal Fire) has declared a state of emergency for the entire state, as well as the San Joaquin Valley, Kern County, Sacramento, San Francisco Bay Area, Northern California, Southern California and Northern Nevada. In addition, the National Weather Service has issued a severe thunderstorm watch for much of the state and the Pacific Northwest, including Washington, Oregon, Idaho, Montana, Nevada, Colorado, Utah, Arizona, New Mexico, Texas, Oklahoma, South Dakota, Nebraska, Kansas, Illinois, Indiana, Michigan, Wisconsin, Minnesota, Iowa, Missouri, Kentucky, Tennessee, Geo

In [193]:
summary_sentences = ["Margaret Keenan, 90, became the first person in the world to receive the Pfizer vaccine outside of clinical trials.",
                     "Officials have emphasized that patience will be needed before vaccines can bring an end to the pandemic.",
                     "\"This is a great Christmas present,\" Keenan says."]

article6 = generate_article(summary_sentences)
print(article6)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Margaret Keenan, 90, became the first person in the world to receive the Pfizer vaccine outside of clinical trials. Officials have emphasized that patience will be needed before vaccines can bring an end to the pandemic. "This is a great Christmas present," Keenan says.

Keenan and her husband, John, have lived in New York City for more than 40 years. They have two children, a boy and a girl, and they have never had a flu shot. But when they heard about the vaccine, they knew they had to get it for their son, who has a rare form of the flu, called H1N1, which can cause severe illness and death. The vaccine is given to children between the ages of 6 months and 12 years old, but it can also be given at any age to adults who are at high risk of contracting the disease, such as pregnant women, people with weakened immune systems, or people who have recently traveled to an area where the virus has been circulating for a prolonged period of time.


In [157]:
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def compute_perplexity(text, model, tokenizer, device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss).item()

In [158]:
def compute_burstiness(text, tokenizer):
    tokenized = tokenizer.encode(text)
    token_counts = Counter(tokenized)
    frequencies = list(token_counts.values())
    if len(frequencies) > 1:
        mean_freq = np.mean(frequencies)
        variance = np.var(frequencies)
        burstiness_score = variance / mean_freq if mean_freq != 0 else 0
    else:
        burstiness_score = 0 
    
    return burstiness_score

In [15]:
perplexity = compute_perplexity(article1, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article1, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[0]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[0]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 12.000144004821777
Burstiness: 2.3336466165413534


In [185]:
perplexity = compute_perplexity(article2, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article2, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[10]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[10]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 11.710691452026367
Burstiness: 2.4543227856480865
Perplexity: 43.00416564941406
Burstiness: 1.288589599700711


In [188]:
perplexity = compute_perplexity(article3, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article3, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[20]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[20]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 11.565011024475098
Burstiness: 1.2923367083581494
Perplexity: 21.6721248626709
Burstiness: 1.0806883675623025


In [190]:
perplexity = compute_perplexity(article4, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article4, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[96]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[96]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 35.59250259399414
Burstiness: 2.3445199660152927
Perplexity: 21.9725341796875
Burstiness: 1.614270685067145


In [192]:
perplexity = compute_perplexity(article5, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article5, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[92]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[92]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 10.235406875610352
Burstiness: 9.665434173669471
Perplexity: 13.818424224853516
Burstiness: 1.268199233716475


In [194]:
perplexity = compute_perplexity(article6, gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(article6, gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")
perplexity = compute_perplexity(articles[99]['text'], gpt2_model, gpt2_tokenizer, device)
burstiness = compute_burstiness(articles[99]['text'], gpt2_tokenizer)
print(f"Perplexity: {perplexity}")
print(f"Burstiness: {burstiness}")

Perplexity: 10.209077835083008
Burstiness: 1.7292312024781757
Perplexity: 19.286537170410156
Burstiness: 0.348358585858586


In [159]:
texts = [article["text"] for article in articles]
labels = [0] * 50 + [1] * 50  # 1 = AI, 0 = Human

In [163]:
combined = list(zip(texts, labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

class BERTClassifier(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.pooler_output)
        return self.fc(pooled)

def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)
train_ds = TextDataset(train_texts, train_labels, tokenizer)
val_ds = TextDataset(val_texts, val_labels, tokenizer)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(25):
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    val_acc = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f} | Val Accuracy = {val_acc:.4f}")

Epoch 1: Train Loss = 0.6766 | Val Accuracy = 0.6000
Epoch 2: Train Loss = 0.6348 | Val Accuracy = 0.8500
Epoch 3: Train Loss = 0.5367 | Val Accuracy = 0.9000
Epoch 4: Train Loss = 0.3993 | Val Accuracy = 0.8500
Epoch 5: Train Loss = 0.3649 | Val Accuracy = 0.9500
Epoch 6: Train Loss = 0.2641 | Val Accuracy = 0.9500
Epoch 7: Train Loss = 0.2095 | Val Accuracy = 1.0000
Epoch 8: Train Loss = 0.1671 | Val Accuracy = 0.9500
Epoch 9: Train Loss = 0.1097 | Val Accuracy = 1.0000
Epoch 10: Train Loss = 0.0809 | Val Accuracy = 1.0000
Epoch 11: Train Loss = 0.0700 | Val Accuracy = 1.0000
Epoch 12: Train Loss = 0.0444 | Val Accuracy = 0.9500
Epoch 13: Train Loss = 0.0339 | Val Accuracy = 1.0000
Epoch 14: Train Loss = 0.0296 | Val Accuracy = 1.0000
Epoch 15: Train Loss = 0.0237 | Val Accuracy = 1.0000
Epoch 16: Train Loss = 0.0193 | Val Accuracy = 1.0000
Epoch 17: Train Loss = 0.0154 | Val Accuracy = 1.0000
Epoch 18: Train Loss = 0.0136 | Val Accuracy = 1.0000
Epoch 19: Train Loss = 0.0128 | Val A

In [164]:
def predict(model, tokenizer, texts, device, max_len=128):
    model.eval()
    predictions = []
    with torch.no_grad():
        for text in texts:
            enc = tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=max_len,
                return_tensors='pt'
            )
            input_ids = enc['input_ids'].to(device)
            attention_mask = enc['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            pred = torch.argmax(outputs, dim=1).item()
            predictions.append(pred)
    return predictions

In [165]:
with open('testarticles.json', 'r', encoding='utf-8') as f:
    testarticles = json.load(f)  

test_texts = [test["text"] for test in testarticles]
truevalue = ["Human","Human","Human","Human","Human","AI","AI","AI","AI","AI"]
# Predict (0 = human, 1 = AI)
i = 0
predicted_classes = predict(model, tokenizer, test_texts, device)
for text, pred in zip(test_texts, predicted_classes):
    label = "AI" if pred == 1 else "Human"
    print(f"[{label}] {truevalue[i]}")
    i +=1

[Human] Human
[Human] Human
[Human] Human
[Human] Human
[AI] Human
[AI] AI
[AI] AI
[AI] AI
[AI] AI
[AI] AI


In [169]:
perplexities = [compute_perplexity(text, gpt2_model, gpt2_tokenizer, device) for text in texts]

X = np.array(perplexities).reshape(-1, 1)  
y = np.array(labels)  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

test_texts = [test["text"] for test in testarticles]
truevalue = ["Human","Human","Human","Human","Human","AI","AI","AI","AI","AI"]

i=0
for text in test_texts:
    new_perplexity = compute_perplexity(text, gpt2_model, gpt2_tokenizer, device)
    pred = classifier.predict([[new_perplexity]])[0]
    label = "AI" if pred == 1 else "Human"
    print(f"[{label}] {truevalue[i]} ({new_perplexity})")
    i +=1

Accuracy: 63.33%
[Human] Human (49.05767059326172)
[AI] Human (27.24824333190918)
[Human] Human (61.7662353515625)
[Human] Human (50.51237487792969)
[AI] Human (13.234869956970215)
[AI] AI (27.44577407836914)
[AI] AI (23.773452758789062)
[AI] AI (29.323810577392578)
[AI] AI (32.390220642089844)
[AI] AI (18.873767852783203)


In [170]:
combined = list(zip(texts, labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, gpt2_model, gpt2_tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.gpt2_model = gpt2_model
        self.gpt2_tokenizer = gpt2_tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        enc = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        perplexity = compute_perplexity(text, self.gpt2_model, self.gpt2_tokenizer, device)

        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
            'perplexity': torch.tensor(perplexity, dtype=torch.float)
        }

class BERTClassifier(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size + 1, 2)  # Add 1 to dimension for perplexity feature

    def forward(self, input_ids, attention_mask, perplexity):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.pooler_output)
        
        x = torch.cat((pooled, perplexity.unsqueeze(1)), dim=1)  # Add 1 to dimension for perplexity feature
        return self.fc(x)

def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        perplexity = batch['perplexity'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, perplexity)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            perplexity = batch['perplexity'].to(device)
            outputs = model(input_ids, attention_mask, perplexity)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)
train_ds = TextDataset(train_texts, train_labels, bert_tokenizer, gpt2_model, gpt2_tokenizer)
val_ds = TextDataset(val_texts, val_labels, bert_tokenizer, gpt2_model, gpt2_tokenizer)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(25):
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    val_acc = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f} | Val Accuracy = {val_acc:.4f}")

Epoch 1: Train Loss = 0.9396 | Val Accuracy = 0.4000
Epoch 2: Train Loss = 0.8129 | Val Accuracy = 0.5500
Epoch 3: Train Loss = 0.6868 | Val Accuracy = 0.5000
Epoch 4: Train Loss = 0.5566 | Val Accuracy = 0.6500
Epoch 5: Train Loss = 0.4195 | Val Accuracy = 0.7500
Epoch 6: Train Loss = 0.3240 | Val Accuracy = 0.6000
Epoch 7: Train Loss = 0.2519 | Val Accuracy = 0.8000
Epoch 8: Train Loss = 0.1931 | Val Accuracy = 0.9000
Epoch 9: Train Loss = 0.1482 | Val Accuracy = 0.9000
Epoch 10: Train Loss = 0.1125 | Val Accuracy = 0.8000
Epoch 11: Train Loss = 0.0967 | Val Accuracy = 0.8000
Epoch 12: Train Loss = 0.0824 | Val Accuracy = 0.8000
Epoch 13: Train Loss = 0.0714 | Val Accuracy = 0.9000
Epoch 14: Train Loss = 0.0636 | Val Accuracy = 0.9000
Epoch 15: Train Loss = 0.0649 | Val Accuracy = 0.8500
Epoch 16: Train Loss = 0.0525 | Val Accuracy = 0.8500
Epoch 17: Train Loss = 0.0670 | Val Accuracy = 0.8000
Epoch 18: Train Loss = 0.0444 | Val Accuracy = 0.8000
Epoch 19: Train Loss = 0.0511 | Val A

In [93]:
def predict(text, model, tokenizer, gpt2_model, gpt2_tokenizer, device, max_len=128):
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    ).to(device)
    
    perplexity = compute_perplexity(text, gpt2_model, gpt2_tokenizer, device)
    
    model.eval()
    with torch.no_grad():
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        outputs = model(input_ids, attention_mask, torch.tensor([perplexity], dtype=torch.float).to(device))
        
        predicted_label = torch.argmax(outputs, dim=1).item()
    
    return predicted_label

In [94]:
with open('testarticles.json', 'r', encoding='utf-8') as f:
    testarticles = json.load(f)

test_texts = [test["text"] for test in testarticles]

truevalue = ["Human", "Human", "Human", "Human", "Human", "AI", "AI", "AI", "AI", "AI"]

predicted_classes = []
for text in test_texts:
    pred = predict(text, model, bert_tokenizer, gpt2_model, gpt2_tokenizer, device)
    predicted_classes.append(pred)
i = 0
for text, pred in zip(test_texts, predicted_classes):
    label = "AI" if pred == 1 else "Human"
    print(f"[{label}] {truevalue[i]}")
    i += 1

[Human] Human
[Human] Human
[Human] Human
[AI] Human
[AI] Human
[AI] AI
[Human] AI
[AI] AI
[AI] AI
[AI] AI


In [80]:
htext, aitext = texts[:50], texts[50:]
for text in htext:
    perplexity += compute_perplexity(text, gpt2_model, gpt2_tokenizer, device)
perplexity /= 50
print(f"Human Perplexity: {perplexity:.4f}")
for  text in aitext:
    perplexity += compute_perplexity(text, gpt2_model, gpt2_tokenizer, device)
perplexity /= 50
print(f"AI Perplexity: {perplexity:.4f}")

Human Perplexity: 37.7479
AI Perplexity: 29.0534


In [83]:
for text in htext:
    burstiness_score += compute_burstiness(text, tokenizer)
burstiness_score /= 50
print(f"Burstiness: {burstiness_score:.4f}")
for text in aitext:
    burstiness_score += compute_burstiness(text, tokenizer)
burstiness_score /= 50
print(f"Burstiness: {burstiness_score:.4f}")


Burstiness: 2.8731
Burstiness: 2.0008


In [177]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, gpt2_model, gpt2_tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.gpt2_model = gpt2_model
        self.gpt2_tokenizer = gpt2_tokenizer
        self.perplexities = [compute_perplexity(t,gpt2_model, gpt2_tokenizer, device) for t in texts]
        self.burstinesses = [compute_burstiness(t, gpt2_tokenizer) for t in texts]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        enc = self.tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=self.max_len,
        return_tensors='pt'
        )
    
        perplexity = compute_perplexity(text, self.gpt2_model, self.gpt2_tokenizer, device)
        burstiness = compute_burstiness(text, self.tokenizer)

        return {
        'input_ids': enc['input_ids'].squeeze(0),
        'attention_mask': enc['attention_mask'].squeeze(0),
        'label': torch.tensor(self.labels[idx], dtype=torch.long),
        'perplexity': torch.tensor(perplexity, dtype=torch.float),
        'burstiness': torch.tensor(burstiness, dtype=torch.float)
        }

In [178]:
class BERTClassifier(nn.Module):
    def __init__(self, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(self.bert.config.hidden_size + 2, 2) # Add 2 for perplexity and burstiness

    def forward(self, input_ids, attention_mask, perplexity, burstiness):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(out.pooler_output)

        x = torch.cat((pooled, perplexity.unsqueeze(1), burstiness.unsqueeze(1)), dim=1)
        return self.fc(x)

In [179]:
def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        perplexity = batch['perplexity'].to(device)
        burstiness = batch['burstiness'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, perplexity, burstiness)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [180]:
def evaluate_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            perplexity = batch['perplexity'].to(device)
            burstiness = batch['burstiness'].to(device)

            outputs = model(input_ids, attention_mask, perplexity, burstiness)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total

In [181]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

train_ds = TextDataset(train_texts, train_labels, bert_tokenizer, gpt2_model, gpt2_tokenizer, max_len=128)
val_ds = TextDataset(val_texts, val_labels, bert_tokenizer, gpt2_model, gpt2_tokenizer)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)

model = BERTClassifier().to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(25):
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    val_acc = evaluate_model(model, val_loader, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f} | Val Accuracy = {val_acc:.4f}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1048 > 1024). Running this sequence through the model will result in indexing errors


Epoch 1: Train Loss = 0.6979 | Val Accuracy = 0.7500
Epoch 2: Train Loss = 0.6024 | Val Accuracy = 0.7000
Epoch 3: Train Loss = 0.4814 | Val Accuracy = 0.8000
Epoch 4: Train Loss = 0.2971 | Val Accuracy = 0.9000
Epoch 5: Train Loss = 0.2012 | Val Accuracy = 0.9500
Epoch 6: Train Loss = 0.1399 | Val Accuracy = 0.9500
Epoch 7: Train Loss = 0.0878 | Val Accuracy = 0.9500
Epoch 8: Train Loss = 0.0737 | Val Accuracy = 0.9500
Epoch 9: Train Loss = 0.0554 | Val Accuracy = 0.9500
Epoch 10: Train Loss = 0.0429 | Val Accuracy = 0.9500
Epoch 11: Train Loss = 0.0393 | Val Accuracy = 0.9500
Epoch 12: Train Loss = 0.0349 | Val Accuracy = 0.9500
Epoch 13: Train Loss = 0.0281 | Val Accuracy = 0.9500
Epoch 14: Train Loss = 0.0263 | Val Accuracy = 0.9500
Epoch 15: Train Loss = 0.0223 | Val Accuracy = 0.9500
Epoch 16: Train Loss = 0.0203 | Val Accuracy = 0.9500
Epoch 17: Train Loss = 0.0206 | Val Accuracy = 0.9500
Epoch 18: Train Loss = 0.0194 | Val Accuracy = 0.9500
Epoch 19: Train Loss = 0.0186 | Val A

In [182]:
def predict(text, model, tokenizer, gpt2_model, gpt2_tokenizer, device, max_len=128):
    encoding = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    ).to(device)
    
    perplexity = compute_perplexity(text, gpt2_model, gpt2_tokenizer, device)

    burstiness = compute_burstiness(text, tokenizer)

    model.eval()
    with torch.no_grad():

        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        

        outputs = model(input_ids, attention_mask, 
                         torch.tensor([perplexity], dtype=torch.float).to(device),
                         torch.tensor([burstiness], dtype=torch.float).to(device))
        

        predicted_label = torch.argmax(outputs, dim=1).item()
    
    return predicted_label

In [183]:
with open('testarticles.json', 'r', encoding='utf-8') as f:
    testarticles = json.load(f)

test_texts = [test["text"] for test in testarticles]

truevalue = ["Human", "Human", "Human", "Human", "Human", "AI", "AI", "AI", "AI", "AI"]

predicted_classes = []
for text in test_texts:
    pred = predict(text, model, bert_tokenizer, gpt2_model, gpt2_tokenizer, device)
    predicted_classes.append(pred)
i = 0
for text, pred in zip(test_texts, predicted_classes):
    label = "AI" if pred == 1 else "Human"
    print(f"[{label}] {truevalue[i]}")
    i += 1

[Human] Human
[Human] Human
[Human] Human
[Human] Human
[AI] Human
[Human] AI
[Human] AI
[AI] AI
[AI] AI
[AI] AI
