**This file is meant to be run on Google Colab to make use of its free resources, such as saving models.**

In [None]:
from google.colab import drive
import datetime

# Attach Google Drive to the Colab filesystem; later cells read data from
# and save weights to paths under this mount point.
DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:

import pandas as pd

# Location of the articles dataset on Drive (replace with the actual file path)
articles_path = '/content/gdrive/MyDrive/chat_checker/Data/articles.parquet'
df = pd.read_parquet(articles_path)
# Last expression of the cell: renders the loaded DataFrame in the notebook
df



In [None]:

# Keep only the 'Title', 'Content', and 'generated_content' columns
new_df = df[['Title', 'Content', 'generated_content']]

# Reshape to long format: one row per (Title, text), with 'source' recording
# which of the two text columns the row came from
long_df = new_df.melt(
    id_vars='Title',
    value_vars=['Content', 'generated_content'],
    var_name='source',
    value_name='text',
)

# Binary label: 1 for rows taken from 'Content', 0 for rows taken from
# 'generated_content'
long_df['label'] = (long_df['source'] == 'Content').astype(int)

# 'source' is fully captured by 'label', so it is no longer needed
long_df = long_df.drop(columns='source')

# Last expression of the cell: renders the long DataFrame in the notebook
long_df


In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Pull the raw texts and their 0/1 labels out of the long-format DataFrame
# (replace this with your actual dataset loading code if needed)
texts = long_df['text'].tolist()
labels = long_df['label'].tolist()

# Hold out 25% of the examples for testing; the fixed seed keeps the split
# identical across runs
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=42,
)

# Tokenize both splits with the same pretrained BERT tokenizer and options
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_options = {'truncation': True, 'padding': True, 'return_tensors': 'pt'}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

# Labels as tensors so they can sit alongside the encodings in a TensorDataset
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)


def _build_dataset(encodings, label_tensor):
    """Bundle input ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)


# Datasets and loaders for both splits; only the training loader shuffles
train_dataset = _build_dataset(train_encodings, train_labels)
test_dataset = _build_dataset(test_encodings, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Define the TransformerBinaryClassifier model
class TransformerBinaryClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout, a linear head and softmax.

    Args:
        pretrained_model_name: Name passed to ``BertModel.from_pretrained``.
        hidden_size: Input width of the linear head; it must match the size of
            the encoder's pooled output.
        num_classes: Number of output classes.

    Note:
        ``forward`` returns softmax *probabilities*, not raw logits.
    """

    def __init__(self, pretrained_model_name='bert-base-uncased', hidden_size=768, num_classes=2):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_size, num_classes)
        # dim=1 normalizes across the class dimension of the head's output
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return per-class probabilities for a batch of tokenized inputs."""
        pooled = self.bert(input_ids, attention_mask=attention_mask)['pooler_output']
        class_scores = self.fc(self.dropout(pooled))
        return self.softmax(class_scores)

    # Evaluation on the test set
def model_eval(model, test_dataloader):
    """Evaluate ``model`` on ``test_dataloader`` and report its accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` whose
            output holds one score per class along dim 1.
        test_dataloader: Iterable yielding ``(input_ids, attention_mask,
            labels)`` batches of tensors.

    Returns:
        The accuracy over all batches, as computed by
        ``sklearn.metrics.accuracy_score``.

    Side effects:
        Leaves ``model`` in eval mode and prints the accuracy.
    """
    model.eval()

    # Run inference on whatever device the model lives on, so evaluation also
    # works after the model has been moved to a GPU. A model without
    # parameters gives no device to target, so inputs are left where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    all_preds = []
    all_labels = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_dataloader:
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask)
            # Predicted class = index of the highest score along dim 1
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.tolist())
            all_labels.extend(labels.tolist())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print("Test Accuracy:", accuracy)
    return accuracy



In [None]:
# Run this to test that the file path is correct: evaluate a freshly
# constructed model (no fine-tuning yet) and write its weights to Drive.
model = TransformerBinaryClassifier()
model_eval(model, test_dataloader)
model_save_path = '/content/gdrive/My Drive/chat_checker/final_weights/bert_binary_classifier.pth'  # Your path
initial_weights = model.state_dict()
torch.save(initial_weights, model_save_path)

In [None]:
# Instantiate the model
model = TransformerBinaryClassifier()

# Define optimizer and loss function.
# The model's forward() already applies softmax and returns probabilities,
# while nn.CrossEntropyLoss expects raw logits and applies log-softmax itself.
# Passing probabilities to it would apply softmax twice and flatten the loss,
# so the loss is computed as NLLLoss over log-probabilities instead, which
# equals cross-entropy on the underlying logits.
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.NLLLoss()

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(f'Epoch: {epoch}')
    # model_eval() below leaves the model in eval mode, so switch back to
    # train mode (dropout active) at the start of every epoch
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Clamp before the log so a probability that underflowed to 0 cannot
        # produce -inf / NaN gradients
        log_probs = torch.log(outputs.clamp_min(1e-12))
        loss = criterion(log_probs, labels)
        loss.backward()
        optimizer.step()

    # After each epoch: evaluate on the test set and keep the returned
    # accuracy, which is embedded in the checkpoint filename below
    accuracy = model_eval(model, test_dataloader)

    model_save_path = f'/content/gdrive/My Drive/Senior_Project/final_weights/{accuracy}bert_binary_classifier-epoch-{epoch}.pth'
    torch.save(model.state_dict(), model_save_path)

In [None]:
# Just save a model that is still in your session: write the current
# weights of `model` to a fixed "epoch-end" checkpoint on Drive.
model_save_path = '/content/gdrive/My Drive/Senior_Project/final_weights/bert_binary_classifier-epoch-end.pth'
current_weights = model.state_dict()
torch.save(current_weights, model_save_path)

In [None]:
# Test loading your model.
# Load from the same Drive location the path-test cell saves to. The bare
# relative filename used before pointed at the working directory, which no
# cell in this notebook writes to. Adjust to your own path if it differs.
checkpoint_path = '/content/gdrive/My Drive/chat_checker/final_weights/bert_binary_classifier.pth'
loaded_model = TransformerBinaryClassifier()
# map_location='cpu' lets a checkpoint saved from a GPU session load in a
# CPU-only runtime as well
loaded_model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
# Switch to inference mode (disables dropout); as the last expression of the
# cell this also renders the model summary
loaded_model.eval()

## Load existing model to continue training

In [None]:
# Rebuild the architecture and restore the weights saved after epoch 0
model = TransformerBinaryClassifier()
model.load_state_dict(torch.load('/content/gdrive/My Drive/Senior_Project/final_weights/0.9950835791543756bert_binary_classifier-epoch-0.pth'))

# Define optimizer and loss function.
# The model's forward() already applies softmax and returns probabilities,
# while nn.CrossEntropyLoss expects raw logits and applies log-softmax itself.
# Passing probabilities to it would apply softmax twice and flatten the loss,
# so the loss is computed as NLLLoss over log-probabilities instead, which
# equals cross-entropy on the underlying logits.
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.NLLLoss()

# Run one more pass over the training data, labelled as epoch 1
epoch = 1
print(f'Epoch: {epoch}')
model.train()
for batch in train_dataloader:
    input_ids, attention_mask, labels = batch
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask)
    # Clamp before the log so a probability that underflowed to 0 cannot
    # produce -inf / NaN gradients
    log_probs = torch.log(outputs.clamp_min(1e-12))
    loss = criterion(log_probs, labels)
    loss.backward()
    optimizer.step()

# After training: evaluate and embed the accuracy in the checkpoint filename
accuracy = model_eval(model, test_dataloader)

model_save_path = f'/content/gdrive/My Drive/Senior_Project/final_weights/{accuracy}bert_binary_classifier-epoch-{epoch}.pth'
torch.save(model.state_dict(), model_save_path)