In [2]:
import torch
from transformers import GPT2Model, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import pandas as pd
import pickle
from tqdm import tqdm
import gc


In [8]:
MAX_LENGTH = 50
model_name = "gpt2"
# model = AutoModelCausalLM.from_pretrained('gpt2')
encode_model = GPT2Model.from_pretrained('distilgpt2', output_hidden_states=True)
model = GPT2LMHeadModel.from_pretrained(model_name)
# recon_model = GPT2LMHeadModel.from_pretrained('gpt2')
# recon_model = GPT2LMHeadModel.from_pretrained("./finetuned_gpt2_embeddings")
# recon_tokenizer = GPT2Tokenizer.from_pretrained("./finetuned_gpt2_embeddings")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
encode_model.to(device)
# recon_model.to(device)

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-5): 6 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [11]:
def embed(text):
    input_ids = tokenizer(text,
                          return_tensors="pt",
                          padding="max_length",
                          truncation=True,
                          max_length=MAX_LENGTH)['input_ids']
    input_ids = input_ids.to(device)
    with torch.no_grad():
        embeddings = encode_model(input_ids).last_hidden_state

    del input_ids
    gc.collect()
    return embeddings

In [None]:

# Ensure model is in train mode
model.train()

# Custom dataset class
class EmbeddingTextDataset(torch.utils.data.Dataset):
    def __init__(self, embeddings, texts, tokenizer):
        self.embeddings = embeddings  # Precomputed input embeddings (tensor shape: [num_samples, seq_len, hidden_dim])
        self.texts = texts  # Corresponding text from tweets
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        input_embedding = self.embeddings[idx]  # Extract precomputed embedding
        text = self.texts[idx]
        tokenized_output = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)

        return {
            "inputs_embeds": input_embedding.squeeze(0),
            "labels": tokenized_output["input_ids"].squeeze(0)
        }

# Load both datasets
anti_brexit_df = pd.read_csv("./TweetDataset_AntiBrexit_Jan-Mar2022.csv")
pro_brexit_df = pd.read_csv("./TweetDataset_ProBrexit_Jan-Mar2022.csv")

# Extract the relevant text column
anti_brexit_texts = anti_brexit_df["Hit Sentence"].dropna().tolist()
pro_brexit_texts = pro_brexit_df["Hit Sentence"].dropna().tolist()

# Combine both datasets
texts = anti_brexit_texts + pro_brexit_texts

print(f"Loaded {len(texts)} tweets.")

# Generate embeddings
# embeddings = [embed(text) for text in tqdm(texts, desc="Generating Embeddings", leave=False)]
embeddings = list(map(embed, tqdm(texts, desc="Generating Embeddings", leave=False)))
# embeddings = list(map(embed, texts))


# Create dataset
dataset = EmbeddingTextDataset(embeddings, texts, tokenizer)
# def pickle_data(embeddings, texts):
def pickle_data(dataset):
    # dataset = {
    #     "embeddings": embeddings,
    #     "texts": texts
    # }

    # Save as a pickle file
    with open("synthetic_tweet_embeddings.pkl", "wb") as f:
        pickle.dump(dataset, f)

    print("Saved dataset to synthetic_tweet_embeddings.pkl")

pickle_data(dataset)


# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_embedding_finetune",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Train the model
trainer.train()

# Save fine-tuned model
model.save_pretrained("./finetuned_gpt2_embeddings")
tokenizer.save_pretrained("./finetuned_gpt2_embeddings")

Loaded 358205 tweets.


Generating Embeddings:  11%|█         | 38609/358205 [3:44:41<36:03:26,  2.46it/s]