In [None]:
pip install torch transformers




In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per item.

    Args:
        data: Sized, indexable collection of text samples (for example the
            list returned by ``file.readlines()``).
        tokenizer: Callable tokenizer. It is invoked with
            ``return_tensors='pt'``, ``max_length`` and ``truncation=True``
            and must return a mapping that contains ``'input_ids'``.
        max_length: Maximum number of tokens per sample handed to the
            tokenizer for truncation.

    Blank or whitespace-only string samples are dropped at construction time,
    so ``len(dataset)`` can be smaller than ``len(data)``. Such samples encode
    to at most one token, which gives a causal language model no next token
    to predict and turns the loss into NaN.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        # Index-based access so plain lists and torch Subset objects are handled alike.
        samples = [data[i] for i in range(len(data))]
        self.data = [sample for sample in samples if not self._is_blank(sample)]
        self.max_length = max_length

    @staticmethod
    def _is_blank(sample):
        """Return True for string samples that contain only whitespace."""
        return isinstance(sample, str) and not sample.strip()

    def __len__(self):
        """Number of non-blank samples kept by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return ``{'input_ids': 1-D tensor}``."""
        text = self.data[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, truncation=True)
        input_ids = encoding['input_ids']
        # The tokenizer returns a (1, seq_len) tensor for a single text. Drop the
        # leading axis so the DataLoader's collation adds the only batch dimension.
        if input_ids.dim() > 1:
            input_ids = input_ids.squeeze(0)
        return {'input_ids': input_ids}

def fine_tune_gpt2(model, dataloader, num_epochs=3, learning_rate=1e-5):
    """Fine-tune a language model on ``dataloader`` and save it to disk.

    Args:
        model: Model to train. It is moved to CUDA when available (CPU
            otherwise), called as ``model(input_ids, labels=labels)`` and must
            return an object with a ``.loss`` tensor. It must also provide
            ``save_pretrained``.
        dataloader: Iterable of batches; each batch is a mapping with an
            ``'input_ids'`` tensor.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate for the AdamW optimizer.

    Side effects:
        Updates ``model`` in place and writes it to the ``'fine_tuned_model'``
        directory via ``model.save_pretrained``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for _ in range(num_epochs):
        model.train()
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            # A sequence with fewer than two tokens has no next-token target;
            # its language-modeling loss is NaN, so skip it.
            if input_ids.size(-1) < 2:
                continue
            labels = input_ids.clone()

            optimizer.zero_grad()
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            # Never step the optimizer on a NaN/inf loss.
            if not torch.isfinite(loss):
                continue
            loss.backward()
            optimizer.step()

    # Save the fine-tuned model
    model.save_pretrained('fine_tuned_model')

# Load the raw corpus: every line of the file becomes one training sample
with open('/content/Laws_data.txt', mode='r', encoding='utf-8') as file:
    text_data = list(file)

# Build the GPT-2 tokenizer, wrap the lines in a dataset and serve them one per batch
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dataset = MyDataset(data=text_data, tokenizer=tokenizer)
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# Start from the pre-trained GPT-2 weights
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Run fine-tuning with the function's default epochs and learning rate
fine_tune_gpt2(model=gpt2_model, dataloader=dataloader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

# NOTE: this cell redefines the MyDataset class from the earlier training cell;
# keep the two definitions in sync (or delete one) to avoid silent shadowing.
class MyDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per item.

    Args:
        data: Sized, indexable collection of text samples (a list of lines or
            a ``torch.utils.data.Subset`` produced by ``random_split``).
        tokenizer: Callable tokenizer. It is invoked with
            ``return_tensors='pt'``, ``max_length`` and ``truncation=True``
            and must return a mapping that contains ``'input_ids'``.
        max_length: Maximum number of tokens per sample handed to the
            tokenizer for truncation.

    Blank or whitespace-only string samples are dropped at construction time,
    so ``len(dataset)`` can be smaller than ``len(data)``. Such samples encode
    to at most one token, which gives a causal language model no next token
    to predict and turns the loss into NaN.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        # Index-based access so plain lists and torch Subset objects are handled alike.
        samples = [data[i] for i in range(len(data))]
        self.data = [sample for sample in samples if not self._is_blank(sample)]
        self.max_length = max_length

    @staticmethod
    def _is_blank(sample):
        """Return True for string samples that contain only whitespace."""
        return isinstance(sample, str) and not sample.strip()

    def __len__(self):
        """Number of non-blank samples kept by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return ``{'input_ids': 1-D tensor}``."""
        text = self.data[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, truncation=True)
        input_ids = encoding['input_ids']
        # The tokenizer returns a (1, seq_len) tensor for a single text. Drop the
        # leading axis so the DataLoader's collation adds the only batch dimension.
        if input_ids.dim() > 1:
            input_ids = input_ids.squeeze(0)
        return {'input_ids': input_ids}

def fine_tune_gpt2(model, dataloader, val_dataloader, num_epochs=5, learning_rate=1e-5):
    """Fine-tune a language model, report validation loss per epoch, and save it.

    Args:
        model: Model to train. It is moved to CUDA when available (CPU
            otherwise), called as ``model(input_ids, labels=labels)`` and must
            return an object with a ``.loss`` tensor. It must also provide
            ``save_pretrained``.
        dataloader: Iterable of training batches; each batch is a mapping with
            an ``'input_ids'`` tensor.
        val_dataloader: Iterable of validation batches with the same layout.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate for the AdamW optimizer.

    Side effects:
        Prints the average validation loss after every epoch, updates
        ``model`` in place and writes it to the ``'fine_tuned_model'``
        directory via ``model.save_pretrained``.

    Batches whose sequences hold fewer than two tokens, or whose loss is not
    finite, are skipped in both loops: they carry no next-token target and
    would otherwise turn the reported average into ``nan``. If no validation
    batch is usable, the printed average is ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            # Nothing to predict for a sequence shorter than two tokens.
            if input_ids.size(-1) < 2:
                continue
            labels = input_ids.clone()

            optimizer.zero_grad()
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            # Never step the optimizer on a NaN/inf loss.
            if not torch.isfinite(loss):
                continue
            loss.backward()
            optimizer.step()

        # Evaluate on the validation set
        model.eval()
        total_val_loss = 0.0
        num_val_batches = 0  # only batches that produced a finite loss
        with torch.no_grad():
            for val_batch in val_dataloader:
                input_ids_val = val_batch['input_ids'].to(device)
                if input_ids_val.size(-1) < 2:
                    continue
                labels_val = input_ids_val.clone()
                outputs_val = model(input_ids_val, labels=labels_val)
                if not torch.isfinite(outputs_val.loss):
                    continue
                total_val_loss += outputs_val.loss.item()
                num_val_batches += 1

        avg_val_loss = total_val_loss / num_val_batches if num_val_batches else float('nan')
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Validation Loss: {avg_val_loss:.4f}")

    # Save the fine-tuned model
    model.save_pretrained('fine_tuned_model')

# Read the corpus, one sample per line. Blank lines are dropped: they encode to
# a single token, give the model nothing to predict and produce a NaN loss.
with open('/content/Laws_data.txt', 'r', encoding='utf-8') as file:
    text_data = [line for line in file if line.strip()]

# Split the data 80/20 into training and validation sets. The seeded generator
# makes the split identical on every Restart & Run All.
train_size = int(0.8 * len(text_data))
val_size = len(text_data) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    text_data,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Tokenize and create DataLoaders
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', additional_special_tokens=['[SPECIAL1]', '[SPECIAL2]'])
train_dataloader = DataLoader(MyDataset(train_dataset, tokenizer), batch_size=1, shuffle=True)
val_dataloader = DataLoader(MyDataset(val_dataset, tokenizer), batch_size=1, shuffle=False)

# Load pre-trained GPT-2 model
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
# The tokenizer gained two special tokens, so the embedding matrix must grow to
# match; otherwise their ids would index past the end of the embedding table.
gpt2_model.resize_token_embeddings(len(tokenizer))

# Fine-tune the model
fine_tune_gpt2(gpt2_model, train_dataloader, val_dataloader)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/5, Average Validation Loss: nan
Epoch 2/5, Average Validation Loss: nan
Epoch 3/5, Average Validation Loss: nan
Epoch 4/5, Average Validation Loss: nan
Epoch 5/5, Average Validation Loss: nan


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Restore the checkpoint written by the training cell and rebuild the tokenizer
# with the same two extra special tokens
fine_tuned_model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    additional_special_tokens=['[SPECIAL1]', '[SPECIAL2]'],
)

# Write tokenizer files and model weights side by side in one local directory
tokenizer.save_pretrained(save_directory='fine_tuned_model')
fine_tuned_model.save_pretrained(save_directory='fine_tuned_model')


In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

def generate_response(model, tokenizer, question, max_length=100):
    """Generate a beam-search continuation of ``question`` and return it as text.

    Args:
        model: Model providing ``generate``. It is moved to CUDA when
            available (CPU otherwise) and switched to eval mode.
        tokenizer: Tokenizer providing ``encode``, ``decode`` and
            ``eos_token_id``.
        question: Prompt text to continue.
        max_length: Value forwarded to ``model.generate(max_length=...)``.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The inputs are moved to `device`, so the model must live there too;
    # otherwise generation fails with a device mismatch on GPU machines.
    model.to(device)
    # Disable dropout so generation does not depend on a leftover training mode.
    model.eval()

    # Tokenize the input question
    input_ids = tokenizer.encode(question, return_tensors='pt').to(device)
    # Every prompt token is real (no padding): an all-ones mask with the same
    # integer dtype and device as the ids.
    attention_mask = torch.ones_like(input_ids)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=5,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
            attention_mask=attention_mask,
        )

    # Decode and return the generated response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Restore the fine-tuned weights and a tokenizer with the matching special tokens
fine_tuned_model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path='fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    additional_special_tokens=['[SPECIAL1]', '[SPECIAL2]'],
)

# Prompt the model and collect its continuation
question = " give me the number of laws in this book"
response = generate_response(model=fine_tuned_model, tokenizer=tokenizer, question=question)

# Show the prompt next to the generated text
print("Question:", question)
print("Response:", response)


Question:  give me the number of laws in this book
Response:  give me the number of laws in this book.

The Laws of Power: The Power of the Law and the Rule of Law in a Lawless World. Copyright © 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2525, 2626, 2727, 2828, 2929, 30


In [None]:


# Make Google Drive available inside the Colab filesystem under /content/drive
from google.colab import drive

drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [None]:
# Destination inside the Google Drive mounted at /content/drive by the cell above.
# A https://drive.google.com/... folder URL is not a filesystem path:
# save_pretrained would treat it as a local directory name and nothing would
# reach Drive.
# TODO: point the sub-folder at the Drive folder you actually want to use.
save_directory = '/content/drive/MyDrive/fine_tuned_model'

# Save the model and tokenizer
fine_tuned_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

# Optionally, save the model configuration explicitly as well
fine_tuned_model.config.save_pretrained(save_directory)
