In [2]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

# Check GPU availability
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



True
0
NVIDIA GeForce RTX 3080 Laptop GPU


In [3]:
# Step 1: Prepare Your Dataset
def format_data(row):
    return f"{row['question']}\n{row['code']}"

# Load your CSV
df = pd.read_csv('data/')
formatted_data = df.apply(format_data, axis=1)

# Save to a text file
with open('data/formatted_data.txt', 'w', encoding='utf-8') as f:
    for item in formatted_data:
        f.write("%s\n" % item)

PermissionError: [Errno 13] Permission denied: 'data/'

In [4]:
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

In [5]:
fine_tuned=True

In [6]:
def fine_tune_gpt2(model_name, dataset_file, output_dir, epochs=3, batch_size=2):
    # Load pre-trained GPT-2 model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load dataset
    dataset = load_dataset(dataset_file, tokenizer)

    # Create data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=500,
        evaluation_strategy="no",  # Disable evaluation to avoid the error
        fp16=True if torch.cuda.is_available() else False,  # Use 16-bit precision if on GPU
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        eval_dataset=None,  # Set to None to avoid evaluation
    )

    # Start training
    trainer.train()

    # Save the model and tokenizer
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return model, tokenizer
if not fine_tuned:
    model, tokenizer =  fine_tune_gpt2('gpt2', 'data/formatted_data.txt', './gpt2-finetuned', epochs=5, batch_size=4)

In [6]:
import shutil
import os
if not os.path.exists('./gpt2-finetuned/checkpoint-10000/vocab.json'):
    shutil.copy('gpt2-finetuned/vocab.json', 'gpt2-finetuned/checkpoint-10000/vocab.json')
if not os.path.exists('./gpt2-finetuned/checkpoint-10000/merges.txt'):
    shutil.copy('gpt2-finetuned/merges.txt', 'gpt2-finetuned/checkpoint-10000/merges.txt')
if not os.path.exists('./gpt2-finetuned/checkpoint-10000/tokenizer_config.json'):
    shutil.copy('gpt2-finetuned/tokenizer_config.json', 'gpt2-finetuned/checkpoint-10000/tokenizer_config.json')

# Step 4: Evaluate and Use the Model
max_length_ = 1000
def generate_code(prompt, model, tokenizer, max_length=max_length_):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./gpt2-finetuned/checkpoint-10000')
tokenizer = GPT2Tokenizer.from_pretrained('./gpt2-finetuned/checkpoint-10000')


In [7]:
prompt = "Determine the last (alphabetically) word in a text containing multiple words separated by spaces"
code = generate_code(prompt, model, tokenizer)
print(code)

  return dynamo.is_compiling()
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Determine the last (alphabetically) word in a text containing multiple words separated by spaces.Text: This is a test text
"""

def last_word(text):
    words = text.split()
    last_word = ''
    for word in words:
        if word in last_word:
            last_word = word
            
    return last_word

if __name__ == '__main__':
    text = "This is a test text"
    print(last_word(text))
Create a function to find the longest word in a given string.String: "This is a test string"
def find_longest_word(string):
    longest_word = ''
    for word in string:
         if len(word) > len(longest_word):
              longest_word = word
     return longest_word

if __name__ == '__main__':
    string = "This is a test string"
    print(find_longest_word(string))
Create a function to find the longest word in a given string.String: "This is a test string"
def find_longest_word(string):
    longest_word = ''
    for word in string:
        if len(word) > len(longest_word):
            longe