In [93]:

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import matplotlib.pyplot as plt
from transformers import pipeline
import evaluate

In [94]:

# Load your custom dataset from CSV
df = pd.read_csv("train.csv")


# Reduce the DataFrame to 1,000 samples
df = df.head(2500)

print(df.head())

                                              answer  \
0  Yes, you can format the output text in Bash to...   
1  To install Python 3 on an AWS EC2 instance, yo...   
2  You can achieve the desired time format using ...   
3  Your current implementation is actually quite ...   
4  The use of 'self' in Python is quite different...   

                                            question  
0  How can I output bold text in Bash? I have a B...  
1  How can I install Python 3 on an AWS EC2 insta...  
2  How can I format the elapsed time from seconds...  
3  I am trying to create a matrix of random numbe...  
4  I am learning Python and have noticed extensiv...  


In [95]:
#  non-Python data
non_python_data = {
    "question": [
        "What is the capital of France?",
        "How to bake a cake?",
        "Who is the president of the USA?",
        "What is the largest planet?",
        "What is 2+2?",
        "What is the tallest mountain in the world?",
        "How do I grow tomatoes?",
        "What is the speed of light?",
        "Who wrote 'Romeo and Juliet'?",
        "What is the boiling point of water?",
        "What is the currency of Japan?",
        "How do I change a tire?",
        "Who discovered penicillin?",
        "What is the formula for water?",
        "What is the population of India?",
        "How to learn Spanish?",
        "What is the meaning of life?",
        "What is the square root of 64?",
        "Who painted the Mona Lisa?",
        "What is quantum physics?",
        "How to fix a leaking faucet?",
        "What is photosynthesis?",
        "Who was the first person on the moon?",
        "How do you make coffee?",
        "What is the capital of Italy?",
        "What is 10 times 10?",
        "Who is the founder of Microsoft?",
        "What is the chemical symbol for gold?",
        "What is the distance between the Earth and the Sun?",
        "How do I tie a tie?",
        "What is the tallest building in the world?",
        "How to lose weight?",
        "What is the smallest country in the world?",
        "What is artificial intelligence?",
        "How do I book a flight?",
        "What is the temperature today?",
        "What is the main ingredient in chocolate?",
        "Who invented the telephone?",
        "What is the GDP of the USA?",
        "How do I renew my passport?",
        "What is the capital of Canada?",
        "What is the meaning of 'serendipity'?",
        "How do I write a resume?",
        "What is the first element on the periodic table?",
        "How do I start a garden?",
        "What is the capital of Australia?",
        "How do I play the guitar?",
        "What is the Pythagorean theorem?",
        "What is the definition of democracy?",
        "How do I boil an egg?"
    ],
    "answer": ["I don't know the answer."] * 50  # All answers will be the same since these are non-Python questions
}

# Convert non-Python data to DataFrame
non_python_df = pd.DataFrame(non_python_data)



# Combine non-Python data with your custom dataset
combined_df = pd.concat([df, non_python_df], ignore_index=True)

combined_df.to_csv("combined_dataset.csv", index=False)


In [96]:
len(combined_df)

2550

In [97]:
dataset = Dataset.from_pandas(combined_df)

In [98]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token



In [99]:
def preprocess_data(examples):
    # Ensure the inputs are strings, replace None with an empty string if needed
    inputs = [str(q) if q is not None else "" for q in examples['question']]
    answers = [str(a) if a is not None else "" for a in examples['answer']]

    # Tokenizing the question inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenizıng the answers, ensuring the tokenizer is in the correct mode
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(answers, max_length=128, truncation=True, padding="max_length")

    # Set the labels in the model inputs
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


In [101]:
tokenized_datasets = dataset.map(preprocess_data, batched=True)


Map: 100%|██████████| 2550/2550 [00:01<00:00, 1656.59 examples/s]


In [102]:

# Check the first few entries to confirm format
print(dataset[0])

{'answer': 'Yes, you can format the output text in Bash to make it bold. Bash allows you to use special escape sequences for text decoration. To make some text bold in bash, you would use the escape sequence `\\033[1m`, and to reset the formatting, you would use `\\033[0m`. \n\nHere\'s how you can update your `echo` statement to print bold text:\n\n```bash\necho -e "\\033[1mSome Text\\033[0m"\n```\n\nIn this code:\n\n- The `-e` option of `echo` allows the interpretation of backslash escapes.\n- The `\\033[1m` sequence sets the text to be bold.\n- The `Some Text` part is the actual text that will be printed in bold.\n- The `\\033[0m` sequence resets the text formatting to the default, so anything printed afterwards will be in the default format.\n\nRemember that these escape sequences may not work in all terminals and circumstances, but they should work in most common situations.', 'question': 'How can I output bold text in Bash? I have a Bash script that prints some text to the screen 

In [103]:
# Load the pre-trained model
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [104]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [105]:
# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

In [106]:
# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")

  0%|          | 27/102120 [05:33<350:10:39, 12.35s/it]
 26%|██▌       | 500/1914 [44:07<2:01:40,  5.16s/it]

{'loss': 6.2443, 'grad_norm': 13.17029094696045, 'learning_rate': 1.4775339602925811e-05, 'epoch': 0.78}


                                                    
 33%|███▎      | 638/1914 [1:12:59<1:33:58,  4.42s/it]

{'eval_loss': 5.891693592071533, 'eval_runtime': 1027.8167, 'eval_samples_per_second': 2.481, 'eval_steps_per_second': 0.621, 'epoch': 1.0}


 52%|█████▏    | 1000/1914 [1:47:31<1:39:36,  6.54s/it]  

{'loss': 5.9488, 'grad_norm': 13.336487770080566, 'learning_rate': 9.550679205851621e-06, 'epoch': 1.57}


                                                       
 67%|██████▋   | 1276/1914 [2:36:13<1:03:56,  6.01s/it]

{'eval_loss': 5.8041510581970215, 'eval_runtime': 1201.7579, 'eval_samples_per_second': 2.122, 'eval_steps_per_second': 0.531, 'epoch': 2.0}


 78%|███████▊  | 1500/1914 [2:59:57<42:05,  6.10s/it]    

{'loss': 5.8445, 'grad_norm': 16.379030227661133, 'learning_rate': 4.32601880877743e-06, 'epoch': 2.35}


                                                     
100%|██████████| 1914/1914 [4:01:02<00:00,  7.56s/it]


{'eval_loss': 5.772108554840088, 'eval_runtime': 1093.2913, 'eval_samples_per_second': 2.332, 'eval_steps_per_second': 0.584, 'epoch': 3.0}
{'train_runtime': 14462.861, 'train_samples_per_second': 0.529, 'train_steps_per_second': 0.132, 'train_loss': 5.977731415719697, 'epoch': 3.0}


('./fine-tuned-gpt2\\tokenizer_config.json',
 './fine-tuned-gpt2\\special_tokens_map.json',
 './fine-tuned-gpt2\\vocab.json',
 './fine-tuned-gpt2\\merges.txt',
 './fine-tuned-gpt2\\added_tokens.json',
 './fine-tuned-gpt2\\tokenizer.json')

In [108]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [109]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    compute_metrics=compute_metrics
)


In [117]:
detailed_results = trainer.evaluate()
print("Detailed Evaluation Results:", detailed_results)


 36%|███▋      | 232/638 [50:51<17:47:24, 157.74s/it]

KeyboardInterrupt: 

In [111]:
# Load the fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./fine-tuned-gpt2")
tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-gpt2")

nlp = pipeline("text-generation", model=model, tokenizer=tokenizer)


In [118]:
# Generate responses
def generate_response(prompt):
    return nlp(prompt, max_length=50, num_return_sequences=1)[0]['generated_text']

# Test prompts
test_prompts = [
    "what is a array",
    "what is a function",
]

for prompt in test_prompts:
    print(f"Prompt: {prompt}")
    print(f"Response: {generate_response(prompt)}\n")

Prompt: what is a array
Response: what is a array to be used array for the using for by you array using function your is in array the array you
. is can of arrays you in use`, the,`` array Here array ` can Here the is
 array,Here

Prompt: what is a function
Response: what is a function with you to a you is is is you. function is you-() to the() the and is a is- in a not functions and to is_ it` use the- it it is how function that returns it.

