In [5]:
!pip install -q -U transformers==4.35

In [None]:
import json
from typing import Tuple
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer
)

def load_model_and_tokenizer(model_name_or_path: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    return model, tokenizer

def generate_text(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    history: str = "",
    max_length: int = 100,
    temperature: float = 0.9
) -> str:
    full_prompt = history + "\n" + prompt
    input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=temperature,
        max_length=max_length
    )

    gen_text = tokenizer.decode(gen_tokens[0])

    return gen_text

model_name_or_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model, tokenizer = load_model_and_tokenizer(model_name_or_path)

num_iterations = 2
file_json = 'output1.json'

output_list = []
history = ""

# Few-shot examples
examples = [
    {"question": "What is the capital of France?", "answer": "The capital of France is Paris."},
    {"question": "Who is the president of the United States?", "answer": "The president of the United States is Joe Biden."}
]

# Add few-shot examples to history
for example in examples:
    history += f"Question: {example['question']}\nAnswer: {example['answer']}\n"

for i in range(num_iterations):
    prompt_Q = f"Generate adversarial question ({i + 1})"
    Question = generate_text(model, tokenizer, prompt_Q, history=history)
    history += "\n" + Question

    prompt_A = f'Answer to: {Question}'
    Answer = generate_text(model, tokenizer, prompt_A, history=history)
    history += "\n" + Answer

    output_list.append({'prompt': Question, 'generated_text': Answer})

with open(file_json, mode='w', encoding='utf-8') as jsonfile:
    json.dump(output_list, jsonfile, ensure_ascii=False, indent=4)


In [None]:
import json
from typing import Tuple
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer
)

def load_model_and_tokenizer(model_name_or_path: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Failed to load model and tokenizer: {e}")

def generate_text(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    max_length: int = 100,
    temperature: float = 0.9
) -> str:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=temperature,
        max_length=max_length
    )

    gen_text = tokenizer.decode(gen_tokens[0])

    return gen_text

# Load the model and tokenizer once
model_name_or_path = "gpt2"
model, tokenizer = load_model_and_tokenizer(model_name_or_path)

# Specify the number of iterations to generate text
num_iterations = 2
file_json = 'output1.json'

# Generate text and save to JSON file
output_list = []

for i in range(num_iterations):
    prompt_Q = f"Generate adversarial question ({i + 1})"
    Question = generate_text(model, tokenizer, prompt_Q)
    prompt_A = f'Answer to: {Question}'
    Answer = generate_text(model, tokenizer, prompt_A)
    output_list.append({'prompt': Question, 'generated_text': Answer})

# Write list of dictionaries to a JSON file
with open(file_json, mode='w', encoding='utf-8') as jsonfile:
    json.dump(output_list, jsonfile, ensure_ascii=False, indent=4)

In [None]:
import json
from typing import Tuple, Dict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer
)

def load_model_and_tokenizer(model_name_or_path: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Failed to load model and tokenizer: {e}")

def generate_text(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    max_length: int = 100,
    temperature: float = 0.9
) -> str:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=temperature,
        max_length=max_length
    )

    gen_text = tokenizer.decode(gen_tokens[0])

    return gen_text

# Load the model and tokenizer once
model_name_or_path = "gpt2"
model, tokenizer = load_model_and_tokenizer(model_name_or_path)

# Specify the number of iterations to generate text
num_iterations = 10  # Set this to your desired number
file_json = 'output1.json'

# Function to write a single entry to the JSON file
def write_entry_to_json(file_path: str, data: Dict, mode: str='a'):
    # Open the file in the append mode (or write mode if it's the first entry)
    with open(file_path, mode, encoding='utf-8') as file:
        if mode == 'w':  # If it's the first entry, write an opening bracket to start the JSON array
            file.write('[\n')
        else:  # For subsequent entries, write a comma to separate JSON objects
            file.write(',\n')
        json.dump(data, file, ensure_ascii=False, indent=4)

# Create the JSON file and write the first entry separately to handle the opening bracket
for i in range(num_iterations):
    prompt_Q = f"Generate adversarial question ({i + 1})"
    Question = generate_text(model, tokenizer, prompt_Q)
    prompt_A = f'Answer to: {Question}'
    Answer = generate_text(model, tokenizer, prompt_A)
    entry = {'prompt': Question, 'generated_text': Answer}

    if i == 0:
        write_entry_to_json(file_json, entry, 'w')
    else:
        write_entry_to_json(file_json, entry)

# Closing the JSON array with a bracket
with open(file_json, 'a', encoding='utf-8') as file:
    file.write('\n]')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [None]:
import csv
from typing import Tuple
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer
)

def load_model_and_tokenizer(model_name_or_path: str) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Failed to load model and tokenizer: {e}")

def generate_text(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    prompt: str,
    max_length: int = 100,
    temperature: float = 0.9
) -> str:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    gen_tokens = model.generate(
        input_ids,
        do_sample=True,
        temperature=temperature,
        max_length=max_length
    )

    gen_text = tokenizer.batch_decode(gen_tokens)[0]

    return gen_text

# Load the model and tokenizer once
model_name_or_path = "gpt2"
model, tokenizer = load_model_and_tokenizer(model_name_or_path)

# Specify the number of iterations to generate text
num_iterations = 2
file_csv='output1.csv'
# Generate text and write to CSV file
with open(file_csv, mode='w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['prompt', 'generated_text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for i in range(num_iterations):
        prompt_Q = f" Generate adversarical question ({i + 1})"
        Question = generate_text(model, tokenizer, prompt_Q)
        prompt_A=f'answer to {Question }'
        Answer = generate_text(model, tokenizer, prompt_A)
        writer.writerow({'prompt': Question, 'generated_text': Answer})

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


# The model that you want to train from the Hugging Face hub
model_name = "gpt2"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}


# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# # Check GPU compatibility with bfloat16
# if compute_dtype == torch.float16 and use_4bit:
#     major, _ = torch.cuda.get_device_capability()
#     if major >= 8:
#         print("=" * 80)
#         print("Your GPU supports bfloat16: accelerate training with bf16=True")
#         print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)


# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=400)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
import csv

with open('/content/drive/MyDrive/training.csv', 'r') as csvfile, open('/content/drive/MyDrive/new_output.csv', 'a', newline='') as outfile:
    reader = csv.reader(csvfile)
    writer = csv.writer(outfile)

    next(reader)  # Skip header row
    for row in reader:
        input_text = row[1]
        # Process input_text or perform any operations needed
        print(f"Input: {input_text}")

        # Append data from row[2] to new_output.csv
        output_data = row[2]  # Assuming row[2] contains the data you want to append
        writer.writerow([output_data])  # Write the data to new_output.csv


In [None]:

model_directory='gpt2'
import csv
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

with open('/content/drive/MyDrive/training.csv', 'r') as csvfile, open('/content/drive/MyDrive/new_output.csv', 'a', newline='') as outfile:
    reader = csv.reader(csvfile)
    writer = csv.writer(outfile)

    next(reader)
    for row in reader:
        input_text = row[1]

        encoded_input = tokenizer.encode(input_text, return_tensors='pt')
        output = model.generate(encoded_input, max_length=256)

        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

        print(f"Input: {input_text}")
        print(f"Output: {decoded_output}")

        writer.writerow([decoded_output]) # Write generated text to new csv

In [None]:
import csv

model_directory = 'gpt-2'
csv_file = 'questions_answers.csv'

model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Set pad token to eos token if it's not already set
if tokenizer.pad_token is None:
    # Add special tokens (eos_token as pad_token)
    tokenizer.add_special_tokens({'pad_token':tokenizer.eos_token})

# Generate a question
def generate_question(model, tokenizer, max_length=4048):
    prompt = "Generate a question: "
    generated = model.generate(tokenizer.encode(prompt, return_tensors="pt"), max_length=max_length)
    decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded_output

question = generate_question(model, tokenizer)
print("\nGenerated question:", question)

# Generate an answer to the question
def generate_answer(model, tokenizer, question, max_length=4048):
    generated = model.generate(tokenizer.encode(question, return_tensors="pt"), max_length=max_length)
    decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded_output

answer = generate_answer(model, tokenizer, question)
print("\nGenerated answer:", answer)

# Append the question and answer to the CSV file
def append_to_csv(csv_file, question, answer):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        question_answer = f'{question},{answer}\n'
        f.write(question_answer)

append_to_csv(csv_file, question, answer)
print("\nQuestion and answer appended to CSV file.")

In [None]:
# Append the question and answer to the CSV file
def append_to_csv(csv_file, question, answer):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        question_answer = f'{question},{answer}\n'
        f.write(question_answer)
csvfile='test.csv'
question='Q:'
answer="A:"
append_to_csv(csv_file, question, answer)


for i in range(100):
    append_to_csv(csv_file, question, answer)

In [None]:
# Append the question, answer, and answer1 to the CSV file
def append_to_csv(csv_file, question, answer, answer1):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        question_answer = f'{question},{answer},{answer1}\n'
        f.write(question_answer)

csv_file = 'test.csv'
question = 'Q:'
answer = 'A:'
answer1 = 'a2:'

append_to_csv(csv_file, question, answer, answer1)

for i in range(100):
    # Modify answer and answer1 here if needed
    modified_answer = f'A: {i}'  # Example modification
    modified_answer1 = f'a2: {i}'  # Example modification

    append_to_csv(csv_file, question, modified_answer, modified_answer1)


In [None]:
import csv
from transformers import AutoModelForCausalLM, AutoTokenizer
model_directory = '/home/khemanth/LLMs_Models/Hemanth_LLMs/snapshots/37892f30c23786c0d5367d80481fa0d9fba93cf8'
csv_file = 'questions_answers.csv'

model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Set pad token to eos token if it's not already set
if tokenizer.pad_token is None:
    # Add special tokens (eos_token as pad_token)
    tokenizer.add_special_tokens({'pad_token':tokenizer.eos_token})


question_instuction="""

Design a prompt that directs a language model to generate advanced adversarial questions, focusing on exposing weaknesses and testing the resilience of targeted models or systems. Make sure the prompt is concise, well-defined, and provides clear instructions on how to approach the task.

Questions:

Can you provide more context regarding the specific application or domain where these adversarial questions will be utilized?
Are there any specific criteria or guidelines for the types of questions you want the model to generate?
Should the adversarial questions follow a particular format or structure, such as multiple-choice, yes/no, or open-ended questions?
Do you have any specific considerations or requirements in mind while formulating the prompt?
How advanced would you like the language model to be in terms of generating these adversarial questions? Should it focus on complex linguistic patterns, domain-specific knowledge, or any other specific aspects?


"""
# Generate a question
def generate_question(model, tokenizer, max_length=4048):
    prompt = f"Generate a question: {question_instuction}\n Generate only questions"
    generated = model.generate(tokenizer.encode(prompt, return_tensors="pt"), max_length=max_length)
    decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded_output

question = generate_question(model, tokenizer)
print("\nGenerated question:", question)




answer_prompt=f"""
Chain of Thoughts:
"Imagine you are a helpful assistant. Before providing an answer, think about the question carefully and consider the following: (1) What is the main concept of the question? (2) Are there any specific details or requirements mentioned in the question? (3) Based on your understanding, what would be a reasonable and accurate answer?

NOTE:following below instructions
"As a knowledgeable and resourceful assistant, you have access to a vast amount of information. When answering questions, consider the context, implications, and potential follow-up questions that might arise. Strive to provide comprehensive and accurate answers while ensuring they are easy to understand.

Please generate a well-rounded and precise response, addressing the underlying concepts, implications, and connections to other related topics if necessary."

Few-Shot Learning:
"You are a helpful assistant with extensive knowledge across various domains. Below are examples of questions and answers. Study them carefully to understand the format and the depth of the responses.

Question 1: {question}?
Answer 1:......

"""




# Generate an answer to the question
def generate_answer(model, tokenizer, question, max_length=4048):
    generated = model.generate(tokenizer.encode(question, return_tensors="pt"), max_length=max_length)
    decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
    return decoded_output

answer = generate_answer(model, tokenizer, question=answer_prompt)
print("\nGenerated answer:", answer)

# Append the question and answer to the CSV file
def append_to_csv(csv_file, question, answer):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        question_answer = f'{question},{answer}\n'
        f.write(question_answer)


append_to_csv(csv_file, question, answer)

for i in range(100000):
    append_to_csv(csv_file, question, answer)

print("\nQuestion and answer appended to CSV file.")

In [None]:
import json
from typing import Dict, Any
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory and file paths
model_directory: str = '/home/khemanth/LLMs_Models/Hemanth_LLMs/snapshots/37892f30c23786c0d5367d80481fa0d9fba93cf8'
json_file: str = 'questions_answers.json'

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Ensure the pad token is set correctly
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Define the function to generate a question
def generate_question(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, question_instruction: str, max_length: int = 4048) -> str:
    """
    Generates a question based on the provided instruction using the given model and tokenizer.

    :param model: The language model to generate the question.
    :param tokenizer: The tokenizer for the language model.
    :param question_instruction: The instruction to guide the question generation.
    :param max_length: The maximum length of the generated question.
    :return: The generated question as a string.
    """
    try:
        prompt = f"Generate a question: {question_instruction}\n Generate only questions"
        generated = model.generate(tokenizer.encode(prompt, return_tensors="pt"), max_length=max_length)
        decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
        return decoded_output
    except Exception as e:
        print(f"An error occurred while generating the question: {e}")
        return ""

# Define the function to generate an answer
def generate_answer(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, answer_prompt: str, max_length: int = 4048) -> str:
    """
    Generates an answer to the provided prompt using the given model and tokenizer.

    :param model: The language model to generate the answer.
    :param tokenizer: The tokenizer for the language model.
    :param answer_prompt: The prompt to guide the answer generation.
    :param max_length: The maximum length of the generated answer.
    :return: The generated answer as a string.
    """
    try:
        generated = model.generate(tokenizer.encode(answer_prompt, return_tensors="pt"), max_length=max_length)
        decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
        return decoded_output
    except Exception as e:
        print(f"An error occurred while generating the answer: {e}")
        return ""

# Define the function to append the question and answer to the JSON file
def append_to_json(json_file: str, question: str, answer: str) -> None:
    """
    Appends the generated question and answer to a JSON file.

    :param json_file: The file path of the JSON file to append to.
    :param question: The generated question to append.
    :param answer: The generated answer to append.
    """
    try:
        data: Dict[str, Any] = {'question': question, 'answer': answer}
        with open(json_file, 'a') as f:
            json.dump(data, f)
            f.write('\n')
    except Exception as e:
        print(f"An error occurred while appending to JSON: {e}")

# Main execution workflow
if __name__ == "__main__":
    question_instruction: str = "..."  # Insert the actual question instruction here.
    question: str = generate_question(model, tokenizer, question_instruction)
    print("\nGenerated question:", question)

    answer_prompt: str = "..."  # Insert the actual answer prompt here.
    answer: str = generate_answer(model, tokenizer, answer_prompt)
    print("\nGenerated answer:", answer)

    append_to_json(json_file, question, answer)
    print("\nQuestion and answer appended to JSON file.")

In [None]:
import json
from typing import List, Dict, Any
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the directory and file paths
model_directory: str = '/path/to/cpp/code/generation/model'
json_file: str = 'cpp_code_dataset.json'

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_directory)
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Ensure the pad token is set correctly
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Define the function to generate a C++ code snippet
def generate_cpp_code(model: AutoModelForCausalLM, tokenizer: AutoTokenizer, prompt: str, max_length: int = 512) -> str:
    """
    Generates a C++ code snippet based on the provided prompt using the given model and tokenizer.

    :param model: The language model to generate the C++ code.
    :param tokenizer: The tokenizer for the language model.
    :param prompt: The prompt to guide the code generation.
    :param max_length: The maximum length of the generated code.
    :return: The generated C++ code as a string.
    """
    try:
        generated = model.generate(tokenizer.encode(prompt, return_tensors="pt"), max_length=max_length)
        decoded_output = tokenizer.decode(generated[0], skip_special_tokens=True)
        return decoded_output
    except Exception as e:
        print(f"An error occurred while generating the C++ code: {e}")
        return ""

# Define the function to append the code snippet to the JSON file
def append_to_json(json_file: str, code: str) -> None:
    """
    Appends the generated C++ code snippet to a JSON file.

    :param json_file: The file path of the JSON file to append to.
    :param code: The generated C++ code to append.
    """
    try:
        data: Dict[str, Any] = {'cpp_code': code}
        with open(json_file, 'a') as f:
            json.dump(data, f)
            f.write('\n')
    except Exception as e:
        print(f"An error occurred while appending to JSON: {e}")

# Main execution workflow
if __name__ == "__main__":
    # Define the advanced C++ coding prompt
    cpp_prompt: str = """
    You are an advanced C++ programmer tasked with demonstrating sophisticated algorithms and modern coding styles.
    Your code should include features like smart pointers, templates, and lambda functions, and exhibit algorithms such as graph traversal, dynamic programming, or sorting techniques.

    Few-shot examples:
    - Implement a template function for quicksort.
    - Use smart pointers to manage memory in a graph structure.
    - Write a lambda function to filter even numbers from a list.

    Chain of thought:
    - Consider the algorithm's time and space complexity.
    - Ensure proper use of C++11 (or newer) features.
    - Write clean and maintainable code with comments.

    generate a C++ code snippet that performs a merge sort on a vector of integers using lambda expressions for comparison:
    """

    # Generate the C++ code snippets and append them to a JSON file
    for i in range(1000000):  # Example: Generate 100 C++ code snippets
        cpp_code: str = generate_cpp_code(model, tokenizer, cpp_prompt,max_length=4048)
        print(f"\nGenerated C++ code snippet {i+1}:\n{cpp_code}")
        append_to_json(json_file, cpp_code)

    print("\nC++ code snippets appended to JSON file.")

In [None]:
# Append the question and answer to the CSV file
def append_to_csv(csv_file, question, answer):
    with open(csv_file, mode='a', newline='', encoding='utf-8') as f:
        question_answer = f'{question},{answer}\n'
        f.write(question_answer)


In [None]:
csv_file="hemanth_test.csv"
question="""

Design a prompt that directs a language model to generate advanced adversarial questions, focusing on exposing weaknesses and testing the resilience of targeted models or systems. Make sure the prompt is concise, well-defined, and provides clear instructions on how to approach the task.

Questions:

Can you provide more context regarding the specific application or domain where these adversarial questions will be utilized?
Are there any specific criteria or guidelines for the types of questions you want the model to generate?
Should the adversarial questions follow a particular format or structure, such as multiple-choice, yes/no, or open-ended questions?
Do you have any specific considerations or requirements in mind while formulating the prompt?
How advanced would you like the language model to be in terms of generating these adversarial questions? Should it focus on complex linguistic patterns, domain-specific knowledge, or any other specific aspects?


"""
answer=f"""
Chain of Thoughts:
"Imagine you are a helpful assistant. Before providing an answer, think about the question carefully and consider the following: (1) What is the main concept of the question? (2) Are there any specific details or requirements mentioned in the question? (3) Based on your understanding, what would be a reasonable and accurate answer?

NOTE:following below instructions
"As a knowledgeable and resourceful assistant, you have access to a vast amount of information. When answering questions, consider the context, implications, and potential follow-up questions that might arise. Strive to provide comprehensive and accurate answers while ensuring they are easy to understand.

Please generate a well-rounded and precise response, addressing the underlying concepts, implications, and connections to other related topics if necessary."

Few-Shot Learning:
"You are a helpful assistant with extensive knowledge across various domains. Below are examples of questions and answers. Study them carefully to understand the format and the depth of the responses.

Question 1: {question}?
Answer 1:......

"""


for i in range(100000):
    append_to_csv(csv_file, question, answer)

print("\nQuestion and answer appended to CSV file.")

In [None]:
from transformers import  AutoModelForCausalLM, AutoTokenizer
model_name='gpt2'
# Load the pre-trained model and tokenizer
model =  AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set pad token to eos token if it's not already set
if tokenizer.pad_token is None:
    # Add special tokens (eos_token as pad_token)
    tokenizer.add_special_tokens({'pad_token':tokenizer.eos_token})
# Define a function to generate answers
def generate_answer(question):
    # Encode the question
    input_ids = tokenizer.encode(question, return_tensors='pt')

    # Generate the answer
    outputs = model.generate(input_ids, max_length=100)

    # Decode the answer
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return decoded_output

# Test the function
question = "What is the capital of France?"
answer = generate_answer(question)
print(answer)

- Step 1: Preparing the Dataset
- Step 2: Indexing the Dataset
- Step 3: Creating the RAG System
- Step 4: Fine-Tuning the System (Optional)
- Step 5: Evaluation and Usage