In [66]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration




In [67]:
# Initialize RAG tokenizer, retriever, and generator
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
generator = RagTokenForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ImportError: 
RagRetriever requires the faiss library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Example question
question = "What is the capital of France?"

# Get answer from RAG model
input_dict = tokenizer(question, return_tensors="pt")
outputs = generator.generate(input_dict["input_ids"], num_return_sequences=1)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Provide feedback (correct answer)
correct_answer = "The capital of France is Paris."

# Update retriever knowledge base with feedback
feedback = {
    "question": question,
    "answer": correct_answer
}
retriever.update_index([feedback])

# Re-ask the question
outputs = generator.generate(input_dict["input_ids"], num_return_sequences=1)
updated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Original answer:", answer)
print("Corrected answer:", updated_answer)

In [68]:
from transformers import pipeline

def generate_employee_onboarding_json():
    # Initialize the text generation pipeline
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")

    # Define the prompt
    prompt = "Generate JSON structure for employee onboarding:\n"
    prompt += "Employee ID: EMP123\n"
    # Add more details as needed

    # Generate response
    response = generator(prompt, max_length=400, num_return_sequences=1, do_sample=True)[0]["generated_text"]

    # Extract JSON from the generated response
    start_index = response.find("{")
    end_index = response.rfind("}") + 1
    generated_json = response[start_index:end_index]

    return generated_json



In [70]:
from transformers import pipeline

def generate_employee_onboarding_json():
    # Initialize the text generation pipeline
    generator = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")

    # Define the prompt
    prompt = "Generate JSON structure for employee onboarding:\n"
    prompt += "{\n"
    prompt += "    $totals = $.parse(data);\n\n"
    prompt += "    if(typeof totals === \"number\") {\n"
    prompt += "        return totals;\n"
    prompt += "    }\n\n"
    prompt += "    var $this = this;\n\n"
    prompt += "    while($totals.index($totals)!== -1) {\n"
    prompt += "        $totals[$totals.text()] = $totals[$totals.text()];\n"
    prompt += "        ++$totals.length;\n"
    prompt += "    }\n\n"
    prompt += "    return $totals;\n"
    prompt += "}\n"

    # Generate response
    response = generator(prompt, max_length=400, num_return_sequences=1, do_sample=True)[0]["generated_text"]

    # Extract JSON from the generated response
    start_index = response.find("{")
    end_index = response.rfind("}") + 1
    generated_json = response[start_index:end_index]

    return generated_json

# Generate and print the JSON
generated_json = generate_employee_onboarding_json()
print(generated_json)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{
    $totals = $.parse(data);

    if(typeof totals === "number") {
        return totals;
    }

    var $this = this;

    while($totals.index($totals)!== -1) {
        $totals[$totals.text()] = $totals[$totals.text()];
        ++$totals.length;
    }

    return $totals;
}

When I set a custom function which is called automatically from the JSON data structure, it is outputting $totals as a String[] and not the whole json with my custom code. The user needs to remove that data as well as to insert some string into it. When this function is called for an empty list of JSON objects (which is fine), I need to print them to screen in a function. What I need to do is to create a function to read the values in the string.

A:

You can either write something like the following:
$this->functions->set($data,'totals');

Or you can do it the following way:
$this->functions = function(array $data, $key, $value) {
    if ('totals' === $data) {
            print 'Your Data '.$key.'\n '.$value.'\

In [71]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


In [72]:
# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [87]:
# Define dataset paths
train_dataset_path = "/home/gautam/Documents/wspace/video_Search/summary/ASR/train_dataset.json"
eval_dataset_path = "/home/gautam/Documents/wspace/video_Search/summary/eval_dataset_path.json"


In [88]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-employee-onboarding",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

In [89]:
# Load datasets
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_dataset_path,
    block_size=128,  # Adjust block size as needed
)
eval_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=eval_dataset_path,
    block_size=128,  # Adjust block size as needed
)

In [90]:
# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [91]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [92]:
# Fine-tune the model
trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:12<00:00,  4.10s/it]

{'train_runtime': 12.2906, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.244, 'train_loss': 2.3442653020222983, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=2.3442653020222983, metrics={'train_runtime': 12.2906, 'train_samples_per_second': 0.732, 'train_steps_per_second': 0.244, 'train_loss': 2.3442653020222983, 'epoch': 3.0})

In [97]:













# Save the model
trainer.save_model("./employee-onboarding")


In [94]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer



In [98]:
# Load fine-tuned GPT-2 model and tokenizer
model_path = "/home/gautam/Documents/wspace/video_Search/summary/employee-onboarding"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)




OSError: Can't load tokenizer for '/home/gautam/Documents/wspace/video_Search/summary/employee-onboarding'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/home/gautam/Documents/wspace/video_Search/summary/employee-onboarding' is the correct path to a directory containing all relevant files for a GPT2Tokenizer tokenizer.

In [None]:
# Generate JSON
prompt = "{\"employee_onboarding\": {\"employee_details\": {\"name\": \"John Smith\", \"position\": \"Data Scientist\"}}}"
inputs = tokenizer.encode(prompt, return_tensors="pt")
outputs = model.generate(inputs, max_length=512, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode and print generated JSON
generated_json = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_json)

In [100]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import Trainer, TrainingArguments

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-json-generation",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define dataset paths
train_dataset_path = "train_dataset.json"
eval_dataset_path = "eval_dataset.json"

# Define training and evaluation datasets (assuming they are in JSON format)
# Use your custom dataset class if needed
train_dataset = JSONDataset(
    tokenizer=tokenizer,
    file_path=train_dataset_path,
    block_size=128,  # Adjust block size as needed
)
eval_dataset = JSONDataset(
    tokenizer=tokenizer,
    file_path=eval_dataset_path,
    block_size=128,  # Adjust block size as needed
)

# Define data collator (if needed)
# Use your custom data collator if needed

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model("./gpt2-finetuned-json-generation")


TypeError: 'str' object is not callable