<a href="https://colab.research.google.com/github/devansh1109/n8n_Workflow_Generator/blob/main/n8n_workflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import re
from transformers import pipeline, GPT2Tokenizer

# Load the DistilGPT-2 tokenizer; it is used below to truncate each workflow
# to a fixed token budget.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

# No summarization pipeline is created here: nothing in this notebook
# summarizes text, and loading facebook/bart-large-cnn would mean a ~1.6 GB
# download plus GPU memory held during fine-tuning.

# Raw workflow export (input) and cleaned prompt/completion JSONL (output)
input_file = "n8n_ai_workflows.json"
output_file = "n8n_workflows_cleaned.jsonl"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Read the raw workflow export; the cleaning loop iterates over `data`
# and calls `.get(...)` on each item.
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

cleaned_data = []  # List of {"prompt", "completion"} records, filled by the cleaning loop

In [None]:
# Build prompt/completion records from the raw workflow items.
# Records already present in `cleaned_data` are skipped, so the list stays
# duplicate-free and re-running this cell does not append everything twice.
seen_entries = {(entry["prompt"], entry["completion"]) for entry in cleaned_data}

for item in data:
    # Step 1: Extract relevant fields (tolerate missing keys and explicit nulls)
    title = item.get("title") or ""
    workflow_json = item.get("json") or ""
    if not isinstance(workflow_json, str):
        # The workflow is stored as a parsed object rather than a JSON string
        workflow_json = json.dumps(workflow_json)

    # Step 2: Clean text. `\s+` already matches newlines, carriage returns and
    # tabs, so a single substitution is enough to flatten the text.
    title = re.sub(r"\s+", " ", str(title)).strip()
    workflow_json = re.sub(r"\s+", " ", workflow_json).strip()

    # Step 3: Tokenize & truncate long workflows.
    # NOTE(review): truncating at 512 tokens can cut a workflow mid-JSON, so
    # the decoded completion is not guaranteed to be parseable.
    tokenized_workflow = tokenizer.encode(workflow_json, truncation=True, max_length=512)
    decoded_workflow = tokenizer.decode(tokenized_workflow)

    # Step 4: Create formatted entry (only when both prompt & completion exist)
    if title and decoded_workflow:
        formatted_entry = {"prompt": f"Create an n8n workflow for: {title}", "completion": decoded_workflow}
        entry_key = (formatted_entry["prompt"], formatted_entry["completion"])
        if entry_key not in seen_entries:
            seen_entries.add(entry_key)
            cleaned_data.append(formatted_entry)

In [None]:
# Serialise every cleaned record as one JSON document per line (JSONL).
with open(output_file, "w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(json.dumps(record) + "\n" for record in cleaned_data)

print(f"✅ Cleaned dataset saved to {output_file} with {len(cleaned_data)} entries!")

✅ Cleaned dataset saved to n8n_workflows_cleaned.jsonl with 331 entries!


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
from transformers import GPT2Tokenizer
from datasets import load_dataset
import json

# ✅ Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# GPT-2 ships without a padding token, so the EOS token is reused for padding
tokenizer.pad_token = tokenizer.eos_token

# ✅ Load dataset
# Same relative path the export cell writes to, so this does not depend on
# the working directory being Colab's /content.
dataset = load_dataset("json", data_files="n8n_workflows_cleaned.jsonl")

# Preprocess to ensure consistent format
def preprocess_dataset(example):
    """Make sure ``example["completion"]`` is a string.

    String completions are returned untouched, whether or not they contain
    valid JSON. Any other value (dict, list, number, None, ...) is serialised
    with ``json.dumps``.

    Args:
        example: Mapping with a ``"completion"`` key. It is updated in place.

    Returns:
        The same ``example`` object that was passed in.
    """
    completion = example["completion"]
    if not isinstance(completion, str):
        # Already-parsed JSON: store its string form so the column has one type
        example["completion"] = json.dumps(completion)
    return example

# Apply preprocessing: run preprocess_dataset over the loaded dataset so that
# non-string completions are stored as JSON strings. Rebinds `dataset`.
dataset = dataset.map(preprocess_dataset)

# ✅ Correct Tokenization Function
def tokenize_function(example):
    """Turn one prompt/completion record into a fixed-length training example.

    The record is rendered as an instruction/response text with a fenced JSON
    block, tokenized to exactly 512 tokens (truncated or padded), and given
    ``labels`` for causal-LM training.

    Args:
        example: Mapping with a string ``"prompt"`` and a ``"completion"`` that
            is either a JSON string or an already-parsed JSON value.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` list. Padded positions are set to -100 so the loss ignores
        them.
    """
    prompt = example["prompt"].strip()

    # Pretty-print the completion when it is valid JSON; otherwise keep the
    # raw text (for example a workflow that was truncated mid-document).
    try:
        if isinstance(example["completion"], str):
            completion_json = json.loads(example["completion"])
        else:
            completion_json = example["completion"]
        completion = json.dumps(completion_json, indent=4)
    except (json.JSONDecodeError, TypeError):
        completion = str(example["completion"]).strip()

    # Format for GPT-2 with explicit JSON markers
    text = f"### Instruction:\n{prompt}\n\n### Response (JSON):\n```json\n{completion}\n```"

    tokenized = tokenizer(text, truncation=True, padding="max_length", max_length=512)

    # Labels mirror input_ids, except on padding. The pad token is the EOS
    # token, so padding has to be identified through attention_mask; -100 is
    # the index the loss function skips.
    tokenized["labels"] = [
        token_id if is_real_token else -100
        for token_id, is_real_token in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# ✅ Tokenize example by example and drop the raw text columns
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=False,
    remove_columns=["prompt", "completion"],
)

# ✅ Sanity check: show the last training example, raw and decoded
last_example = tokenized_datasets["train"][-1]
print(last_example)
print(tokenizer.decode(last_example["input_ids"]))


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/331 [00:00<?, ? examples/s]

Map:   0%|          | 0/331 [00:00<?, ? examples/s]

{'input_ids': [21017, 46486, 25, 198, 16447, 281, 299, 23, 77, 30798, 329, 25, 13610, 11, 4296, 11, 290, 651, 257, 7034, 287, 11054, 5109, 9552, 24997, 363, 1831, 282, 1558, 930, 604, 812, 2084, 930, 3232, 198, 198, 21017, 18261, 357, 40386, 2599, 198, 15506, 63, 17752, 198, 90, 198, 220, 220, 220, 366, 312, 1298, 366, 16799, 1600, 198, 220, 220, 220, 366, 3672, 1298, 366, 16447, 11, 4296, 11, 290, 651, 257, 7034, 287, 11054, 5109, 9552, 1600, 198, 220, 220, 220, 366, 77, 4147, 1298, 685, 198, 220, 220, 220, 220, 220, 220, 220, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 366, 3672, 1298, 366, 2202, 12264, 705, 41049, 6, 1600, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 366, 4906, 1298, 366, 77, 23, 77, 12, 77, 4147, 12, 8692, 13, 805, 723, 48344, 1600, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 366, 9150, 1298, 685, 198, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 26481, 11, 198, 220, 220, 220, 220, 2

In [None]:
import random
import torch
from datasets import DatasetDict
from transformers import (
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer
)

# ✅ Pick the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 Using device: {device.upper()}")

# ✅ Shuffle & Split Dataset (Train: 90%, Test: 10%)
split_ratio = 0.9  # fraction of examples used for training
# Stored under its own name so the raw workflow list in `data` is not
# overwritten and the cleaning cell can still be re-run.
shuffled_data = tokenized_datasets["train"].shuffle(seed=42)

train_size = int(split_ratio * len(shuffled_data))
train_data = shuffled_data.select(range(train_size))
test_data = shuffled_data.select(range(train_size, len(shuffled_data)))

# ✅ Create DatasetDict
dataset = DatasetDict({
    "train": train_data,
    "test": test_data
})

print(f"📊 Dataset split: {len(dataset['train'])} train / {len(dataset['test'])} test")

# ✅ Load pre-trained DistilGPT-2
model = GPT2LMHeadModel.from_pretrained("distilgpt2").to(device)

# ✅ Define Training Arguments
# NOTE(review): newer transformers releases name the first strategy argument
# `eval_strategy`; confirm against the installed version before renaming.
training_args = TrainingArguments(
    output_dir="./distilgpt2-n8n",
    evaluation_strategy="epoch",  # evaluate on the test split after every epoch
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate=5e-5,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_dir="./logs",
    report_to="none",  # Disable WandB logs
    fp16=(device == "cuda"),  # mixed precision only when training on a GPU
    push_to_hub=False
)

# ✅ Build the Trainer around the model and the train/test splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=dataset["test"],
    train_dataset=dataset["train"],
)

# ✅ Run fine-tuning 🚀
trainer.train()

# ✅ Write the fine-tuned weights, then the tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./distilgpt2-n8n")
print("✅ Model training complete & saved!")


🔥 Using device: CUDA
📊 Dataset split: 297 train / 34 test


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.188287
2,No log,3.059629
3,No log,2.99119
4,No log,2.954287
5,No log,2.934397
6,No log,2.924622
7,2.987600,2.910562
8,2.987600,2.905667
9,2.987600,2.905737
10,2.987600,2.903521


✅ Model training complete & saved!


In [None]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# ✅ Reload the fine-tuned checkpoint (tokenizer and weights) from disk
model_path = "./distilgpt2-n8n"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# ✅ Wrap both in a text-generation pipeline
generator = pipeline(task="text-generation", tokenizer=tokenizer, model=model)


Device set to use cuda:0


In [None]:
prompt = "Give the JSON code for a n8n workflow, to send an automated message to a particular gmail address list once in a day"

# The model was fine-tuned on records shaped like
#   ### Instruction:\n<prompt>\n\n### Response (JSON):\n```json\n<workflow>\n```
# so the request is wrapped in the same template and the model continues
# from the opening JSON fence.
formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response (JSON):\n```json\n"

# max_new_tokens limits only the generated continuation; max_length would
# also count the prompt tokens.
output1 = generator(
    formatted_prompt,
    max_new_tokens=300,
    num_return_sequences=1,
    do_sample=True,
)

print(output1[0]['generated_text'])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Give the JSON code for a n8n workflow, to send an automated message to a particular gmail address list once in a day. How To Use This workflow is a breeze. You can send a single text message if you want it formatted in n8n or OpenAI. How To Enter Your Email: Fill out this form with your preferred email address: https://www.n8n.io/ How to Setup: Import the Email: OAuth2 credentials OpenAI credentials https://api.openai.com/v1/opena-authentication-classifier OpenAi-for-a-webhook-generator OpenAi-for-a-webhook-generator Configure an email URL: https://docs.openai.com/gmail-additional-api?r=2 | Obtain the latest OpenAi API key Ensure you create an email URL before sending your message (if you can't upload it as an image file in n8n). Fill out this form with your preferred email address. Set up the OpenAi API Key: Check the Gmail trigger (you should see this form in action). Create the Email: OpenAi API Key: Use the Gmail Trigger node to create your email. Use a set of OpenAi credentials to