<a href="https://colab.research.google.com/github/fikrifaizz/llm-task-instruction-generator/blob/main/src/data_generator/generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import random
import re
from collections import Counter
import logging

In [16]:
class EnhancedDatasetGenerator:
    def __init__(self):
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        self.instruction_templates = {
            "reset_password": {
                "Shopee": [
                    "Open the Shopee mobile app",
                    "Tap on 'Me' tab at the bottom",
                    "Select 'Settings' from the menu",
                    "Tap on 'Account & Security'",
                    "Choose 'Password' option",
                    "Tap 'Forgot Password'",
                    "Enter your registered phone number or email",
                    "Verify with OTP sent to your device",
                    "Create and confirm your new password"
                ],
                "Tokopedia": [
                    "Open Tokopedia app or website",
                    "Go to login page",
                    "Click 'Lupa Password' (Forgot Password)",
                    "Enter your registered email or phone number",
                    "Click 'Kirim Kode' (Send Code)",
                    "Check SMS or email for verification code",
                    "Enter the verification code",
                    "Create a new password",
                    "Confirm the new password",
                    "Click 'Simpan' (Save)"
                ],
                "Lazada": [
                    "Open Lazada application",
                    "Navigate to sign-in page",
                    "Tap 'Forgot Password'",
                    "Enter your registered mobile number",
                    "Request OTP verification",
                    "Enter the OTP received via SMS",
                    "Set your new password",
                    "Confirm password by typing again",
                    "Save the new password"
                ],
                "Blibli": [
                    "Launch Blibli app or visit website",
                    "Click on 'Masuk' (Sign In)",
                    "Select 'Lupa Password'",
                    "Input your registered email address",
                    "Click 'Kirim Email Reset'",
                    "Check your email inbox",
                    "Click the reset link in email",
                    "Enter new password twice",
                    "Submit the password change"
                ]
            },
            "track_order": {
                "Shopee": [
                    "Open Shopee app and log in",
                    "Tap 'Me' at bottom navigation",
                    "Select 'Pembelian' (Purchases)",
                    "Find your order from the list",
                    "Tap on the order you want to track",
                    "View shipping status and timeline",
                    "Check estimated delivery date",
                    "Tap 'Lacak Paket' for detailed tracking"
                ],
                "Tokopedia": [
                    "Open Tokopedia application",
                    "Go to 'Daftar Transaksi' (Transaction List)",
                    "Locate the order you want to track",
                    "Tap on the specific order",
                    "View current order status",
                    "Check shipping progress",
                    "See estimated delivery time",
                    "Copy tracking number if needed"
                ],
                "Lazada": [
                    "Launch Lazada app",
                    "Sign in to your account",
                    "Tap 'Account' at bottom menu",
                    "Select 'My Orders'",
                    "Find the order to track",
                    "Tap 'Track Order'",
                    "View shipment progress",
                    "Check delivery status updates"
                ],
                "Blibli": [
                    "Open Blibli application",
                    "Access 'Akun Saya' (My Account)",
                    "Choose 'Pesanan Saya' (My Orders)",
                    "Select the order to track",
                    "View order timeline",
                    "Check current shipping status",
                    "Monitor delivery progress"
                ]
            },
            "add_payment": {
                "Shopee": [
                    "Open Shopee app and sign in",
                    "Tap 'Me' tab at bottom",
                    "Select 'ShopeePay' or 'Wallet'",
                    "Tap 'Add Payment Method'",
                    "Choose payment type (Card/Bank)",
                    "Enter payment details accurately",
                    "Add security verification if required",
                    "Confirm and save payment method",
                    "Verify with OTP if prompted"
                ],
                "Tokopedia": [
                    "Access Tokopedia app or website",
                    "Go to account settings",
                    "Select 'Payment' or 'Pembayaran'",
                    "Tap 'Tambah Metode Pembayaran'",
                    "Choose your preferred payment type",
                    "Fill in payment information",
                    "Verify account details",
                    "Save the payment method",
                    "Test with small transaction if needed"
                ],
                "Lazada": [
                    "Open Lazada application",
                    "Navigate to 'Account' section",
                    "Select 'Payment Options'",
                    "Tap 'Add Payment Method'",
                    "Choose payment type (Credit/Debit/Wallet)",
                    "Enter payment credentials",
                    "Verify through bank authentication",
                    "Confirm payment method addition",
                    "Set as default if desired"
                ],
                "Blibli": [
                    "Launch Blibli app",
                    "Go to profile/account settings",
                    "Select 'Metode Pembayaran'",
                    "Click 'Tambah Metode Baru'",
                    "Choose payment category",
                    "Input payment details",
                    "Complete verification process",
                    "Save payment method",
                    "Confirm successful addition"
                ]
            },
            "return_item": {
                "Shopee": [
                    "Open Shopee app and log in",
                    "Go to 'Me' then 'Pembelian'",
                    "Find the order with item to return",
                    "Tap 'Minta Pengembalian' (Request Return)",
                    "Select the item to return",
                    "Choose return reason from options",
                    "Upload photos of the item",
                    "Fill in additional details",
                    "Submit return request",
                    "Wait for seller confirmation",
                    "Follow return shipping instructions"
                ],
                "Tokopedia": [
                    "Access Tokopedia account",
                    "Navigate to 'Daftar Transaksi'",
                    "Locate order with return item",
                    "Click 'Ajukan Komplain/Return'",
                    "Select specific product",
                    "State reason for return",
                    "Provide supporting evidence",
                    "Submit return request",
                    "Track return status",
                    "Await resolution from seller"
                ],
                "Lazada": [
                    "Open Lazada application",
                    "Go to 'My Orders' section",
                    "Find order containing return item",
                    "Tap 'Return Item' button",
                    "Select items for return",
                    "Indicate return reason",
                    "Upload item condition photos",
                    "Confirm return details",
                    "Submit return application",
                    "Follow provided return instructions"
                ],
                "Blibli": [
                    "Launch Blibli app",
                    "Access 'Pesanan Saya'",
                    "Locate relevant order",
                    "Select 'Return Barang'",
                    "Choose item to return",
                    "Select return category",
                    "Describe the issue",
                    "Attach supporting images",
                    "Submit return request",
                    "Monitor return progress"
                ]
            }
        }
        self.sensitive_patterns = [
            r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',  # Credit card
            r'\b\d{3}-?\d{2}-?\d{4}\b',  # SSN-like
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',  # Email
            r'\b\d{10,15}\b',  # Phone numbers
            r'\bpassword:\s*\w+\b',  # Passwords
        ]

    def sanitize_sensitive_information(self, text):
        sanitized = text
        sanitized = re.sub(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
                           '[CARD_NUMBER]', sanitized)
        sanitized = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
                           '[EMAIL_ADDRESS]', sanitized)
        sanitized = re.sub(r'\b\d{10,15}\b', '[PHONE_NUMBER]', sanitized)
        if any(re.search(pattern, text) for pattern in self.sensitive_patterns):
            self.logger.warning(f"Sensitive information detected and sanitized: {text[:50]}...")
        return sanitized

    def ensure_diversity_and_generalization(self, dataset):
        category_counts = Counter(sample['category'] for sample in dataset)
        app_counts = Counter(sample['app'] for sample in dataset)

        self.logger.info("Dataset Distribution Analysis:")
        self.logger.info(f"Categories: {dict(category_counts)}")
        self.logger.info(f"Apps: {dict(app_counts)}")
        balanced_dataset = self.balance_dataset(dataset, category_counts)
        diverse_dataset = self.add_linguistic_diversity(balanced_dataset)
        generalized_dataset = self.add_cross_domain_samples(diverse_dataset)

        return generalized_dataset

    def balance_dataset(self, dataset, category_counts):
        target_count = sorted(category_counts.values())[len(category_counts) // 2]
        self.logger.info(f"Target samples per category: {target_count}")

        balanced_dataset = []

        for category in category_counts:
            category_samples = [s for s in dataset if s['category'] == category]
            current_count = len(category_samples)

            if current_count > target_count:
                # Undersample: randomly select target_count samples
                balanced_samples = random.sample(category_samples, target_count)
                self.logger.info(f"Undersampled {category}: {current_count} -> {target_count}")

            elif current_count < target_count:
                needed = target_count - current_count
                additional_samples = []

                for _ in range(needed):
                    base_sample = random.choice(category_samples).copy()
                    varied_sample = self.add_variation_to_sample(base_sample)
                    additional_samples.append(varied_sample)

                balanced_samples = category_samples + additional_samples
                self.logger.info(f"Oversampled {category}: {current_count} -> {target_count}")
            else:
                balanced_samples = category_samples

            balanced_dataset.extend(balanced_samples)

        random.shuffle(balanced_dataset)
        return balanced_dataset

    def add_variation_to_sample(self, sample):
        varied_sample = sample.copy()
        intent_variations = {
            "How do I": ["How can I", "What's the process to", "Steps to"],
            "Steps to": ["How to", "Process for", "Instructions to"],
            "How to": ["Steps to", "Process to", "Way to"]
        }

        original_intent = varied_sample['user_intent']
        for pattern, alternatives in intent_variations.items():
            if pattern in original_intent:
                new_pattern = random.choice(alternatives)
                varied_sample['user_intent'] = original_intent.replace(pattern, new_pattern)
                break
        instruction_variations = {
            "Open": ["Launch", "Start", "Access"],
            "Tap": ["Click", "Select", "Press"],
            "Click": ["Tap", "Select", "Press"],
            "Select": ["Choose", "Pick", "Tap"],
            "Enter": ["Input", "Type", "Fill in"]
        }

        varied_instructions = []
        for instruction in varied_sample['structured_instructions']:
            varied_instruction = instruction
            for original, alternatives in instruction_variations.items():
                if original in instruction:
                    replacement = random.choice(alternatives)
                    varied_instruction = instruction.replace(original, replacement, 1)
                    break
            varied_instructions.append(varied_instruction)

        varied_sample['structured_instructions'] = varied_instructions
        return varied_sample

    def add_linguistic_diversity(self, dataset):
        diverse_dataset = dataset.copy()
        paraphrase_patterns = [
            ("Navigate to", "Go to"),
            ("Locate", "Find"),
            ("Input", "Enter"),
            ("Choose", "Select"),
            ("Verify", "Confirm"),
            ("Monitor", "Check"),
            ("Access", "Open")
        ]

        for sample in diverse_dataset:
            if random.random() < 0.3:
                for original, replacement in paraphrase_patterns:
                    sample['structured_instructions'] = [
                        inst.replace(original, replacement)
                        for inst in sample['structured_instructions']
                    ]

        return diverse_dataset

    def add_cross_domain_samples(self, dataset):
        additional_templates = {
            "productivity": {
                "Google_Docs": [
                    "Open Google Docs in your browser",
                    "Click 'Create' or '+' for new document",
                    "Choose 'Blank document'",
                    "Start typing your content",
                    "Use toolbar for formatting",
                    "Click 'Share' to collaborate",
                    "Set sharing permissions",
                    "Save automatically to Drive"
                ],
                "Notion": [
                    "Open Notion app or website",
                    "Click 'New Page' in sidebar",
                    "Choose page template",
                    "Add title and content",
                    "Use blocks for organization",
                    "Share with team members",
                    "Set page permissions"
                ]
            }
        }
        cross_domain_samples = []
        for domain, apps in additional_templates.items():
            for app, instructions in apps.items():
                sample = {
                    "user_intent": f"How to create document in {app.replace('_', ' ')}",
                    "structured_instructions": [f"{i + 1}. {step}" for i, step in enumerate(instructions)],
                    "domain": domain,
                    "category": "create_document",
                    "app": app,
                    "complexity": self.assess_complexity(instructions),
                    "step_count": len(instructions)
                }
                cross_domain_samples.append(sample)

        self.logger.info(f"Added {len(cross_domain_samples)} cross-domain samples")

        return dataset + cross_domain_samples

    def quality_validation(self, dataset):
        validation_issues = []

        for i, sample in enumerate(dataset):
            required_fields = ['user_intent', 'structured_instructions', 'domain', 'category']
            missing_fields = [field for field in required_fields if field not in sample]
            if missing_fields:
                validation_issues.append(f"Sample {i}: Missing fields {missing_fields}")
            instructions = sample.get('structured_instructions', [])
            for j, instruction in enumerate(instructions):
                if not re.match(r'^\d+\.', instruction):
                    validation_issues.append(f"Sample {i}, Instruction {j}: Invalid format")
                if any(re.search(pattern, instruction) for pattern in self.sensitive_patterns):
                    validation_issues.append(f"Sample {i}, Instruction {j}: Contains sensitive info")
            if sample.get('step_count') != len(instructions):
                validation_issues.append(f"Sample {i}: Step count mismatch")

        if validation_issues:
            self.logger.warning(f"Found {len(validation_issues)} validation issues")
            for issue in validation_issues[:10]:
                self.logger.warning(issue)
        else:
            self.logger.info("Dataset passed all quality validations")

        return len(validation_issues) == 0

    def generate_variations(self, base_instructions, variation_count=3):
        variations = []
        variations.append(base_instructions.copy())

        for i in range(variation_count - 1):
            variant = []
            for step in base_instructions:
                if "tap" in step.lower():
                    variant.append(step.replace("Tap", "Click").replace("tap", "click"))
                elif "click" in step.lower():
                    variant.append(step.replace("Click", "Select").replace("click", "select"))
                else:
                    variant.append(step)
            variations.append(variant)
        return variations

    def create_intent_from_template(self, action, app):
        intent_patterns = {
            "reset_password": [
                f"How do I reset my password in {app}",
                f"Steps to reset password on {app}",
                f"How to change password in {app} app"
            ],
            "track_order": [
                f"Steps to track my order in {app}",
                f"How to track order on {app}",
                f"How do I check my order status in {app}"
            ],
            "add_payment": [
                f"Add payment method to {app}",
                f"How to add payment method in {app}",
                f"Steps to link payment method on {app}"
            ],
            "return_item": [
                f"How to return an item on {app}",
                f"Steps to return product in {app}",
                f"How do I return item bought on {app}"
            ]
        }
        return random.choice(intent_patterns[action])

    def assess_complexity(self, instructions):
        step_count = len(instructions)
        complex_indicators = ['verification', 'authentication', 'OTP', 'confirm', 'security']
        complex_count = sum(1 for step in instructions
                            for indicator in complex_indicators
                            if indicator.lower() in step.lower())
        if step_count <= 5 and complex_count <= 1:
            return "low"
        elif step_count <= 8 and complex_count <= 3:
            return "medium"
        else:
            return "high"

    def generate_dataset(self, samples_per_action=30):
        self.logger.info("Starting enhanced dataset generation...")
        raw_dataset = []
        for action, app_templates in self.instruction_templates.items():
            for app, base_instructions in app_templates.items():
                variations = self.generate_variations(base_instructions, 3)
                for i in range(samples_per_action):
                    instructions = random.choice(variations)
                    user_intent = self.create_intent_from_template(action, app)
                    sanitized_instructions = [
                        self.sanitize_sensitive_information(inst) for inst in instructions
                    ]

                    sample = {
                        "user_intent": user_intent,
                        "structured_instructions": [f"{i + 1}. {step}" for i, step in
                                                    enumerate(sanitized_instructions)],
                        "domain": "e-commerce",
                        "category": action,
                        "app": app,
                        "step_count": len(sanitized_instructions)
                    }
                    raw_dataset.append(sample)
        balanced_dataset = self.ensure_diversity_and_generalization(raw_dataset)
        is_valid = self.quality_validation(balanced_dataset)
        if is_valid:
            self.logger.info("Dataset generation completed successfully")
        else:
            self.logger.warning("Dataset generated with some quality issues")

        random.shuffle(balanced_dataset)
        return balanced_dataset

    def save_dataset(self, dataset, filename="dataset.json"):
        """Save dataset with comprehensive statistics"""

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)

        # Generate comprehensive statistics
        stats = self.generate_dataset_statistics(dataset)

        self.logger.info(f"Dataset saved: {len(dataset)} samples to {filename}")

        return stats

    def generate_dataset_statistics(self, dataset):
        stats = {
            "total_samples": len(dataset),
            "categories": dict(Counter(sample['category'] for sample in dataset)),
            "apps": dict(Counter(sample['app'] for sample in dataset)),
            "domains": dict(Counter(sample['domain'] for sample in dataset)),
            "step_count_distribution": dict(Counter(sample['step_count'] for sample in dataset)),
            "average_steps": sum(sample['step_count'] for sample in dataset) / len(dataset),
            "min_steps": min(sample['step_count'] for sample in dataset),
            "max_steps": max(sample['step_count'] for sample in dataset)
        }

        return stats

In [17]:
if __name__ == "__main__":
    # Script entry point: build the dataset, persist it to the default
    # JSON file, then print a short summary and the first sample.
    # The names below stay at module level so later cells can reuse them.
    print("Generating PDF-compliant enhanced dataset...")
    generator = EnhancedDatasetGenerator()
    dataset = generator.generate_dataset(samples_per_action=30)
    stats = generator.save_dataset(dataset)

    # Summary of what was generated.
    print("\nDataset Statistics:")
    print(f"Total samples: {stats['total_samples']}")
    print(f"Categories: {stats['categories']}")

    # Show one concrete record so the output format can be eyeballed.
    print("\nSample Generated Data:")
    print(json.dumps(dataset[0], indent=2, ensure_ascii=False))

Generating PDF-compliant enhanced dataset...

Dataset Statistics:
Total samples: 482
Categories: {'add_payment': 120, 'reset_password': 120, 'track_order': 120, 'return_item': 120, 'create_document': 2}

Sample Generated Data:
{
  "user_intent": "How to add payment method in Blibli",
  "structured_instructions": [
    "1. Launch Blibli app",
    "2. Go to profile/account settings",
    "3. Select 'Metode Pembayaran'",
    "4. Select 'Tambah Metode Baru'",
    "5. Choose payment category",
    "6. Input payment details",
    "7. Complete verification process",
    "8. Save payment method",
    "9. Confirm successful addition"
  ],
  "domain": "e-commerce",
  "category": "add_payment",
  "app": "Blibli",
  "step_count": 9
}
