In [1]:
#Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

Dataset of 200 distinct tasks per employee for 5 different roles (Backend, HR, Sales, etc.).

Input: Tasks: Text descriptions of daily responsibilities.

Output: A structured DataFrame containing

*   task_text
*   employee_id
*   persona_name
*   embedding vector of the task


In [19]:
# CREATION OF THE SYNTHETIC DATASET:

import itertools
import random

import itertools
import random
import pandas as pd

# ==========================================
# üéØ STRATEGY: EXPANDED TEMPLATE
# ==========================================
# We define distinct "Components" for each role.
# - VERBS: Action words (some are unique, some overlap!)
# - TARGETS: The object of the work (highly specific to the role)
# - CONTEXTS: The reason/tool (adds realism and noise)
# ==========================================

personas = {
    # ---------------------------------------------------------
    # üíª BACKEND DEVELOPER
    # ---------------------------------------------------------
    "backend_dev": {
        "verbs": [
            "Debug", "Refactor", "Optimize", "Deploy", "Migrate", "Document",
            "Architect", "Secure", "Patch", "Monitor", "Scale", "Rollback",
            "Investigate", "Configure", "Test"
        ],
        "targets": [
            "the API endpoints", "the database schema", "the microservices architecture",
            "the caching layer", "the authentication service", "the legacy code",
            "the CI/CD pipeline", "the RESTful interface", "the GraphQL resolver",
            "the serverless function", "the load balancer", "the message queue",
            "the redis cluster", "the kubernetes pod", "the payment gateway"
        ],
        "contexts": [
            "to reduce latency", "to fix a critical bug", "for the new feature release",
            "to improve scalability", "before the code freeze", "as requested by the tech lead",
            "to handle high traffic", "to prevent SQL injection", "using Docker containers",
            "following TDD practices", "to reduce cloud costs", "after the security audit",
            "to fix the memory leak", "during the sprint planning", "for the v2 migration"
        ]
    },

    # ---------------------------------------------------------
    # üìä DATA SCIENTIST
    # ---------------------------------------------------------
    "data_scientist": {
        "verbs": [
            "Train", "Evaluate", "Clean", "Visualize", "Fine-tune", "Collect",
            "Preprocess", "Annotate", "Cluster", "Forecast", "Interpret", "Benchmark",
            "Analyze", "Model", "Extract"
        ],
        "targets": [
            "the classification model", "the customer dataset", "the training pipeline",
            "the exploratory analysis", "the hyperparameters", "the ground truth labels",
            "the neural network", "the predictive algorithm", "the A/B test results",
            "the sentiment analysis", "the churn prediction model", "the feature engineering script",
            "the user behavior logs", "the regression baseline", "the anomaly detection system"
        ],
        "contexts": [
            "to improve accuracy", "for the quarterly report", "using PyTorch",
            "to reduce overfitting", "for the recommendation engine", "to detect anomalies",
            "using Python and Pandas", "to minimize bias", "for the stakeholder presentation",
            "using a Random Forest", "to increase recall", "on the cloud cluster",
            "to support the product team", "using Jupyter notebooks", "to validate the hypothesis"
        ]
    },

    # ---------------------------------------------------------
    # ü§ù HR SPECIALIST
    # ---------------------------------------------------------
    "hr_specialist": {
        "verbs": [
            "Interview", "Onboard", "Draft", "Review", "Organize", "Update",
            "Negotiate", "Schedule", "Mediate", "Approve", "Survey", "Announce",
            "Coordinate", "Manage", "Screen"
        ],
        "targets": [
            "the new candidates", "the employment contracts", "the team building event",
            "the performance reviews", "the employee handbook", "the benefits package",
            "the salary bands", "the exit interviews", "the diversity initiative",
            "the PTO policy", "the visa applications", "the quarterly feedback",
            "the payroll discrepancies", "the training workshop", "the recruitment drive"
        ],
        "contexts": [
            "for the engineering team", "to ensure compliance", "for the annual retreat",
            "before the hiring freeze", "to improve retention", "regarding the new policy",
            "using the ATS system", "to boost company culture", "following labor laws",
            "for the leadership role", "to address complaints", "during the open enrollment",
            "to align with budget", "for the summer interns", "to support mental health"
        ]
    },

    # ---------------------------------------------------------
    # üéß CUSTOMER SUPPORT
    # ---------------------------------------------------------
    "customer_support": {
        "verbs": [
            "Resolve", "Escalate", "Document", "Reply to", "Verify", "Troubleshoot",
            "Refund", "Apologize for", "Clarify", "Walk through", "Prioritize", "Close",
            "Follow up on", "Investigate", "Merge"
        ],
        "targets": [
            "the user ticket", "the billing dispute", "the feature request",
            "the angry email", "the bug report", "the login issue",
            "the subscription cancellation", "the password reset", "the shipping delay",
            "the enterprise account", "the chat transcript", "the duplicate ticket",
            "the service outage complaint", "the refund request", "the account lockout"
        ],
        "contexts": [
            "within the SLA time", "to the technical team", "in the helpdesk system",
            "to improve CSAT score", "for the VIP client", "via live chat",
            "per the refund policy", "to de-escalate the situation", "using the admin panel",
            "before the shift ends", "to avoid negative reviews", "with the manager's approval",
            "after verifying identity", "to prevent churn", "as a high priority"
        ]
    },

    # ---------------------------------------------------------
    # üì¢ MARKETING SPECIALIST
    # ---------------------------------------------------------
    "marketing_specialist": {
        "verbs": [
            "Create", "Launch", "Optimize", "Analyze", "Schedule", "Proofread",
            "Design", "Brainstorm", "Promote", "Target", "Sponsor", "Publish",
            "Draft", "Monitor", "A/B test"
        ],
        "targets": [
            "the social media campaign", "the email newsletter", "the SEO keywords",
            "the blog post", "the ad budget", "the promotional video",
            "the landing page copy", "the brand guidelines", "the webinar series",
            "the influencer partnership", "the case study", "the press release",
            "the google ads account", "the content calendar", "the viral tweet"
        ],
        "contexts": [
            "for the holiday season", "to increase brand awareness", "on Instagram and LinkedIn",
            "using Google Analytics", "to drive website traffic", "before the product launch",
            "to improve conversion rate", "for the target audience", "with the design team",
            "to maximize ROI", "for the Q4 strategy", "using Canva templates",
            "to grow the mailing list", "referencing the new logo", "to engage followers"
        ]
    }
}

# ==========================================
# ‚öôÔ∏è GENERATION LOOP
# ==========================================
all_synthetic_data = []

print("üöÄ Generating Synthetic Data...")

# Loop through each persona
for persona_name, components in personas.items():
    # 1. Generate ALL possible combinations (Cartesian Product)
    #    (15 verbs * 15 targets * 15 contexts = 3,375 unique sentences per role!)
    combinations = list(itertools.product(
        components["verbs"],
        components["targets"],
        components["contexts"]
    ))

    # 2. Shuffle to randomize
    random.shuffle(combinations)

    # 3. Select exactly 500 tasks per role
    #    This ensures a perfectly balanced dataset.
    selected = combinations[:500]

    for verb, target, context in selected:
        # Create the sentence structure
        sentence = f"{verb} {target} {context}."

        all_synthetic_data.append({
            "task_description": sentence,
            "role": persona_name
        })

# Create the DataFrame
df = pd.DataFrame(all_synthetic_data)

# Shuffle the final dataset so roles are mixed up
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Generate IDs for the roles automatically
role_to_id = {role: i for i, role in enumerate(df['role'].unique())}
df['employee_id'] = df['role'].map(role_to_id)

print(f"\n‚úÖ SUCCESS! Generated {len(df)} unique tasks.")
print(f"   - Roles: {list(role_to_id.keys())}")
print(f"   - Samples per role: {len(df) // len(role_to_id)}")
print("\nSample Data:")
print(df[['role', 'task_description']].head(10))

# OPTIONAL: Save to pickle if you want to use it immediately
# df.to_pickle("employee_tasks_v3.pkl")


üöÄ Generating Synthetic Data...

‚úÖ SUCCESS! Generated 2500 unique tasks.
   - Roles: ['hr_specialist', 'marketing_specialist', 'customer_support', 'data_scientist', 'backend_dev']
   - Samples per role: 500

Sample Data:
                   role                                   task_description
0         hr_specialist  Manage the employment contracts for the summer...
1         hr_specialist  Interview the performance reviews to support m...
2         hr_specialist  Screen the diversity initiative using the ATS ...
3  marketing_specialist     Promote the promotional video to maximize ROI.
4      customer_support  Investigate the duplicate ticket to avoid nega...
5        data_scientist  Evaluate the ground truth labels using Python ...
6      customer_support  Troubleshoot the bug report before the shift e...
7  marketing_specialist         Analyze the press release to maximize ROI.
8           backend_dev  Migrate the caching layer to prevent SQL injec...
9  marketing_specialist  

In [20]:
# 1. Imports:
#Embedding Library
!pip install -q sentence-transformers

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [21]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [22]:
# Map employees to ids
employee_id_map = {
    "backend_dev": 0,
    "data_scientist": 1,
    "hr_specialist": 2,
    "customer_support": 3,
    "marketing_specialist": 4
}
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# 1. Extract lists from our generated dictionary
# We need to separate the data into simple lists for the DataFrame
tasks = [item['task_description'] for item in all_synthetic_data]
personas = [item['role'] for item in all_synthetic_data]

# 2. Create Employee IDs (Map roles to numbers 0, 1, 2, 3, 4)
# We create a mapping dictionary automatically
unique_roles = sorted(list(set(personas)))
role_to_id = {role: i for i, role in enumerate(unique_roles)}
emp_ids = [role_to_id[role] for role in personas]

print(f"‚úÖ Extracted {len(tasks)} tasks.")
print(f"   Role Mapping: {role_to_id}")

# 3. Generate Embeddings
# This converts your text into the "numbers" the AI needs
print("\n‚è≥ Generating Embeddings (this might take a moment)...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = embedder.encode(tasks, show_progress_bar=True)

# 4. Form the DataFrame
# We build it exactly as you requested
df = pd.DataFrame({
    'task_description': tasks,
    'role': personas,
    'employee_id': emp_ids,
})

# Add embeddings (convert the big matrix into a column of arrays)
df['task_embedding'] = list(embeddings)

# 5. Reorder columns
desired_order = ["task_description", "task_embedding", "role", "employee_id"]
df = df[desired_order]

# 6. Shuffle the final dataset
# (Important! Otherwise all Backend tasks are at the top, and Marketing at the bottom)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n--- DATASET GENERATED ---")
print(f"Total Samples: {len(df)}")
print(f"Embedding Dimension: {embeddings.shape[1]}")
print(df.head())

# 7. Save to Pickle (Preserves the Embeddings correctly!)
df.to_pickle("employee_tasks_v3.pkl")
print("\nüíæ Saved to 'employee_tasks_v3.pkl'")

‚úÖ Extracted 2500 tasks.
   Role Mapping: {'backend_dev': 0, 'customer_support': 1, 'data_scientist': 2, 'hr_specialist': 3, 'marketing_specialist': 4}

‚è≥ Generating Embeddings (this might take a moment)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/79 [00:00<?, ?it/s]


--- DATASET GENERATED ---
Total Samples: 2500
Embedding Dimension: 384
                                    task_description  \
0  Manage the employment contracts for the summer...   
1  Interview the performance reviews to support m...   
2  Screen the diversity initiative using the ATS ...   
3     Promote the promotional video to maximize ROI.   
4  Investigate the duplicate ticket to avoid nega...   

                                      task_embedding                  role  \
0  [-0.024037765, 0.01650209, 0.037274428, 0.0190...         hr_specialist   
1  [0.019758852, 0.07619918, -0.062207058, 0.0389...         hr_specialist   
2  [-0.013584737, -0.022343501, -0.082616396, -0....         hr_specialist   
3  [-0.0052972673, -0.040276837, -0.060003854, -0...  marketing_specialist   
4  [-0.021659266, 0.022119937, 0.03243109, -0.036...      customer_support   

   employee_id  
0            3  
1            3  
2            3  
3            4  
4            1  

üíæ Saved to 'empl

## Second Approach: LLM generated tasks

Step 1: Set the Context (System Prompt)
If you are using a chat interface (ChatGPT/Gemini), paste this first.

System / Context: You are an expert dataset generator for a task classification AI. Your goal is to generate distinct, realistic, and semantically diverse task descriptions for specific corporate roles.

Rules for Generation:

No Overlap: Ensure tasks are specific to the role. A Backend Developer task should not sound like a Data Scientist task (e.g., avoid generic "Analyze data" for Backend; use "Analyze API logs" instead).

High Diversity: Use different verbs (Implement, Debug, Design, Review, Deprecate).

Professional Tone: Use the language actual professionals use in Jira/Asana tickets.

No Numbering/Bullet points: Output raw text, one task per line.
Wait for my input in order to begin.

In [23]:
backend_dev = [
    "Implement a multi-stage GraphQL mutation for batch inventory updates",
    "Design a RESTful API endpoint for partial resource updates using PATCH",
    "Define OpenAPI 3.0 specifications for the customer loyalty service",
    "Develop a versioning strategy for breaking changes in the mobile gateway",
    "Implement HATEOAS links in the order service response metadata",
    "Create a middleware for validating request payloads using JSON schema",
    "Design an idempotent endpoint for processing stripe payment callbacks",
    "Build a reverse proxy to aggregate responses from three internal microservices",
    "Implement content negotiation for exporting reports in CSV and JSON formats",
    "Optimize API response size by implementing field masking and sparse fieldsets",
    "Integrate Scalar or Swagger UI for real-time API documentation testing",
    "Develop a rate-limiting strategy using a Redis-backed leaky bucket algorithm",
    "Configure CORS policies to allow cross-origin requests from partner domains",
    "Design a WebSocket API for streaming live order status updates",
    "Build a mock server for frontend teams to simulate high-latency API responses",
    "Implement cursor-based pagination for the global product search endpoint",
    "Create a webhook registry for third-party developer integrations",
    "Develop a service discovery client for dynamic routing in the backend mesh",
    "Handle multipart/form-data for high-speed uploads of user profile images",
    "Optimize API latency by enabling Gzip and Brotli compression on the edge",
    "Refactor API error responses to follow the RFC 7807 Problem Details standard",
    "Implement a request correlator ID for end-to-end API request tracking",
    "Design a bulk-upload API endpoint with progress status polling",
    "Integrate a GraphQL gateway to federate multiple backend subgraphs",
    "Implement request signing for internal service-to-service API calls",
    "Develop a heart-beat endpoint for load balancer health checks",
    "Create a throttling mechanism for public-facing unauthenticated endpoints",
    "Optimize serialization performance using MessagePack for internal traffic",
    "Implement a circuit breaker for external weather API dependencies",
    "Design a data-driven API for generating dynamic form schemas",
    "Implement strict type checking for all incoming API request parameters",
    "Build an API analytics dashboard to track endpoint usage and latency",
    "Create a deprecation header for retired API versions in production",
    "Design a server-sent events (SSE) endpoint for real-time log streaming",
    "Implement a custom serializer for complex nested JSON objects",
    "Develop a policy-based API for handling GDPR data deletion requests",
    "Create a standardized response wrapper for all microservice endpoints",
    "Implement query parameter filtering for complex database lookups",
    "Build a proxy for sanitizing sensitive data in third-party API logs",
    "Design a file-stream API for downloading large encrypted datasets",
    "Implement a cache-control header strategy for CDN-level optimization",
    "Develop a robust retry logic for transient upstream API failures",
    "Create a schema registry to manage breaking changes in event payloads",
    "Implement a health-check suite for all downstream service dependencies",
    "Design a high-concurrency API for processing flash-sale transactions",
    "Build a gateway for translating legacy SOAP requests to modern REST",
    "Implement a language-switching middleware based on Accept-Language headers",
    "Develop a unified authentication wrapper for diverse API protocols",
    "Create a resource-locking API for distributed editing of shared documents",
    "Implement a custom validation decorator for recurring date-time patterns",
    "Optimize slow-running PostgreSQL queries using EXPLAIN ANALYZE",
    "Implement a database sharding strategy for the high-volume user table",
    "Write a flyway migration script to add partitioned tables for audit logs",
    "Configure Redis as a write-through cache for frequently accessed products",
    "Design a normalized schema for a multi-lingual product catalog",
    "Optimize database connection pooling settings for a high-concurrency app",
    "Implement a soft-delete mechanism with automated archival tasks",
    "Create a Gin-index for high-performance full-text search in Postgres",
    "Resolve a deadlock issue in the high-frequency transaction log table",
    "Set up a read-replica for offloading heavy analytical reporting queries",
    "Implement a Change Data Capture (CDC) pipeline using Debezium",
    "Design a time-series database schema for storing IoT telemetry data",
    "Optimize memory usage of a Redis cluster by implementing eviction policies",
    "Perform a database vacuum to reclaim storage from deleted records",
    "Implement row-level security policies for a multi-tenant database",
    "Design a secondary index strategy to optimize multi-column filtering",
    "Migrate legacy unstructured data into a strictly typed SQL schema",
    "Configure a NoSQL database for high-availability across multiple regions",
    "Implement a database audit trigger to track sensitive field modifications",
    "Optimizing cold-storage retrieval for archived database snapshots",
    "Set up a database monitoring dashboard for tracking slow query logs",
    "Implement a pessimistic locking strategy for concurrent booking attempts",
    "Design a data retention worker to purge expired session data",
    "Create a materialized view for pre-aggregating daily sales metrics",
    "Implement a failover strategy for the primary database cluster",
    "Optimize join performance by denormalizing specific high-read tables",
    "Configure SSL/TLS for all database connections in production",
    "Implement a custom database adapter for a legacy proprietary system",
    "Design a graph database schema for representing social connections",
    "Perform a data integrity audit to identify orphaned records",
    "Implement a bulk-insert optimization for the data ingestion pipeline",
    "Configure automatic database backups with point-in-time recovery",
    "Optimize indexing for range queries on timestamped log data",
    "Design a database migration rollback plan for a major release",
    "Implement a distributed lock using Redis for task synchronization",
    "Analyze and fix a memory leak in the database connection driver",
    "Create a composite key strategy for the multi-tenant billing table",
    "Implement a database-level encryption-at-rest for PII fields",
    "Develop a script to identify and remove unused database indexes",
    "Set up an automated alert for database disk space usage",
    "Optimize recursive CTE queries for deep tree structure traversal",
    "Configure a cache-aside pattern for reducing database load",
    "Implement a sequence-based ID generator for distributed databases",
    "Design a partitioned table strategy for historical payment data",
    "Build a database health-check service for the container orchestrator",
    "Implement a data-seeding script for development and staging environments",
    "Optimize the database buffer cache hit ratio for read-heavy workloads",
    "Create a view to simplify complex joins for the reporting frontend",
    "Implement a transactional outbox pattern for reliable event delivery",
    "Design a schema for a flexible attribute-value pair system",
    "Integrate OAuth2.0 and OpenID Connect for third-party logins",
    "Implement a JWT rotation and revocation strategy using a blacklist",
    "Secure user passwords using the Argon2 hashing algorithm with salt",
    "Perform a penetration test fix for a detected SQL injection vulnerability",
    "Configure Role-Based Access Control (RBAC) for the management portal",
    "Implement a Multi-Factor Authentication (MFA) flow via TOTP",
    "Secure sensitive environment variables using AWS Secrets Manager",
    "Implement a rate-limiter for the login endpoint to prevent brute force",
    "Fix a cross-site request forgery (CSRF) vulnerability in the API",
    "Audit the backend for insecure direct object references (IDOR)",
    "Implement a secure password reset flow with signed, expiring tokens",
    "Configure security headers including HSTS and X-Content-Type-Options",
    "Implement an Attribute-Based Access Control (ABAC) engine",
    "Encrypt PII data in the database using a field-level encryption SDK",
    "Set up a vulnerability scanner like Snyk in the CI/CD pipeline",
    "Implement mutual TLS (mTLS) for microservice-to-microservice calls",
    "Refactor the authentication middleware to support multiple providers",
    "Fix a timing attack vulnerability in the API key validation logic",
    "Implement a secure session management system with HttpOnly cookies",
    "Develop an audit log for all successful and failed login attempts",
    "Sanitize all user-generated content to prevent stored XSS attacks",
    "Configure IAM roles for the backend following least-privilege principles",
    "Implement a secure file upload validator for MIME types and headers",
    "Review and update SSL/TLS cipher suites for the production server",
    "Implement a system for periodic API key rotation for external partners",
    "Fix an issue where sensitive data was leaking into application logs",
    "Design a secure invitation system with single-use onboarding tokens",
    "Implement a firewall rule to restrict backend access to the VPN",
    "Audit third-party libraries for known CVEs and update as needed",
    "Implement a custom authorization guard for multi-tenant isolation",
    "Review the backend for potential XML External Entity (XXE) attacks",
    "Implement a secure vault for storing third-party integration tokens",
    "Develop a script to identify users with weak or compromised passwords",
    "Fix a privilege escalation bug in the user profile update service",
    "Implement a CAPTCHA verification step for suspicious sign-up traffic",
    "Configure a Web Application Firewall (WAF) to block known malicious IPs",
    "Implement a secure logout that invalidates both local and server sessions",
    "Review the backend for insecure cryptographic storage of old backups",
    "Implement a data masking service for non-production environments",
    "Develop a secure sandbox for executing user-provided script logic",
    "Fix a directory traversal vulnerability in the file-serving module",
    "Implement an automated lock-out policy after repeated failed logins",
    "Review the API for excessive data exposure in the JSON responses",
    "Implement a secure way to share temporary access to internal files",
    "Audit the backend for proper use of nonces in security handshakes",
    "Implement a robust identity provider (IdP) integration for SSO",
    "Review the code for hardcoded secrets or sensitive credentials",
    "Implement a secure proxy for outgoing requests to external APIs",
    "Configure automatic security patches for the production OS",
    "Implement a cross-service authentication token for the service mesh",
    "Fix a vulnerability in the password-reset token generation logic",
    "Decompose the monolithic billing service into three microservices",
    "Write a multi-stage Dockerfile for a Go-based backend service",
    "Configure a GitHub Actions pipeline for automated backend testing",
    "Debug a memory leak in the long-running image processing worker",
    "Optimize system latency by implementing a global CDN strategy",
    "Implement a circuit breaker pattern using Hystrix or Resilience4j",
    "Design a message-driven architecture using RabbitMQ and SQS",
    "Configure a Kubernetes Horizontal Pod Autoscaler for the API service",
    "Implement distributed tracing across the backend using Jaeger",
    "Build a CI/CD pipeline stage for automated canary deployments",
    "Optimize the backend container startup time for faster scaling",
    "Implement a centralized logging system using the ELK stack",
    "Design a disaster recovery plan with a 15-minute RTO/RPO",
    "Configure an Nginx load balancer with a round-robin strategy",
    "Implement auto-healing for backend pods that fail health checks",
    "Optimize the size of the Docker production image using Alpine",
    "Build an automated infrastructure provisioning script with Terraform",
    "Implement a service mesh for advanced traffic splitting and retries",
    "Configure an S3 bucket for high-availability static asset hosting",
    "Fix a bottleneck in the distributed task queue during peak load",
    "Implement a blue-green deployment strategy for the core API",
    "Optimize the backend build process to reduce CI pipeline runtime",
    "Configure a Prometheus exporter for tracking custom system metrics",
    "Implement a dead-letter queue for handling failed message processing",
    "Design a multi-region deployment for the user authentication service",
    "Implement a graceful shutdown handler for backend worker processes",
    "Optimize the database connection pool for serverless function execution",
    "Configure an API Gateway for unified authentication and routing",
    "Implement a system-wide telemetry strategy with OpenTelemetry",
    "Fix a race condition in the distributed locking mechanism",
    "Design a high-availability architecture for the Redis cache layer",
    "Implement a shadow deployment to test new features with live data",
    "Optimize the backend for ARM64 architecture to reduce cloud costs",
    "Configure a private VPC for secure backend-to-database traffic",
    "Implement a container orchestration strategy for background jobs",
    "Build an automated roll-back mechanism for failed production builds",
    "Optimize network throughput between microservices in the cluster",
    "Implement a self-service infrastructure portal for developers",
    "Fix a scaling issue where the load balancer was misrouting traffic",
    "Design a system for real-time monitoring of business-level KPIs",
    "Implement a global search service using a distributed indexer",
    "Configure automated vulnerability scanning for Docker images",
    "Implement a multi-tenant isolation strategy at the infrastructure level",
    "Optimize the backend's resource limits for better pod density",
    "Implement a strategy for managing database migrations in Kubernetes",
    "Design an event-sourcing architecture for the financial ledger",
    "Refactor a legacy monolithic function into a clean service layer",
    "Update outdated npm and pip dependencies to their latest stable versions",
    "Deprecate an unused legacy API endpoint and notify consumers",
    "Write unit tests for a complex business logic module with 0% coverage",
    "Replace a deprecated third-party library with a modern alternative",
    "Refactor the legacy database connector to use a modern ORM",
    "Document a legacy system's undocumented business rules and logic",
    "Clean up redundant code and dead functions in the core backend",
    "Update the backend project's README with setup and testing guides",
    "Rewrite a legacy Python 2 module into modern Python 3",
    "Implement integration tests for the legacy payment processing flow",
    "Refactor a 1000-line controller into smaller, testable components",
    "Migrate legacy environment variables into a secure secrets manager",
    "Improve the performance of an old, inefficient data export feature",
    "Add logging to a legacy service that is currently a black box",
    "Refactor the legacy error-handling logic to be more consistent",
    "Update the legacy CI pipeline to use modern build runners",
    "Deprecate the legacy XML API in favor of the new JSON-based API",
    "Rewrite an old shell-script cron job as a robust backend worker",
    "Perform a code cleanup to adhere to the latest style guidelines",
    "Identify and remove unused configuration files from the repository",
    "Refactor a legacy utility class to use dependency injection",
    "Update the project's Docker base image to a modern, secure version",
    "Migrate a legacy monolithic database into separate microservice DBs",
    "Refactor a complex switch-case block into a clean Strategy pattern",
    "Document the database schema for a legacy module using ER diagrams",
    "Increase the test coverage of the legacy user auth module to 80%",
    "Remove hardcoded configuration values and move them to .env files",
    "Refactor the legacy logging utility to support multiple output levels",
    "Replace an old, insecure hashing method with a modern algorithm",
    "Fix a long-standing bug in the legacy reporting engine",
    "Update the legacy backend's build scripts for better performance",
    "Refactor a legacy service to use async/await for better concurrency",
    "Document the API of a legacy service for the frontend team",
    "Migrate legacy file storage from the local disk to an S3 bucket",
    "Refactor a legacy data-seeding script to be idempotent",
    "Clean up the legacy project's Git history and remove large binaries",
    "Refactor the legacy routing logic to support dynamic path matching",
    "Update the legacy backend's testing framework to a modern version",
    "Replace a legacy SOAP client with a modern REST client",
    "Refactor a legacy singleton class to be thread-safe",
    "Document the deployment process for a legacy backend application",
    "Refactor a legacy validation logic to use a reusable validator",
    "Update the legacy project's license and dependency files",
    "Migrate legacy user data to a new, more efficient schema",
    "Refactor a legacy module to reduce its cyclomatic complexity",
    "Clean up the legacy project's temporary files and build artifacts",
    "Refactor a legacy notification service to support multiple channels",
    "Update the legacy backend's documentation on a shared wiki",
    "Migrate a legacy backend service from a VM to a container",
    "Refactor the legacy configuration loader for better error handling"
]

In [24]:
data_scientist = [
    # --- Data Cleaning & Preprocessing (50) ---
    "Clean raw customer data by removing duplicate entries in Pandas",
    "Impute missing age values using the median strategy",
    "Normalize numerical features using Min-Max scaling for neural nets",
    "Detect and remove outliers in transaction data using the IQR method",
    "Write a regex script to extract email domains from unstructured text",
    "Convert categorical variables into one-hot encoded vectors",
    "Handle time-zone discrepancies in global user activity logs",
    "Merge three disparate CSV datasets using a common user ID key",
    "Reshape the dataset from wide to long format using Pandas melt",
    "Fix character encoding issues in a legacy dataset export",
    "Standardize date formats across multiple data sources",
    "Filter out bot traffic from web analytics raw logs",
    "Create interaction features by multiplying correlated columns",
    "Bin continuous age data into categorical age groups",
    "Tokenize and lemmatize text data for NLP preprocessing",
    "Remove stop words from a corpus of customer reviews",
    "Handle class imbalance by upsampling the minority class with SMOTE",
    "Parse nested JSON columns into a flat tabular structure",
    "Calculate and fill missing values using K-Nearest Neighbors imputation",
    "Validate data schema constraints before ingestion into the pipeline",
    "Trim whitespace and sanitize string inputs in the address column",
    "Convert currency strings to floating-point numbers",
    "Drop features with zero variance or constant values",
    "Align sampling rates for two different time-series datasets",
    "Mask PII data to ensure compliance before analysis",
    "Correct spelling mistakes in user input using fuzzy matching",
    "winsorize extreme values to reduce the impact of outliers",
    "Calculate rolling averages to smooth out noisy sensor data",
    "Lag features to create historical context for time-series forecasting",
    "Decorrelate features using Principal Component Analysis (PCA)",
    "Split the dataset into stratified train, validation, and test sets",
    "Encode ordinal variables preserving the inherent order",
    "Extract day, month, and year components from a timestamp column",
    "Calculate the Haversine distance between two geospatial coordinates",
    "Log-transform skewed data distributions to approach normality",
    "Remove special characters and html tags from scraped web text",
    "Interpolate missing values in a time-series sequence",
    "Aggregate daily transaction logs into monthly summaries",
    "Identify and drop collinear features using a correlation matrix",
    "Parse user-agent strings to extract device and browser information",
    "Standardize text case to lowercase for consistent matching",
    "Map country codes to full country names using an external library",
    "Calculate the time difference between two consecutive events",
    "Flatten a hierarchical XML file into a pandas DataFrame",
    "Reindex the dataframe to fill in missing dates with zero values",
    "Validate the integrity of foreign keys between two dataframes",
    "Convert image data into flattened numpy arrays for processing",
    "Clean audio data by trimming silence and normalizing volume",
    "Extract hashtags and mentions from social media text data",
    "Consolidate multiple parquet files into a single dataset",

    # --- Machine Learning Models (50) ---
    "Train a Random Forest classifier to predict customer churn",
    "Fine-tune hyperparameters using Grid Search for an XGBoost model",
    "Implement a Logistic Regression model for binary classification",
    "Build a K-Means clustering algorithm for market segmentation",
    "Train a Convolutional Neural Network (CNN) for image recognition",
    "Develop a Recurrent Neural Network (RNN) for time-series forecasting",
    "Optimize model performance using Bayesian Optimization with Hyperopt",
    "Train a Support Vector Machine (SVM) with a radial basis function kernel",
    "Implement a Gradient Boosting Regressor for housing price prediction",
    "Build a collaborative filtering recommendation engine using Matrix Factorization",
    "Fine-tune a pre-trained BERT model for sentiment analysis",
    "Train a Decision Tree and visualize the feature importance split",
    "Implement a Naive Bayes classifier for spam email detection",
    "Build an autoencoder for anomaly detection in network traffic",
    "Train a LightGBM model on a large-scale tabular dataset",
    "Implement a Lasso regression to perform feature selection",
    "Train a deeply connected neural network using PyTorch",
    "Develop a reinforcement learning agent for a simulation environment",
    "Build a Latent Dirichlet Allocation (LDA) model for topic modeling",
    "Train a Prophet model to forecast seasonal sales trends",
    "Implement an Isolation Forest algorithm for fraud detection",
    "Tune the learning rate and batch size for a deep learning model",
    "Train a multi-layer perceptron (MLP) for non-linear regression",
    "Implement a k-Nearest Neighbors (k-NN) classifier for pattern recognition",
    "Build an ensemble model using stacking and blending techniques",
    "Train a sequence-to-sequence model for language translation",
    "Implement a Principal Component Analysis (PCA) for dimensionality reduction",
    "Train a Generative Adversarial Network (GAN) for synthetic data creation",
    "Optimize the objective function of a custom loss function",
    "Train a survival analysis model using Cox Proportional Hazards",
    "Implement a DBSCAN algorithm to find density-based clusters",
    "Build a hierarchy clustering model using a dendrogram",
    "Train a word2vec model to generate semantic word embeddings",
    "Implement a graph neural network (GNN) for social network analysis",
    "Train a CatBoost model to handle categorical features natively",
    "Develop a content-based recommendation system using cosine similarity",
    "Train a Ridge regression model to prevent overfitting",
    "Implement a Hidden Markov Model (HMM) for sequential data analysis",
    "Build a text summarization model using a Transformer architecture",
    "Train a fastText classifier for efficient text categorization",
    "Implement a one-class SVM for novelty detection",
    "Train a Siamese network for image similarity comparison",
    "Develop a Q-learning algorithm for optimizing a game strategy",
    "Train a U-Net architecture for biomedical image segmentation",
    "Implement a spectral clustering algorithm for complex structures",
    "Build a demand forecasting model using ARIMA",
    "Train a YOLO model for real-time object detection",
    "Implement a Self-Organizing Map (SOM) for visualization",
    "Train a neural collaborative filtering model for user ratings",
    "Optimize a neural network using the AdamW optimizer",

    # --- Visualization (50) ---
    "Generate a correlation heatmap using Seaborn to identify relationships",
    "Create an interactive dashboard in Tableau for executive reporting",
    "Plot a time-series line chart to visualize sales trends over years",
    "Design a box plot to show the distribution of user ages",
    "Create a scatter plot matrix to visualize pair-wise feature interactions",
    "Build a geographic choropleth map to show regional revenue",
    "Plot a confusion matrix to visualize classifier performance",
    "Generate a histogram to analyze the frequency distribution of errors",
    "Create a funnel chart to visualize conversion rates at each stage",
    "Design a bar chart comparing quarterly performance across teams",
    "Plot a ROC curve to evaluate the trade-off between sensitivity and specificity",
    "Create a word cloud to visualize the most frequent terms in reviews",
    "Build a Sankey diagram to visualize user flow through the website",
    "Plot a precision-recall curve for an imbalanced classification task",
    "Create a violin plot to visualize probability density of the data",
    "Design a radar chart to compare product features against competitors",
    "Generate a residual plot to check for heteroscedasticity in regression",
    "Create a 3D scatter plot to visualize clusters in three dimensions",
    "Build a treemap to show the hierarchical structure of expenses",
    "Plot a cumulative gain chart to assess model lift",
    "Create a waterfall chart to visualize cumulative effect of sequential values",
    "Design a sunburst chart to display multi-level hierarchical data",
    "Generate a pair plot to inspect feature distributions and correlations",
    "Create a dual-axis chart to compare two variables with different scales",
    "Build a Streamlit app for interactive model inference demos",
    "Plot a validation curve to diagnose overfitting and underfitting",
    "Create a heatmap to visualize missing data patterns",
    "Design a Gantt chart to visualize the timeline of the data project",
    "Generate a silhouette plot to evaluate the quality of clusters",
    "Create a bubble chart to visualize three dimensions of data",
    "Build a candlestick chart for analyzing stock price movements",
    "Plot the learning curves of a neural network training session",
    "Create a hexbin plot to visualize density in large datasets",
    "Design a dashboard in Power BI for real-time KPI monitoring",
    "Generate a swarm plot to show individual data points in distributions",
    "Create a donut chart to show percentage breakdown of a category",
    "Build a network graph to visualize connections between entities",
    "Plot a lag plot to check for autocorrelation in time series",
    "Create a strip plot to visualize 1D distributions",
    "Design a Pareto chart to identify the most significant factors",
    "Generate a dendrogram to visualize hierarchical clustering results",
    "Create a stacked area chart to show part-to-whole changes over time",
    "Build a custom matplotlib figure with multiple subplots",
    "Plot a calibration curve to assess the reliability of predicted probabilities",
    "Create a facet grid to compare distributions across multiple categories",
    "Design a bump chart to visualize changes in rank over time",
    "Generate a streamgraph to visualize organic changes in data themes",
    "Create a joy plot (ridgeline plot) for comparing distributions",
    "Build an interactive Plotly graph for web-based presentations",
    "Plot a feature importance bar chart for a random forest model",

    # --- Statistical Analysis (50) ---
    "Conduct an A/B test to compare two website landing pages",
    "Calculate the p-value to determine statistical significance",
    "Perform a Chi-square test of independence between two categorical variables",
    "Calculate the 95% confidence interval for the population mean",
    "Conduct a t-test to compare the means of two independent samples",
    "Perform an ANOVA test to compare means across three or more groups",
    "Calculate the Pearson correlation coefficient between two variables",
    "Perform a power analysis to determine the required sample size",
    "Test the normality of a distribution using the Shapiro-Wilk test",
    "Calculate the variance inflation factor (VIF) to detect multicollinearity",
    "Perform a Mann-Whitney U test for non-parametric comparison",
    "Calculate the standard error of the mean for a sample dataset",
    "Conduct a hypothesis test for the difference in proportions",
    "Perform a Granger causality test on time-series data",
    "Calculate the Z-score to standardize data points",
    "Conduct a paired t-test for before-and-after analysis",
    "Perform a Kruskal-Wallis test for non-parametric ANOVA",
    "Calculate the Spearman rank correlation for non-linear relationships",
    "Conduct a Levene's test to assess equality of variances",
    "Perform a Kolmogorov-Smirnov test to compare two distributions",
    "Calculate the odds ratio for a logistic regression coefficient",
    "Conduct a Tukey's HSD test for post-hoc analysis",
    "Perform a Durbin-Watson test to detect autocorrelation in residuals",
    "Calculate the coefficient of determination (R-squared) for a model",
    "Conduct a McNemar's test for paired nominal data",
    "Perform a Breusch-Pagan test for heteroscedasticity",
    "Calculate the kurtosis and skewness of a distribution",
    "Conduct a Fisher's exact test for small sample sizes",
    "Perform a log-likelihood ratio test to compare nested models",
    "Calculate the effect size using Cohen's d",
    "Conduct a sign test for non-parametric paired data",
    "Perform a Welch's t-test for samples with unequal variances",
    "Calculate the Bayesian Information Criterion (BIC) for model selection",
    "Conduct a Friedman test for repeated measures analysis",
    "Perform an Anderson-Darling test for goodness of fit",
    "Calculate the mean absolute error (MAE) for a regression model",
    "Conduct a Cochran's Q test for matched samples",
    "Perform a Bartlett's test for homogeneity of variances",
    "Calculate the root mean squared error (RMSE) for model evaluation",
    "Conduct a one-sample t-test against a known benchmark",
    "Perform a Wilcoxon signed-rank test for paired samples",
    "Calculate the Gini coefficient to measure inequality",
    "Conduct a factor analysis to identify underlying latent variables",
    "Perform a canonical correlation analysis between two sets of variables",
    "Calculate the Akaike Information Criterion (AIC) for model comparison",
    "Conduct a multivariate analysis of variance (MANOVA)",
    "Perform a cluster analysis to identify natural groupings",
    "Calculate the harmonic mean for rates and ratios",
    "Conduct a survival analysis using the Kaplan-Meier estimator",
    "Perform a sensitivity analysis to test model robustness",

    # --- Deployment & Pipelines (50) ---
    "Serialize the trained model using Python's pickle module",
    "Create a Docker container to serve the machine learning model",
    "Set up an MLflow experiment to track model parameters and metrics",
    "Develop a Flask API endpoint to serve model predictions",
    "Monitor the deployed model for data drift and concept drift",
    "Build a feature store to manage and serve features consistently",
    "Implement a CI/CD pipeline for automated model retraining",
    "Version control the large dataset using DVC (Data Version Control)",
    "Optimize the model for inference using ONNX Runtime",
    "Deploy the model to AWS SageMaker endpoints",
    "Schedule a daily batch prediction job using Apache Airflow",
    "Log model inference requests and responses for auditing",
    "Implement a shadow mode to test the new model against the old one",
    "Create a Kubernetes manifest for scaling the inference service",
    "Write unit tests for the data transformation pipeline",
    "Quantize the model weights to reduce memory footprint",
    "Set up alerts for model performance degradation via PagerDuty",
    "Wrap the model in a FastAPI application with async support",
    "Implement A/B testing logic at the inference load balancer level",
    "Save the model artifacts to an S3 bucket with versioning",
    "Create a reproducible training environment using Conda",
    "Implement input data validation using Great Expectations",
    "Deploy the model using TensorFlow Serving for high performance",
    "Build a dashboard to visualize production model latency",
    "Script the automated teardown of training resources after completion",
    "Implement a rollback strategy for failed model deployments",
    "Containerize the preprocessing steps alongside the model",
    "Set up a private PyPI repository for custom ML packages",
    "Configure a GPU-accelerated inference server",
    "Implement specific logging for prediction confidence intervals",
    "Deploy a serverless inference function using AWS Lambda",
    "Create a model card document to explain model usage and limitations",
    "Implement a caching layer for frequent prediction requests",
    "Set up a cron job to retrain the model on fresh data",
    "Benchmark the inference speed (RPS) under load",
    "Implement a blue-green deployment strategy for the ML service",
    "Secure the prediction API with an API key and rate limiting",
    "Monitor GPU utilization during inference using NVIDIA-SMI tools",
    "Integrate the model predictions into the production database",
    "Use TorchScript to serialize a PyTorch model for production",
    "Automate the generation of model performance reports post-deployment",
    "Implement a feedback loop to capture true labels for future training",
    "Optimize the Docker image size for faster cold starts",
    "Configure a load balancer to distribute traffic to model replicas",
    "Deploy the model to the edge using TensorFlow Lite",
    "Implement a custom health check endpoint for the inference service",
    "Set up distributed training across multiple nodes",
    "Create a script to warm up the model cache before serving traffic",
    "Implement a feature engineering pipeline using Scikit-learn pipelines",
    "Deploy a model registry to manage the lifecycle of ML models"
]

In [25]:
hr_specialist = [
    # --- Recruitment (50) ---
    "Screen resumes for the Senior Marketing Manager position",
    "Schedule a panel interview for the Engineering Director candidate",
    "Draft a job description for the new UX Designer role",
    "Extend a verbal offer to the top Sales candidate",
    "Source candidates for the specialized IT role on LinkedIn Recruiter",
    "Coordinate travel arrangements for out-of-town interviewees",
    "Review and update the applicant tracking system (ATS) status",
    "Conduct initial phone screenings to assess candidate fit",
    "Collaborate with hiring managers to define role requirements",
    "Post job openings on industry-specific job boards",
    "Manage the employee referral program bonus payouts",
    "Reject candidates who did not meet the role qualifications",
    "Negotiate salary and benefits with the prospective hire",
    "Prepare interview kits and scorecards for the hiring team",
    "Attend a university career fair to recruit entry-level talent",
    "Analyze time-to-fill metrics for the last quarter",
    "Conduct reference checks for the finalist candidate",
    "Send a pre-interview assessment to the Data Analyst applicant",
    "Organize a recruitment open house event",
    "Revise the offer letter template to reflect new legal clauses",
    "Monitor the diversity metrics of the applicant pool",
    "Follow up with passive candidates to gauge interest",
    "Schedule a debrief meeting with the interview panel",
    "Create a social media campaign for the summer internship program",
    "Verify the educational credentials of the potential hire",
    "Optimize the careers page for better search engine visibility",
    "Train hiring managers on unconscious bias in interviewing",
    "Managing the correspondence with third-party recruitment agencies",
    "Screen internal applications for the transfer request",
    "Calibrate the salary range for the new open position",
    "Conduct a background check initiation for the selected hire",
    "Schedule a technical assessment for the developer role",
    "Review the candidate experience survey results",
    "Draft a rejection email that encourages future applications",
    "Create a pipeline of potential candidates for future openings",
    "Discuss the counter-offer strategy with the department head",
    "Organize the logistics for a virtual hiring event",
    "Update the job posting to clarify remote work policies",
    "Screen cover letters for specific project experience",
    "Coordinate the final round presentation for the executive role",
    "Manage the intake meeting for a new requisition",
    "Review the recruitment budget for the fiscal year",
    "Assess the effectiveness of different job boards",
    "Conduct a video interview for the international candidate",
    "Prepare the formal written offer package",
    "Track the source of hire data for reporting",
    "Advise the hiring manager on market compensation trends",
    "Schedule a coding challenge review for the tech team",
    "Maintain communication with candidates in the talent pool",
    "Verify work authorization eligibility for the new recruit",

    # --- Onboarding & Offboarding (50) ---
    "Facilitate the new hire orientation session on Monday",
    "Conduct an exit interview to gather feedback from the departing employee",
    "Coordinate the return of company equipment with IT",
    "Revoke system access for the terminated employee",
    "Prepare the welcome kit and swag for the new starter",
    "Assign a buddy or mentor to the new team member",
    "Schedule the first-week check-in meeting with the new hire",
    "Process the resignation letter and notify relevant stakeholders",
    "Review the onboarding checklist to ensure all steps are complete",
    "Organize a team lunch to welcome the new employee",
    "Update the employee directory with the new hire's details",
    "Explain the company culture and values during orientation",
    "Collect the signed non-disclosure agreement (NDA)",
    "Coordinate the setup of the new hire's workstation",
    "Calculate the final paycheck including unused PTO",
    "Distribute the exit survey link to the resigning staff member",
    "Schedule a benefits overview session for the new cohort",
    "Notify the payroll department of the employee's start date",
    "Create the new employee's profile in the HRIS",
    "Conduct a 30-day onboarding satisfaction survey",
    "Process the COBRA continuation coverage paperwork",
    "Draft a farewell announcement for the retiring employee",
    "Ensure the I-9 verification documents are valid",
    "Review the offboarding protocol with the manager",
    "Assign mandatory compliance training to the new joiner",
    "Schedule a security badge photo session",
    "Provide the new hire with the building access key card",
    "Answer questions about the dress code and office hours",
    "Retrieve the corporate credit card from the departing manager",
    "Review the final expense report before reimbursement",
    "Introduce the new hire to key cross-functional partners",
    "Update the organizational chart with the recent changes",
    "Conduct a 90-day probationary period review",
    "Process the relocation stipend for the transferee",
    "Archive the personnel file of the former employee",
    "Send the 'What to Expect' email before the first day",
    "Confirm the mailing address for the final W-2 form",
    "Facilitate a knowledge transfer session before departure",
    "Review the non-compete clauses with the exiting executive",
    "Coordinate the remote setup for the work-from-home employee",
    "Provide instructions on how to roll over the 401k",
    "Schedule a meeting with the CEO for the new director",
    "Verify that the new hire has completed the tax forms",
    "Order business cards for the new sales representative",
    "Analyze the turnover data from exit interviews",
    "Update the onboarding presentation slides",
    "Resolve any login issues for the new hire's accounts",
    "Clarify the holiday schedule during orientation",
    "Process the sign-on bonus payment",
    "Confirm the termination date with the benefits provider",

    # --- Employee Relations (50) ---
    "Mediate a conflict resolution session between two coworkers",
    "Conduct a performance improvement plan (PIP) meeting",
    "Investigate a complaint regarding workplace misconduct",
    "Facilitate a feedback session for the annual review cycle",
    "Document a verbal warning for attendance issues",
    "Review the employee engagement survey results",
    "Advise a manager on how to deliver constructive criticism",
    "Organize a team-building activity to improve morale",
    "Handle a grievance filed by a staff member",
    "Conduct a stay interview to retain high-potential talent",
    "Draft a disciplinary action memo for policy violation",
    "Coach a new manager on effective leadership techniques",
    "Analyze trends in employee absenteeism",
    "Organize a town hall meeting for company updates",
    "Address concerns regarding the return-to-office policy",
    "Facilitate a focus group on workplace culture",
    "Review the promotion nomination for the senior analyst",
    "Investigate a report of bullying in the department",
    "Advise on the appropriate termination procedure",
    "Coordinate the recognition awards for the quarter",
    "Manage the communication regarding a restructuring",
    "Provide guidance on handling a difficult conversation",
    "Review the mid-year performance appraisal forms",
    "Investigate an anonymous tip from the ethics hotline",
    "Organize a stress management workshop",
    "Clarify the policy on outside employment",
    "Facilitate a discussion on career development goals",
    "Document the investigation findings in a formal report",
    "Advise the leadership team on employee sentiment",
    "Manage the logistics for the holiday party",
    "Address a complaint about an unsafe work environment",
    "Review the compensation adjustment request",
    "Conduct a training on effective communication skills",
    "Handle a request for a flexible work arrangement",
    "Investigate a claim of unfair treatment",
    "Facilitate a conflict management workshop",
    "Review the peer feedback for the 360 review",
    "Draft a memo regarding the dress code policy enforcement",
    "Advise on the protocol for a medical leave of absence",
    "Organize a diversity and inclusion listening session",
    "Address rumors circulating in the office",
    "Conduct a climate survey for a specific department",
    "Review the eligibility for the employee of the month",
    "Handle a dispute over shift scheduling",
    "Investigate a breach of the social media policy",
    "Facilitate a goal-setting workshop for the team",
    "Advise on the appropriate response to a client complaint against staff",
    "Manage the internal transfer process for an employee",
    "Conduct a follow-up meeting after a mediation",
    "Review the documentation for a termination for cause",

    # --- Benefits & Payroll (50) ---
    "Process the open enrollment changes for health insurance",
    "Answer employee questions about the 401k match",
    "Resolve a payroll discrepancy for the last pay period",
    "Manage the FMLA leave request documentation",
    "Audit the monthly benefits invoice for accuracy",
    "Process a garnishment order from the court",
    "Explain the high-deductible health plan options",
    "Update the payroll system with the new salary data",
    "Calculate the pro-rated pay for a mid-cycle hire",
    "Coordinate the annual flu shot clinic",
    "Review the disability claim status with the provider",
    "Manage the tuition reimbursement program applications",
    "Address a question about the commuter benefits deduction",
    "Process the year-end bonus payments",
    "Reconcile the PTO balances in the timekeeping system",
    "Explain the stock option vesting schedule",
    "Update the beneficiary information for life insurance",
    "Resolve an issue with the direct deposit setup",
    "Coordinate the wellness program challenge",
    "Review the worker's compensation claim details",
    "Process the parental leave pay",
    "Conduct a compensation benchmarking analysis",
    "Answer questions about the FSA submission deadline",
    "Audit the payroll tax filings",
    "Manage the gym membership subsidy program",
    "Explain the vision care coverage limits",
    "Process a salary advance request",
    "Update the tax withholding allowances (W-4)",
    "Review the overtime calculations for hourly staff",
    "Manage the sabbatical leave request",
    "Coordinate the distribution of total rewards statements",
    "Resolve a billing error with the dental carrier",
    "Process the commission payouts for the sales team",
    "Explain the short-term disability benefits",
    "Update the payroll schedule for the holiday week",
    "Manage the employee assistance program (EAP) utilization",
    "Review the executive compensation package",
    "Process a retroactive pay adjustment",
    "Answer questions about the HSA contribution limits",
    "Coordinate the biometric screening event",
    "Review the leave of absence policy with an expectant parent",
    "Audit the 401k plan for non-discrimination testing",
    "Process the payout for the referral bonus",
    "Explain the COBRA rates to a departing employee",
    "Manage the charity matching gift requests",
    "Review the state-specific payroll tax requirements",
    "Update the travel reimbursement rates",
    "Answer questions about the identity theft protection benefit",
    "Process the longevity bonus",
    "Reconcile the monthly headcount report with payroll",

    # --- Compliance & Policy (50) ---
    "Update the employee handbook to reflect new legislation",
    "Coordinate the annual sexual harassment prevention training",
    "Ensure compliance with the FLSA classification rules",
    "Review the diversity and inclusion initiative progress",
    "Audit the personnel files for required documentation",
    "Update the safety policy for the warehouse",
    "Monitor changes in state labor laws",
    "Conduct an internal audit of I-9 forms",
    "Draft a policy on the use of AI tools in the workplace",
    "Ensure the OSHA log is updated and posted",
    "Review the accommodations request under the ADA",
    "Update the remote work policy guidelines",
    "Coordinate the data privacy training for staff",
    "Review the whistleblower policy for compliance",
    "Ensure the labor law posters are up to date",
    "Draft a memo on the new vaccination policy",
    "Conduct a risk assessment for the office reentry plan",
    "Review the non-compete agreements for enforceability",
    "Monitor the affirmative action plan goals",
    "Update the drug-free workplace policy",
    "Ensure compliance with the GDPR for EU employees",
    "Review the social media policy for legal risks",
    "Conduct a pay equity audit",
    "Update the travel and expense policy",
    "Ensure the record retention policy is followed",
    "Draft a policy on consensual relationships in the workplace",
    "Review the background check vendor's compliance",
    "Monitor the completion of the code of conduct training",
    "Update the bereavement leave policy",
    "Ensure compliance with the WARN Act during layoffs",
    "Review the independent contractor agreements",
    "Draft a policy on political expression at work",
    "Update the dress code to be more inclusive",
    "Ensure the lactation room meets legal requirements",
    "Review the emergency evacuation procedures",
    "Monitor the completion of the cybersecurity training",
    "Update the jury duty leave policy",
    "Ensure compliance with the FMLA regulations",
    "Review the substance abuse testing protocol",
    "Draft a policy on the use of company vehicles",
    "Ensure the workplace violence prevention plan is active",
    "Update the gifting policy for clients and vendors",
    "Review the intellectual property agreement",
    "Monitor compliance with the EEO-1 reporting",
    "Update the volunteer time off policy",
    "Ensure the ergonomics assessment program is compliant",
    "Review the electronic communications policy",
    "Update the pet-friendly office policy",
    "Ensure compliance with the CCPA for California staff",
    "Review the anti-bribery and corruption policy"
]

In [26]:
customer_support = [
    # --- Ticket Resolution (50) ---
    "Close the ticket regarding the resolved login loop issue",
    "Merge duplicate tickets reported by the same organization",
    "Escalate the critical API bug to the engineering team",
    "Update the ticket status to 'Pending Customer' while waiting for logs",
    "Reopen the ticket after the user reported the issue persisted",
    "Assign the ticket to the billing specialist queue",
    "Tag the ticket with 'UI/UX' for product team review",
    "Prioritize the ticket based on the customer's SLA tier",
    "Link the ticket to the master incident report for the outage",
    "Resolve the ticket as 'Working by Design' after investigation",
    "Bulk close spam tickets generated by an auto-responder",
    "Set a reminder to follow up on the ticket in 48 hours",
    "Flag the ticket for manager review due to negative sentiment",
    "Update the ticket priority to 'Urgent' due to business impact",
    "Move the ticket to the 'Feature Request' board",
    "Verify the ticket resolution with the customer before closing",
    "Anonymize the ticket data for GDPR compliance",
    "Route the ticket to the French language support team",
    "Add an internal note summarizing the troubleshooting steps taken",
    "Unassign the ticket from the inactive agent",
    "Change the ticket type from 'Question' to 'Incident'",
    "Mark the ticket as 'Solved' after providing the solution",
    "Review the ticket history to understand the recurring issue",
    "Split the ticket into two separate issues for clarity",
    "Forward the ticket to the success manager for account review",
    "Snooze the ticket until the next product release",
    "Categorize the ticket under 'Mobile App - iOS'",
    "Escalate the ticket to Tier 2 for database investigation",
    "Link the ticket to the known bug tracker ID",
    "Update the ticket fields to reflect the correct software version",
    "Reject the ticket as 'Out of Scope' for custom code support",
    "Acknowledge the ticket receipt within the 1-hour SLA window",
    "Consolidate multiple tickets from the same downtime event",
    "Flag the ticket as a potential churn risk",
    "Update the ticket subject line to be more descriptive",
    "Review the ticket for tone before sending the final response",
    "Assign the ticket to myself from the unassigned queue",
    "Copy the account manager on the ticket response",
    "Mark the ticket as 'Spam' to train the filter",
    "Escalate the ticket to the security team due to phishing report",
    "Resolve the ticket after the user confirmed the fix",
    "Update the custom field 'churn_likelihood' on the ticket",
    "Add a 'VIP' tag to the ticket based on user email",
    "Link the ticket to the Jira engineering task",
    "Close the ticket automatically after 7 days of inactivity",
    "Reassign the ticket to the night shift team for coverage",
    "Update the ticket status to 'On Hold' pending 3rd party vendor",
    "Review the ticket metrics for weekly reporting",
    "Flag the ticket for training purposes",
    "Archive the ticket after the retention period expires",

    # --- Communication (50) ---
    "Answer the incoming live chat regarding pricing plans",
    "Draft a compassionate email response to a frustrated user",
    "Handle the inbound phone inquiry about the service outage",
    "Send a follow-up email to check if the solution worked",
    "Apologize for the delayed response due to high volume",
    "Explain the technical limitation in layman's terms",
    "Thank the customer for their detailed feedback",
    "Clarify the difference between the Pro and Enterprise plans",
    "Inform the user about the scheduled maintenance window",
    "De-escalate the angry customer on the phone call",
    "Ask probing questions to understand the user's goal",
    "Send the 'Welcome' email template to the new user",
    "Notify the customer that their feature request was released",
    "Request a screenshot to better understand the issue",
    "Guide the user through the settings menu via chat",
    "Provide a workaround while the bug is being fixed",
    "Confirm the customer's contact details for a callback",
    "Send a survey link to measure customer satisfaction (CSAT)",
    "Explain the refund policy clearly and politely",
    "Inform the customer that the issue has been escalated",
    "Ask for permission to access the user's account",
    "Suggest a better workflow for the user's use case",
    "Congratulate the user on their company milestone mentioned",
    "Send a personalized video response for complex instructions",
    "Inform the user that their credit card was declined",
    "Ask the customer to rate the support interaction",
    "Respond to the user's review on the app store",
    "Notify the user about the expiring trial period",
    "Explain why the feature was deprecated",
    "Offer a discount code as a goodwill gesture",
    "Ask the user to join a screen-share session",
    "Confirm the meeting time for the troubleshooting call",
    "Send the 'How-to' guide link in the chat window",
    "Politely decline the request for a custom feature",
    "Inform the user about the security best practices",
    "Ask the user to whitelist our email domain",
    "Send a recap email summarizing the phone conversation",
    "Direct the user to the community forum for peer help",
    "Explain the data privacy policy regarding their request",
    "Ask the user to verify their identity before proceeding",
    "Send a 'Happy Holidays' message to the long-term client",
    "Inform the user that their account has been unlocked",
    "Ask the user if there is anything else they need help with",
    "Provide the tracking number for the hardware replacement",
    "Explain the terms of service regarding the violation",
    "Send the instructions for setting up two-factor authentication",
    "Notify the user that the system is back online",
    "Ask the user to clear their browser cache",
    "Provide the contact information for the sales team",
    "Wish the customer a great weekend at the end of the call",

    # --- Troubleshooting (50) ---
    "Diagnose the root cause of the user's login failure",
    "Reproduce the reported bug in the sandbox environment",
    "Investigate the discrepancy in the customer's billing invoice",
    "Check the server logs for error 500 occurrences",
    "Verify if the issue is specific to the Chrome browser",
    "Analyze the HAR file provided by the customer",
    "Test the API integration using the customer's credentials",
    "Check the status page for any active incidents",
    "Inspect the HTML element to identify the UI glitch",
    "Verify the user's permissions in the admin panel",
    "Test the mobile app behavior on an Android device",
    "Check if the user is hitting the API rate limit",
    "Investigate the email delivery failure using the SMTP logs",
    "Verify the DNS settings for the customer's custom domain",
    "Check if the firewall is blocking the connection",
    "Test the payment gateway with a test credit card",
    "Investigate the slow loading speed of the dashboard",
    "Check if the third-party extension is causing conflicts",
    "Verify the data synchronization between the two systems",
    "Test the search functionality with different keywords",
    "Investigate the missing data in the export file",
    "Check the user's activity log for suspicious actions",
    "Verify the SSO configuration with the identity provider",
    "Test the audio settings for the reported call quality issue",
    "Investigate the broken link on the help center page",
    "Check if the user is on the latest version of the app",
    "Verify the integrity of the uploaded file",
    "Test the password reset flow to ensure delivery",
    "Investigate the webhook failure response",
    "Check the memory usage of the application tab",
    "Verify the time zone settings on the user's profile",
    "Test the localization settings for the Spanish interface",
    "Investigate the push notification delivery delay",
    "Check the database for the missing transaction record",
    "Verify the SSL certificate validity for the domain",
    "Test the drag-and-drop functionality in the editor",
    "Investigate the 404 error on the specific permalink",
    "Check the spam folder for the missing activation email",
    "Verify the affiliate tracking code is firing correctly",
    "Test the chatbot flow for the reported loop",
    "Investigate the formatting issue in the PDF export",
    "Check the user's browser extensions for ad blockers",
    "Verify the integrity of the database backup",
    "Test the CSV import with malformed data",
    "Investigate the reported crash on the startup screen",
    "Check the available disk space on the user's plan",
    "Verify the logic of the automation rule",
    "Test the dark mode rendering on the settings page",
    "Investigate the duplicate charges on the credit card",
    "Check the compatibility of the uploaded image format",

    # --- Documentation (50) ---
    "Update the FAQ article regarding the new pricing structure",
    "Write a new knowledge base article on 'Getting Started'",
    "Tag the feature request for 'Dark Mode' in the feedback tool",
    "Correct the typo in the 'Password Reset' help page",
    "Archive the outdated documentation for version 1.0",
    "Create a canned response (macro) for the outage apology",
    "Add a screenshot to the 'Billing Settings' tutorial",
    "Translate the help article into German",
    "Review the draft article submitted by the junior agent",
    "Update the internal wiki with the new escalation process",
    "Record a screencast demonstrating the new feature",
    "Create a troubleshooting flow chart for login issues",
    "Update the tags on the help center articles for better SEO",
    "Write a release note summary for the customer newsletter",
    "Create a template for the refund approval email",
    "Update the chatbot decision tree with new answers",
    "Document the workaround for the known bug in Jira",
    "Create a step-by-step guide for the API integration",
    "Update the 'System Requirements' page",
    "Write a post-mortem summary for the support team",
    "Create a glossary of terms for new users",
    "Update the contact hours on the support page",
    "Write a best practices guide for account security",
    "Create a checklist for the new agent onboarding",
    "Update the links in the email signature footer",
    "Write a script for the automated phone system (IVR)",
    "Create a GIF showing how to clear the cache",
    "Update the privacy policy page with the new clause",
    "Write a case study based on a resolved complex ticket",
    "Create a feedback form for the help center articles",
    "Update the troubleshooting guide for error 503",
    "Write a notification banner text for the dashboard",
    "Create a PDF manual for the enterprise deployment",
    "Update the copyright year in the documentation footer",
    "Write a tooltip text for the new UI element",
    "Create a decision matrix for refund eligibility",
    "Update the 'Known Issues' page with the latest bug",
    "Write a welcome message for the community forum",
    "Create a standard operating procedure (SOP) for fraud",
    "Update the training slides for the product update",
    "Write a disclaimer for the beta feature usage",
    "Create a quick reference card for keyboard shortcuts",
    "Update the API documentation with the new endpoint",
    "Write a script for the support webinar",
    "Create a troubleshooting checklist for mobile connection",
    "Update the 'About Us' page on the help center",
    "Write a template for the churn survey email",
    "Create a category structure for the new help section",
    "Update the formatting of the article for readability",
    "Write a meta description for the help page SEO",

    # --- Account Management (50) ---
    "Process the pro-rated refund for the cancelled subscription",
    "Reset the user's password manually upon request",
    "Upgrade the account to the 'Business' plan tier",
    "Downgrade the subscription to the 'Free' plan",
    "Update the billing address on the invoice",
    "Cancel the subscription to prevent future renewals",
    "Add 5 new seats to the organization's license",
    "Transfer the account ownership to the new admin",
    "Process the GDPR 'Right to be Forgotten' request",
    "Verify the VAT number for the tax exemption",
    "Unlock the account suspended due to failed payments",
    "Merge two user accounts into a single profile",
    "Update the credit card on file for the customer",
    "Generate a past invoice for the finance department",
    "Extend the trial period by 7 days",
    "Reactivate the cancelled account upon user return",
    "Remove the user from the organization's team",
    "Update the primary email address for the account",
    "Apply the discount coupon code to the next invoice",
    "Verify the non-profit status for the discount",
    "Review the account for potential fraud signals",
    "Suspend the account due to terms of service violation",
    "Migrate the account data to the new server region",
    "Enable two-factor authentication for the admin user",
    "Update the company name on the profile",
    "Process the chargeback dispute evidence",
    "Refund the double charge on the credit card",
    "Change the billing cycle from monthly to annual",
    "Export the account data for the departing customer",
    "Update the notification preferences for the user",
    "Verify the student ID for the educational license",
    "Remove the credit card information from the system",
    "Add a billing contact to the account",
    "Review the account usage limits",
    "Provision the enterprise license key",
    "Update the language preference for the account emails",
    "Disable the auto-renewal setting",
    "Process the referral bonus credit",
    "Verify the domain ownership for the account",
    "Update the time zone setting for the organization",
    "Reset the API key for security reasons",
    "Enable the beta features for the specific account",
    "Review the cancellation survey feedback",
    "Process the manual wire transfer payment",
    "Update the phone number for the 2FA SMS",
    "Verify the account age for the loyalty reward",
    "Remove the inactive users from the seat count",
    "Update the currency setting for the pricing",
    "Process the refund to the original payment method",
    "Delete the account permanently per user request"
]

In [27]:
marketing_specialist = [
    # --- Content Creation (50) ---
    "Draft a 1500-word blog post about emerging industry trends",
    "Write a compelling subject line for the weekly newsletter",
    "Create a series of 5 LinkedIn posts promoting the new ebook",
    "Script a 60-second product explainer video for YouTube",
    "Edit a guest post submission for tone and clarity",
    "Design an infographic summarizing the annual report data",
    "Write a case study highlighting a successful client implementation",
    "Draft the copy for the new product landing page",
    "Create a Twitter thread breaking down a complex topic",
    "Write a whitepaper on the future of remote work security",
    "Develop a content calendar for Q3 social media activities",
    "Write a press release announcing the Series B funding",
    "Create a catchy slogan for the upcoming holiday campaign",
    "Draft the script for a webinar introduction and housekeeping",
    "Write the bio and description for the company Instagram profile",
    "Create a checklist lead magnet for potential leads",
    "Write a customer success story based on a recent interview",
    "Draft the copy for a Facebook ad carousel",
    "Create a quiz to engage users on the website",
    "Write a comparison guide between our product and competitors",
    "Edit the CEO's op-ed piece for a major publication",
    "Write a welcome email series for new subscribers",
    "Create a slide deck for the sales team to use in demos",
    "Draft a set of FAQs for the product help center",
    "Write the script for a podcast ad read",
    "Create a meme for social media that aligns with brand humor",
    "Write a 'Year in Review' blog post for the company site",
    "Draft a notification message for an in-app product update",
    "Create a poll on LinkedIn to gather user sentiment",
    "Write a testimonial request email to send to happy clients",
    "Draft the copy for a direct mail postcard campaign",
    "Create a video script for a TikTok trend adaptation",
    "Write a detailed 'How-to' guide for a specific feature",
    "Draft the intro and outro for the company podcast",
    "Create a set of Instagram Stories templates",
    "Write a thought leadership article for Medium",
    "Draft the copy for a webinar registration page",
    "Create a glossary of industry terms for the website",
    "Write a cold outreach template for partnership inquiries",
    "Draft the script for an automated voicemail greeting",
    "Create a storyboard for a brand awareness video",
    "Write a vision statement for the new internal project",
    "Draft a response to a positive review on G2 Crowd",
    "Create a one-pager sell sheet for trade shows",
    "Write a bio for the keynote speaker",
    "Draft the copy for a Google My Business update",
    "Create a list of interview questions for a subject matter expert",
    "Write a roundup post featuring the best tools in the industry",
    "Draft a re-engagement email for inactive users",
    "Create a tagline for the new feature release",

    # --- SEO/SEM (50) ---
    "Conduct keyword research for the new product category",
    "Optimize the meta title and description for the homepage",
    "Audit the website for broken links and 404 errors",
    "Analyze the backlink profile to identify toxic links",
    "Set up a new Google Ads campaign for the summer sale",
    "Optimize the H1 and H2 tags on high-traffic blog posts",
    "Perform a competitor keyword gap analysis",
    "Create a list of long-tail keywords to target in Q4",
    "Update the image alt text for accessibility and SEO",
    "Manage the bid strategy for high-value PPC keywords",
    "Implement schema markup for the reviews section",
    "Analyze the search intent for the top 10 performing pages",
    "Optimize the Google Business Profile listing",
    "Conduct a technical SEO audit using SEMrush",
    "Create a disavow file for spammy backlinks",
    "Write ad copy for a Responsive Search Ad (RSA)",
    "Optimize the page load speed for core web vitals",
    "Set up conversion tracking in Google Ads",
    "Analyze the click-through rate (CTR) of organic search results",
    "Identify internal linking opportunities for the new guide",
    "Create a negative keyword list for the PPC campaign",
    "Optimize the URL structure for the new sub-folder",
    "Monitor keyword rankings variance week-over-week",
    "Set up a retargeting audience in Google Analytics",
    "Optimize the mobile responsiveness of the landing page",
    "Conduct a local SEO audit for regional branches",
    "Create a strategy to acquire high-authority backlinks",
    "Analyze the quality score of the Google Ads keywords",
    "Optimize the anchor text distribution for internal links",
    "Set up Bing Webmaster Tools for the domain",
    "Research voice search keywords for the FAQ page",
    "Optimize the featured snippet opportunities for top queries",
    "Create a dedicated landing page for a specific ad group",
    "Analyze the bounce rate for paid traffic sources",
    "Optimize the robots.txt file for better crawling",
    "Submit the sitemap to Google Search Console",
    "Conduct an A/B test on ad headlines",
    "Analyze the search terms report for new keyword ideas",
    "Optimize the canonical tags to prevent duplicate content",
    "Create a strategy for ranking in the 'People Also Ask' box",
    "Monitor the domain authority (DA) score trends",
    "Optimize the breadcrumb navigation for SEO",
    "Set up a dynamic search ad campaign",
    "Analyze the organic traffic drop for a specific page",
    "Optimize the video metadata for YouTube SEO",
    "Create a plan to fix orphan pages on the site",
    "Research keywords for the international market expansion",
    "Optimize the social meta tags (Open Graph)",
    "Analyze the seasonality of high-volume keywords",
    "Conduct a site migration SEO checklist review",

    # --- Campaigns (50) ---
    "Set up an automated email drip sequence for new leads",
    "Plan the timeline for the upcoming product launch",
    "Coordinate the logistics for the live webinar event",
    "Launch a seasonal promotion for Black Friday",
    "Manage the influencer outreach for the brand awareness campaign",
    "Schedule the social media posts for the campaign week",
    "Create a Gantt chart for the cross-channel marketing push",
    "Launch a referral program for existing customers",
    "Coordinate the assets delivery with the design team",
    "Set up a countdown timer on the campaign landing page",
    "Manage the budget allocation for the Q2 ad spend",
    "Launch a user-generated content (UGC) contest",
    "Coordinate a co-marketing campaign with a partner brand",
    "Plan the email segmentation strategy for the newsletter",
    "Launch a 'Win Back' campaign for churned customers",
    "Coordinate the printing of flyers for the local event",
    "Set up a webinar registration workflow in Zoom",
    "Launch a PR campaign to get media coverage",
    "Plan a virtual summit with multiple speakers",
    "Manage the sponsorship deliverables for the conference",
    "Launch a beta testing recruitment campaign",
    "Coordinate the recording of customer testimonial videos",
    "Set up SMS marketing alerts for the flash sale",
    "Launch a loyalty program reward tier",
    "Plan the content distribution strategy for the whitepaper",
    "Manage the affiliate marketing program applications",
    "Launch a podcast sponsorship campaign",
    "Coordinate the internal communication for the campaign launch",
    "Set up a post-purchase email flow",
    "Launch a survey campaign to gather product feedback",
    "Plan a '12 Days of Giveaways' holiday campaign",
    "Manage the relationship with the external PR agency",
    "Launch a retargeting campaign for cart abandoners",
    "Coordinate the booth setup for the trade show",
    "Set up a lead scoring model for the campaign",
    "Launch a community challenge on social media",
    "Plan a customer appreciation day event",
    "Manage the distribution of swag kits to influencers",
    "Launch a LinkedIn InMail campaign for B2B leads",
    "Coordinate the translation of campaign assets",
    "Set up a webinar follow-up email sequence",
    "Launch a 'Refer a Friend' bonus offer",
    "Plan a takeover campaign on Instagram Stories",
    "Manage the ad creative refresh schedule",
    "Launch a localized campaign for a specific city",
    "Coordinate the guest blog post exchange",
    "Set up a UTM tracking strategy for all campaign links",
    "Launch a pre-order campaign for the new product",
    "Plan a workshop series for potential clients",
    "Manage the debrief meeting after the campaign ends",

    # --- Analytics (50) ---
    "Create a monthly performance report using Google Analytics 4",
    "Analyze the conversion rate of the new landing page",
    "Calculate the Return on Ad Spend (ROAS) for the quarter",
    "Monitor the email open rates and click-through rates (CTR)",
    "Set up a heat map on the homepage to track user behavior",
    "Analyze the customer acquisition cost (CAC) by channel",
    "Run an A/B test on the call-to-action (CTA) button color",
    "Track the goal completions for the contact form",
    "Analyze the churn rate for the subscription service",
    "Create a custom dashboard in Google Looker Studio",
    "Monitor the social media engagement rate trends",
    "Analyze the funnel drop-off points in the checkout process",
    "Calculate the customer lifetime value (LTV)",
    "Track the attribution of leads to specific blog posts",
    "Analyze the results of the subject line A/B test",
    "Monitor the website traffic sources distribution",
    "Analyze the demographic data of the website visitors",
    "Calculate the cost per lead (CPL) for the LinkedIn campaign",
    "Track the video retention rate on YouTube",
    "Analyze the sentiment of social media mentions",
    "Create a report on the performance of influencer partnerships",
    "Monitor the bounce rate on mobile devices vs. desktop",
    "Analyze the search queries triggering the internal site search",
    "Calculate the net promoter score (NPS) from survey data",
    "Track the number of marketing qualified leads (MQLs)",
    "Analyze the effectiveness of the exit-intent popup",
    "Monitor the page views per session metric",
    "Analyze the time on page for long-form content",
    "Calculate the ROI of the content marketing efforts",
    "Track the growth rate of the email subscriber list",
    "Analyze the performance of the referral traffic",
    "Monitor the event tracking for button clicks",
    "Analyze the correlation between social shares and traffic",
    "Calculate the cost per click (CPC) trends",
    "Track the number of downloads for the gated asset",
    "Analyze the cohort retention analysis report",
    "Monitor the site speed impact on conversion rates",
    "Analyze the click map to see where users are clicking",
    "Calculate the revenue generated per email sent",
    "Track the organic ranking improvements for target keywords",
    "Analyze the device breakdown of the audience",
    "Monitor the 404 error rate in the analytics console",
    "Analyze the user flow through the website",
    "Calculate the conversion rate from trial to paid",
    "Track the webinar attendance rate",
    "Analyze the effectiveness of the abandoned cart emails",
    "Monitor the active user count (DAU/MAU)",
    "Analyze the impact of the rebranding on traffic",
    "Calculate the marketing contribution to the sales pipeline",
    "Track the performance of the affiliate referral links",

    # --- Brand & Strategy (50) ---
    "Conduct a SWOT analysis for the new product line",
    "Update the brand voice guidelines for the content team",
    "Research competitor pricing strategies",
    "Develop detailed buyer personas for the target market",
    "Identify potential co-marketing partners in the industry",
    "Conduct a brand sentiment analysis survey",
    "Create a crisis communication plan",
    "Align the marketing goals with the sales team's quotas",
    "Research emerging social media platforms for brand fit",
    "Develop a unique value proposition (UVP) statement",
    "Conduct a gap analysis of the current content library",
    "Create a brand style guide for visual assets",
    "Research the market demand for a potential new feature",
    "Develop a strategy for entering a new geographic market",
    "Conduct customer interviews to understand pain points",
    "Identify key opinion leaders (KOLs) for partnerships",
    "Create a mission statement for the marketing department",
    "Research the best practices for sustainability messaging",
    "Develop a strategy for improving employer branding",
    "Conduct a usability audit of the competitor's website",
    "Identify the unique selling points (USPs) of the product",
    "Create a roadmap for the marketing technology stack",
    "Research the regulatory landscape for data privacy",
    "Develop a strategy for community building",
    "Conduct a focus group for testing new ad concepts",
    "Identify opportunities for brand differentiation",
    "Create a stakeholder map for the internal project",
    "Research the latest trends in consumer behavior",
    "Develop a pricing strategy for the new service tier",
    "Conduct a website audit for brand consistency",
    "Identify the key success metrics (KPIs) for the year",
    "Create a positioning statement for the niche market",
    "Research the competitor's content marketing strategy",
    "Develop a customer journey map",
    "Conduct a survey to measure brand awareness",
    "Identify potential sponsorship opportunities",
    "Create a plan for corporate social responsibility (CSR)",
    "Research the demographics of the new target audience",
    "Develop a channel strategy for content distribution",
    "Conduct a mysterious shopper exercise on the competitor",
    "Identify the core values of the brand",
    "Create a tagline that resonates with the persona",
    "Research the impact of economic trends on the industry",
    "Develop a strategy for increasing customer loyalty",
    "Conduct a competitive benchmarking analysis",
    "Identify the barriers to entry in the new market",
    "Create a narrative for the brand story",
    "Research the effectiveness of different pricing models",
    "Develop a strategy for managing online reputation",
    "Conduct a brainstorming session for the new campaign theme"
]

In [28]:
backend_dev, data_scientist, hr_specialist, customer_support, marketing_specialist

(['Implement a multi-stage GraphQL mutation for batch inventory updates',
  'Design a RESTful API endpoint for partial resource updates using PATCH',
  'Define OpenAPI 3.0 specifications for the customer loyalty service',
  'Develop a versioning strategy for breaking changes in the mobile gateway',
  'Implement HATEOAS links in the order service response metadata',
  'Create a middleware for validating request payloads using JSON schema',
  'Design an idempotent endpoint for processing stripe payment callbacks',
  'Build a reverse proxy to aggregate responses from three internal microservices',
  'Implement content negotiation for exporting reports in CSV and JSON formats',
  'Optimize API response size by implementing field masking and sparse fieldsets',
  'Integrate Scalar or Swagger UI for real-time API documentation testing',
  'Develop a rate-limiting strategy using a Redis-backed leaky bucket algorithm',
  'Configure CORS policies to allow cross-origin requests from partner domai

In [29]:
# 1. Imports:
#Embedding Library
!pip install -q sentence-transformers

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
#Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [30]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [31]:
# Map employees to ids
employee_id_map = {
    "backend_dev": 0,
    "data_scientist": 1,
    "hr_specialist": 2,
    "customer_support": 3,
    "marketing": 4
}

In [32]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# 1. Setup the Reverse Map (ID -> Role Name)
# We need this to create the 'role' column from the 'employee_id'
id_to_role = {
    0: "backend_dev",
    1: "data_scientist",
    2: "hr_specialist",
    3: "customer_support",
    4: "marketing_specialist"
}

all_tasks = []
all_ids = []

def add_role_data(task_list, role_id):
    all_tasks.extend(task_list)
    all_ids.extend([role_id] * len(task_list))

# Add your data
add_role_data(backend_dev, 0)
add_role_data(data_scientist, 1)
add_role_data(hr_specialist, 2)
add_role_data(customer_support, 3)
add_role_data(marketing_specialist, 4)

# 2. Generate Embeddings
# embedder = SentenceTransformer('all-MiniLM-L6-v2') # Uncomment if needed
print("Generating embeddings...")
embeddings_matrix = embedder.encode(all_tasks, show_progress_bar=True)
embeddings_list = list(embeddings_matrix)

# 3. Create the DataFrame
# Note: We name the column 'task_embedding' right here to match your request
df2 = pd.DataFrame({
    'task_description': all_tasks,
    'employee_id': all_ids,
    'task_embedding': embeddings_list
})

# We use .map() to translate 0 -> "backend_dev", 1 -> "data_scientist", etc.
df2['role'] = df2['employee_id'].map(id_to_role)

# 5. Reorder columns
desired_order = ["task_description", "task_embedding", "role", "employee_id"]
df2 = df2[desired_order]

print("\n--- DATASET GENERATED ---")
print(f"Total Samples: {len(df2)}")
print(df2.head())

Generating embeddings...


Batches:   0%|          | 0/39 [00:00<?, ?it/s]


--- DATASET GENERATED ---
Total Samples: 1248
                                    task_description  \
0  Implement a multi-stage GraphQL mutation for b...   
1  Design a RESTful API endpoint for partial reso...   
2  Define OpenAPI 3.0 specifications for the cust...   
3  Develop a versioning strategy for breaking cha...   
4  Implement HATEOAS links in the order service r...   

                                      task_embedding         role  employee_id  
0  [-0.07499061, 0.03379801, -0.009278719, 0.0164...  backend_dev            0  
1  [-0.08768013, 0.09890051, 0.061932098, -0.0309...  backend_dev            0  
2  [-0.053145178, -0.008096462, -0.057202775, -0....  backend_dev            0  
3  [-0.029114436, 0.030309547, 0.09236805, -0.080...  backend_dev            0  
4  [-0.050366275, 0.046337705, 0.04733956, 0.0041...  backend_dev            0  


In [33]:
df2.to_csv("tasks_dataset_llm.csv", index=False)

In [35]:
import pandas as pd

# ============================================================
# PART 3: MIXING THE DATASETS (The "Goldilocks" Ratio)
# ============================================================

# 1. STANDARDIZE ROLE NAMES (Crucial Step!)
# We need to make sure "marketing" in one df matches "marketing_specialist" in the other.
# Let's force them to a standard set of names.

# Check what names we currently have
print("Roles in DF1 (Template):", df['role'].unique())
print("Roles in DF2 (LLM):     ", df2['role'].unique())

# Optional: Un-comment this if you see mismatches (e.g., 'marketing' vs 'marketing_specialist')
# df2['role'] = df2['role'].replace({'marketing': 'marketing_specialist'})

# 2. DEFINE YOUR RATIO
# This is your "Volume Knob" for accuracy!
# - More Template Data = Higher Accuracy (Easier)
# - More LLM Data      = Lower Accuracy (Harder/More Realistic)

SAMPLES_FROM_TEMPLATE = 300  # The "Base" (Keep this high)
SAMPLES_FROM_LLM = 50        # The "Spice" (Start low, increase if accuracy is too high)

# 3. SAMPLE DATA (Stratified Sampling)
# We take specific amounts from each role to keep classes balanced.

def sample_per_role(dataframe, n_samples):
    # Group by role and take 'n' random samples from each group
    return dataframe.groupby('role').apply(
        lambda x: x.sample(n=min(len(x), n_samples), random_state=42)
    ).reset_index(drop=True)

# Create the subsets
subset_template = sample_per_role(df, SAMPLES_FROM_TEMPLATE)
subset_llm = sample_per_role(df2, SAMPLES_FROM_LLM)

print(f"\nSampling Report:")
print(f"Taken from Template: {len(subset_template)} total rows")
print(f"Taken from LLM:      {len(subset_llm)} total rows")

# 4. MERGE THEM
df_hybrid = pd.concat([subset_template, subset_llm], ignore_index=True)

# 5. RE-SHUFFLE
df_hybrid = df_hybrid.sample(frac=1, random_state=42).reset_index(drop=True)

# 6. RE-GENERATE IDs (Safety Step)
# Since we merged two sources, let's regenerate the IDs to be 100% sure they are correct.
# We create a new map based on the sorted unique roles.
unique_roles = sorted(df_hybrid['role'].unique())
role_to_id_final = {role: i for i, role in enumerate(unique_roles)}

df_hybrid['employee_id'] = df_hybrid['role'].map(role_to_id_final)

# 7. FINAL CLEANUP
# Keep only the columns we need
df_hybrid = df_hybrid[["task_description", "task_embedding", "role", "employee_id"]]

print("\n=== ‚úÖ HYBRID DATASET READY ===")
print(f"Total Samples: {len(df_hybrid)}")
print("Role Distribution:")
print(df_hybrid['role'].value_counts())
print("\nID Mapping Used:", role_to_id_final)

# 8. SAVE
filename = "employee_tasks_hybrid.pkl"
df_hybrid.to_pickle(filename)
print(f"\nüíæ Saved to '{filename}'")

Roles in DF1 (Template): ['hr_specialist' 'marketing_specialist' 'customer_support'
 'data_scientist' 'backend_dev']
Roles in DF2 (LLM):      ['backend_dev' 'data_scientist' 'hr_specialist' 'customer_support'
 'marketing_specialist']

Sampling Report:
Taken from Template: 1500 total rows
Taken from LLM:      250 total rows

=== ‚úÖ HYBRID DATASET READY ===
Total Samples: 1750
Role Distribution:
role
marketing_specialist    350
data_scientist          350
backend_dev             350
customer_support        350
hr_specialist           350
Name: count, dtype: int64

ID Mapping Used: {'backend_dev': 0, 'customer_support': 1, 'data_scientist': 2, 'hr_specialist': 3, 'marketing_specialist': 4}

üíæ Saved to 'employee_tasks_hybrid.pkl'


  return dataframe.groupby('role').apply(
  return dataframe.groupby('role').apply(
