In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

## Generate synthetic mock-up dataset

### Setup endpoint and client

In [None]:
from openai import OpenAI

model_name = "solar-pro-250422"

# Configure OpenAI client to use Upstage API
client = OpenAI(
    base_url="https://api.upstage.ai/v1",
    api_key=os.getenv("UPSTAGE_API_KEY")
)

### Variables

In [None]:
projects_data = []
num_projects = 100

### JSON schema for structured output

In [None]:
# Schema for project generation
project_schema = {
    "type": "object",
    "properties": {
        "project_name": {"type": "string"},
        "descriptive_summary": {"type": "string"},
        "required_roles": {
            "type": "array",
            "items": {"type": "string"}
        },
        "duration_months": {"type": "integer"},
    },
    "required": ["project_name", "descriptive_summary", "required_roles"]
}

# Schema for participant generation
participant_schema = {
    "type": "object",
    "properties": {
        "participants": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "role": {"type": "string"},
                    "experience_years": {"type": "integer"},
                    "skills": {
                        "type": "array",
                        "items": {"type": "string"}
                    },
                    "performance_rating": {"type": "number", "minimum": 1, "maximum": 5}
                },
                "required": ["name", "role", "experience_years", "skills"]
            }
        }
    },
    "required": ["participants"]
}

### generate data (project and participants per project)

In [29]:
for i in range(num_projects):
    print(f"Generating project {i+1}/{num_projects}...")
    
    # Prepare context about previously generated projects
    previous_projects_context = ""
    if i > 0:
        previous_summaries = [proj['descriptive_summary'] for proj in projects_data]
        previous_projects_context = f"\n\nPreviously generated projects to avoid duplication:\n" + "\n".join([f"- {summary}" for summary in previous_summaries])
    
    # Generate project
    project_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": "You are a project manager creating realistic software development projects. Generate diverse projects across different industries and complexity levels."
            },
            {
                "role": "user",
                "content": f"Generate a detailed software development project (project #{i+1}). Include a descriptive summary, required roles, and other relevant project details.{previous_projects_context}"
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "project_schema",
                "schema": project_schema
            }
        }
    )
    
    project = project_response.choices[0].message.content
    project_data = eval(project)  # In production, use json.loads() with proper error handling
    
    print(f"Project: {project_data['project_name']}")
    print(f"Required roles: {', '.join(project_data['required_roles'])}")
    
    # Generate participants for this project
    participants_response = client.chat.completions.create(
        model=model_name,
        messages=[
            {
                "role": "system",
                "content": "You are an HR manager creating realistic participant profiles for software projects. Generate diverse participants with varying experience levels and skills."
            },
            {
                "role": "user",
                "content": f"Generate participants for the project '{project_data['project_name']}' with the following required roles: {', '.join(project_data['required_roles'])}. Create one participant for each required role."
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "participant_schema",
                "schema": participant_schema
            }
        }
    )
    
    participants = participants_response.choices[0].message.content
    participants_data = eval(participants)  # In production, use json.loads() with proper error handling
    
    # Combine project and participants data
    project_data['participants'] = participants_data['participants']
    projects_data.append(project_data)
    
    print(f"Generated {len(participants_data['participants'])} participants")
    print("-" * 50)

print(f"\nGenerated {len(projects_data)} projects with participants!")


Generating project 1/100...
Project: Smart Health Monitoring App
Required roles: Project Manager, Mobile App Developer, Backend Developer, Frontend Developer, UI/UX Designer, Quality Assurance Engineer, Healthcare Consultant, Security Specialist
Generated 8 participants
--------------------------------------------------
Generating project 2/100...
Project: Smart Inventory Management System
Required roles: Project Manager, Business Analyst, Cloud Architect, Backend Developer, Frontend Developer, Mobile App Developer, IoT Specialist, Data Scientist, Security Specialist, Quality Assurance Engineer
Generated 10 participants
--------------------------------------------------
Generating project 3/100...
Project: FinanceGenius
Required roles: Software Developer, Project Manager, UX/UI Designer, AI Specialist, Product Manager, Quality Assurance Engineer
Generated 6 participants
--------------------------------------------------
Generating project 4/100...
Project: Smart City Traffic Management

SyntaxError: unterminated string literal (detected at line 3) (<string>, line 3)

## Exportation

### Deduplication

In [36]:
# Remove duplicate projects based on descriptive_summary
unique_projects = []
seen_summaries = set()

for project in projects_data:
    summary = project.get('descriptive_summary', '')
    if summary not in seen_summaries:
        unique_projects.append(project)
        seen_summaries.add(summary)

print(f"Before deduplication: {len(projects_data)} projects")
print(f"After deduplication: {len(unique_projects)} projects")
print(f"Removed {len(projects_data) - len(unique_projects)} duplicates")

# Update projects_data with unique projects
projects_data = unique_projects

Before deduplication: 85 projects
After deduplication: 45 projects
Removed 40 duplicates


### Save final dataset as external file

In [38]:
import json

# Export projects_data to JSON file
with open('projects_data_en.json', 'w') as f:
    json.dump(projects_data, f, indent=2)

print(f"Exported {len(projects_data)} projects to projects_data_en.json")

Exported 45 projects to projects_data_en.json
