In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
# Pools of sample values that the synthetic-user generator draws from.
# Element order is significant whenever the random generator is seeded,
# so entries must not be reordered.
first_names = [
    'Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace',
    'Hannah', 'Ian', 'Julia', 'Kyle', 'Liam', 'Mia', 'Nina',
    'Oliver', 'Paul', 'Quincy', 'Rachel', 'Sophie', 'Tom', 'Uma',
    'Vera', 'Will', 'Xander', 'Yara', 'Zoe',
]
last_names = [
    'Smith', 'Johnson', 'Brown', 'Taylor', 'Miller', 'Davis', 'Garcia',
    'Martinez', 'Anderson', 'Wilson', 'Moore', 'Jackson', 'Martin', 'Lee',
    'Perez', 'Thompson', 'White', 'Harris', 'Sanchez', 'Clark', 'Ramirez',
    'Lewis', 'Robinson', 'Walker', 'Young', 'King',
]
cities = [
    'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
    'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose',
    'Austin', 'Jacksonville', 'Fort Worth', 'Columbus', 'Charlotte',
    'Indianapolis', 'Seattle', 'Denver', 'Washington', 'Boston',
]

# Maps each project title to the list of roles (skills) it needs.
projects = {
    'Assemble a band': [
        'Guitarist', 'Drummer', 'Vocalist', 'Bassist', 'Keyboardist',
        'Sound Engineer',
    ],
    'Build a robot': [
        'Mechanical Engineer', 'Electrical Engineer', 'Programmer',
        'CAD Designer', 'Robotics Technician',
    ],
    'Organize a fundraiser': [
        'Event Planner', 'Marketing Specialist', 'Graphic Designer',
        'Public Relations', 'Volunteer Coordinator',
    ],
    'Create a clean energy startup': [
        'Business Strategist', 'Solar Engineer', 'Wind Power Expert',
        'Financial Analyst', 'Marketing Expert',
    ],
    'Develop a mobile app': [
        'App Developer', 'UI/UX Designer', 'Backend Developer',
        'QA Tester', 'Product Manager',
    ],
    'Start a community garden': [
        'Horticulturist', 'Garden Planner', 'Volunteer Coordinator',
        'Event Coordinator', 'Fundraiser',
    ],
    'Film a short movie': [
        'Director', 'Cinematographer', 'Scriptwriter', 'Editor', 'Actor',
        'Sound Technician',
    ],
    'Design a video game': [
        'Game Developer', 'Animator', 'Game Designer', '3D Modeler',
        'Sound Designer',
    ],
    'Launch a food truck': [
        'Chef', 'Menu Planner', 'Marketing Expert', 'Logistics Manager',
        'Cashier',
    ],
    'Publish a digital magazine': [
        'Editor', 'Writer', 'Graphic Designer', 'Photographer',
        'Social Media Manager',
    ],
    'Host a podcast': ['Host', 'Sound Editor', 'Researcher'],
    'Start a book club': [
        'Organizer', 'Facilitator', 'Book Reviewer',
        'Social Media Coordinator',
    ],
}

## Generate user data

In [None]:
def generate_group_size(project):
    """Pick a random preferred group size for ``project``.

    The size is drawn with ``random.randint`` (both ends inclusive) from the
    number of roles listed for the project in ``projects`` up to twice that
    number.

    Args:
        project: A key of the module-level ``projects`` mapping.

    Returns:
        int: The sampled group size.

    Raises:
        KeyError: If ``project`` is not a key of ``projects``.
    """
    role_count = len(projects[project])
    return random.randint(role_count, 2 * role_count)

def generate_user_data():
    """Build one synthetic user record.

    Returns:
        list: ``[first_name, last_name, age, project, personal_skill,
        desired_skill_1, desired_skill_2, desired_skill_3, location,
        preferred_group_size]``. Desired skills are other roles of the same
        project; slots that cannot be filled hold ``np.nan``.
    """
    # NOTE: the order of the random draws below is kept stable so that a
    # seeded generator yields the same records.
    first_name = random.choice(first_names)
    last_name = random.choice(last_names)
    age = random.randint(18, 60)
    project = random.choice(list(projects))
    roles = projects[project]
    personal_skill = random.choice(roles)

    # A user never asks for the skill they already bring themselves.
    other_roles = [role for role in roles if role != personal_skill]
    wanted = random.sample(other_roles, min(3, len(other_roles)))
    # Pad to exactly three slots when the project has too few other roles.
    wanted += [np.nan] * (3 - len(wanted))

    location = random.choice(cities)
    preferred_group_size = generate_group_size(project)

    return [
        first_name,
        last_name,
        age,
        project,
        personal_skill,
        *wanted,
        location,
        preferred_group_size,
    ]

In [None]:
# Create dataset
# Seed the stdlib RNG immediately before sampling so that Restart & Run All
# (or re-running just this cell) always regenerates the same synthetic users.
RANDOM_SEED = 42
N_USERS = 150  # number of records generated before de-duplication
random.seed(RANDOM_SEED)
data = [generate_user_data() for _ in range(N_USERS)]
columns = [
    'first_name',
    'last_name',
    'age',
    'project',
    'personal_skill',
    'desired_skill_1',
    'desired_skill_2',
    'desired_skill_3',
    'location',
    'preferred_group_size',
]

In [None]:
# Assemble the generated records into a table, one row per user
df = pd.DataFrame(data=data, columns=columns)

In [None]:
# Treat rows sharing name, age and location as the same user: keep the first
# occurrence of each and renumber the index from zero.
identity_columns = ['first_name', 'last_name', 'age', 'location']
df = (
    df
    .drop_duplicates(subset=identity_columns, keep='first')
    .reset_index(drop=True)
)

In [None]:
# Check dimensions: (rows, columns) of the frame after de-duplication
df.shape

In [None]:
# Preview data: display the first 10 rows
df.head(10)

In [None]:
# Write the user table to disk as a tab-separated file, omitting the index
file_path = 'skill_share_users_dataset.tsv'
df.to_csv(file_path, index=False, sep='\t')

## Generate project data