# Synthetic Resume Generator
Based on this repository: https://github.com/annikaLindstrom/EthicsInAI

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime

In [2]:
# Seed both Python's built-in `random` module and NumPy's global RNG so that
# re-running the notebook from the top produces the same random draws.
random.seed(42)
np.random.seed(42)

In [8]:
# Generate 1,000 synthetic resumes and preview the first rows.
# generate_synthetic_resumes() is defined in a later cell; run that cell first.
df = generate_synthetic_resumes(1000)
df.head()

Unnamed: 0,Resume_ID,Sex,Employment_Gaps,College_Club,Resume_Keywords,Education_Level,Years_Experience,Skills,Certifications,Programming_Languages,GPA,Interviewed
0,R0001,Male,0,Engineering Society,"established, led, initiated, designed",Master's,5,"Leadership, Cloud Computing, Project Managemen...","PMP, CISSP","Ruby, C++",3.99,1
1,R0002,Male,1,Robotics Club,"improved, collaborated, achieved, coordinated,...",Master's,13,"Python, Project Management, Critical Thinking,...","PMP, AWS Certified","SQL, C++, MATLAB",2.01,1
2,R0003,Female,1,Robotics Club,"organized, managed, initiated, designed",Bachelor's,14,"Data Visualization, Communication, Python, Cri...",,"Ruby, Scala, JavaScript, R, MATLAB",3.75,1
3,R0004,Male,1,Coding Club,"led, implemented, developed, collaborated, cre...",PhD,0,"Agile, Communication, Data Visualization, Crit...",CompTIA,Go,2.58,1
4,R0005,Male,0,ACM Chapter,"developed, achieved, coordinated, established,...",Bachelor's,7,"Communication, Team Collaboration, Data Visual...",,"R, MATLAB, Java, SQL, JavaScript",2.8,1


In [7]:
# Write the generated DataFrame to a CSV file.
# save_dataset() is defined in a later cell; run that cell first.
save_dataset(df)

### Functions are defined below! Run these cells before the cells above that call them. Feel free to change the functions as you see fit.

In [4]:
def generate_synthetic_resumes(n_resumes=10, seed=None):
    """
    Generate a synthetic resume dataset with a deliberately biased outcome.

    Every resume gets randomly drawn attributes and a binary 'Interviewed'
    label. The label is sampled from a probability that starts at 0.5 and is
    adjusted as follows before being clamped to [0, 1]:

        +0.15  Sex is 'Male' (injected gender bias)
        +0.10  keywords contain 'led' or 'managed'
        +0.05  Master's degree / +0.10 PhD
        +0.10  more than 5 years of experience
        -0.15  at least one employment gap
        +0.05  GPA above 3.5
        -0.10  College_Club is 'Women in Tech' (only assigned to 'Female')
        +/-0.10 uniform noise

    Parameters:
        n_resumes : int
            Number of resumes to generate. Zero (or a negative number) yields
            an empty DataFrame that still carries the full set of columns.
        seed : int or None
            If given, a private random generator seeded with this value is
            used, so the result is reproducible regardless of global state.
            If None (default), the module-level `random` generator is used,
            so an earlier `random.seed(...)` call still controls the output.

    Returns:
        pd.DataFrame : one row per resume with the columns Resume_ID, Sex,
        Employment_Gaps, College_Club, Resume_Keywords, Education_Level,
        Years_Experience, Skills, Certifications, Programming_Languages,
        GPA and Interviewed.
    """
    # The `random` module exposes the same method names as a Random instance,
    # so either object can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    # Explicit column list keeps the schema stable even when no rows exist.
    columns = [
        'Resume_ID', 'Sex', 'Employment_Gaps', 'College_Club',
        'Resume_Keywords', 'Education_Level', 'Years_Experience', 'Skills',
        'Certifications', 'Programming_Languages', 'GPA', 'Interviewed',
    ]

    sexes = ['Male', 'Female']

    college_clubs = [
        'Tech Club', 'Women in Tech', 'Engineering Society',
        'Chess Club', 'Debate Team', 'Student Government',
        'Robotics Club', 'Coding Club', 'Entrepreneurship Club',
        'Data Science Club', 'IEEE Student Branch', 'ACM Chapter',
        'None'
    ]

    resume_keywords = [
        'led', 'managed', 'developed', 'created', 'organized',
        'implemented', 'designed', 'coordinated', 'established',
        'achieved', 'improved', 'optimized', 'collaborated',
        'initiated', 'executed', 'spearheaded', 'delivered'
    ]

    education_levels = ['Bachelor\'s', 'Master\'s', 'PhD']

    skills = [
        'Machine Learning', 'Data Analysis', 'Python', 'SQL',
        'Project Management', 'Agile', 'Leadership', 'Communication',
        'Problem Solving', 'Critical Thinking', 'Team Collaboration',
        'Data Visualization', 'Statistics', 'Cloud Computing'
    ]

    certifications = [
        'AWS Certified', 'PMP', 'Scrum Master', 'Google Analytics',
        'Microsoft Certified', 'CompTIA', 'CISSP', 'Six Sigma',
        'None'
    ]

    programming_languages = [
        'Python', 'Java', 'C++', 'JavaScript', 'R', 'MATLAB',
        'SQL', 'Ruby', 'Go', 'Scala', 'TypeScript'
    ]

    # Loop invariants, built once instead of on every iteration.
    general_clubs = [c for c in college_clubs if c != 'Women in Tech']
    real_certs = [c for c in certifications if c != 'None']

    data = []

    # NOTE: the order of the random draws below is intentionally unchanged so
    # that output under a given global seed stays the same as before.
    for i in range(n_resumes):
        sex = rng.choice(sexes)
        employment_gaps = rng.randint(0, 2)

        # Only female candidates can be in 'Women in Tech' (30% chance);
        # the short-circuit means no random draw is spent on male candidates.
        if sex == 'Female' and rng.random() < 0.3:
            club = 'Women in Tech'
        else:
            club = rng.choice(general_clubs)

        num_keywords = rng.randint(3, 10)
        keywords = rng.sample(resume_keywords, num_keywords)
        keywords_str = ', '.join(keywords)

        education = rng.choice(education_levels)
        years_experience = rng.randint(0, 17)

        num_skills = rng.randint(3, 8)
        skills_str = ', '.join(rng.sample(skills, num_skills))

        num_certs = rng.randint(0, 3)
        if num_certs == 0:
            # The literal string 'None' marks "no certifications".
            certs = 'None'
        else:
            certs = ', '.join(
                rng.sample(real_certs, min(num_certs, len(real_certs)))
            )

        num_langs = rng.randint(1, 5)
        langs_str = ', '.join(rng.sample(programming_languages, num_langs))

        gpa = round(rng.uniform(2.0, 4.0), 2)

        # Build the outcome probability from the baseline plus adjustments.
        probability = 0.5
        if sex == 'Male':
            probability += 0.15  # Gender bias

        if 'led' in keywords or 'managed' in keywords:
            probability += 0.1

        if education == 'Master\'s':
            probability += 0.05
        elif education == 'PhD':
            probability += 0.1

        if years_experience > 5:
            probability += 0.1

        if employment_gaps > 0:
            probability -= 0.15

        if gpa > 3.5:
            probability += 0.05

        if club == 'Women in Tech':
            probability -= 0.1

        # Add noise, clamp to a valid probability, then sample the label.
        probability += rng.uniform(-0.1, 0.1)
        probability = max(0, min(1, probability))
        interviewed = 1 if rng.random() < probability else 0

        data.append({
            'Resume_ID': f'R{i+1:04d}',
            'Sex': sex,
            'Employment_Gaps': employment_gaps,
            'College_Club': club,
            'Resume_Keywords': keywords_str,
            'Education_Level': education,
            'Years_Experience': years_experience,
            'Skills': skills_str,
            'Certifications': certs,
            'Programming_Languages': langs_str,
            'GPA': gpa,
            'Interviewed': interviewed,
        })

    # Passing `columns` guarantees the schema for the empty case too.
    return pd.DataFrame(data, columns=columns)

In [5]:
def save_dataset(df, filename='resumes.csv'):
    """
    Write a DataFrame to disk as a CSV file, without the index column.

    Parameters:
        df : pandas DataFrame
            The dataset to write.
        filename : str
            Destination path of the CSV file (default 'resumes.csv').
    """
    # index=False leaves the row index out of the written file.
    df.to_csv(path_or_buf=filename, index=False)