# Synthetic Resume Generator
Based on this repository: https://github.com/annikaLindstrom/EthicsInAI

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime

In [2]:
# Seed both the stdlib and NumPy generators for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# Build a 1000-row synthetic resume dataset and preview its first rows
df = generate_synthetic_resumes(n_resumes=1000)
df.head()

Unnamed: 0,Resume_ID,Sex,Employment_Gaps,College_Club,Resume_Keywords,Education_Level,Years_Experience,Skills,Certifications,Programming_Languages,GPA,Hired
0,R0001,Male,2,Debate Team,"collaborated, managed, achieved, optimized",Bachelor's,0,"Machine Learning, Problem Solving, Communicati...","Microsoft Certified, Google Analytics","SQL, Go",2.55,1
1,R0002,Male,2,Student Government,"created, implemented, managed, improved",Bachelor's,5,"Data Analysis, Agile, Leadership, Python, Proj...",Microsoft Certified,SQL,2.85,1
2,R0003,Female,2,Women in Tech,"organized, managed, created, optimized",PhD,11,"Project Management, Communication, Agile, Pyth...","AWS Certified, PMP","Java, SQL, C++",2.38,1
3,R0004,Female,2,Women in Tech,"led, created, collaborated, implemented, designed",Bachelor's,8,"Data Analysis, SQL, Project Management, Agile,...",Google Analytics,"SQL, JavaScript",3.38,1
4,R0005,Female,2,Women in Tech,"achieved, coordinated, managed, led",PhD,13,"Python, Problem Solving, Leadership","Google Analytics, Scrum Master","R, Python, JavaScript, Go",2.8,1


In [7]:
# Write the generated dataset to resumes.csv (the function's default filename)
save_dataset(df, filename='resumes.csv')

In [8]:
# Print the column names, non-null counts and dtypes of the generated frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Resume_ID              1000 non-null   object 
 1   Sex                    1000 non-null   object 
 2   Employment_Gaps        1000 non-null   int64  
 3   College_Club           1000 non-null   object 
 4   Resume_Keywords        1000 non-null   object 
 5   Education_Level        1000 non-null   object 
 6   Years_Experience       1000 non-null   int64  
 7   Skills                 1000 non-null   object 
 8   Certifications         1000 non-null   object 
 9   Programming_Languages  1000 non-null   object 
 10  GPA                    1000 non-null   float64
 11  Hired                  1000 non-null   int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 93.9+ KB


### Functions are defined below! Run these definition cells before the cells above that call them. Feel free to change the functions as you see fit.

In [4]:
import random
import pandas as pd

def generate_synthetic_resumes(n_resumes=10, seed=None):
    """
    Generate a synthetic resume dataset with a simulated hiring outcome.

    Each row is one resume with randomly drawn attributes. The `Hired` label
    is sampled from a per-resume probability that starts at 0.35 and is
    adjusted by sex, resume keywords, education level, years of experience,
    employment gaps, GPA and "Women in Tech" club membership, plus a small
    random jitter.

    Parameters:
        n_resumes : int
            Number of resumes (rows) to generate. Zero yields an empty
            DataFrame that still carries the full set of columns.
        seed : int or None
            When given, all draws come from a private generator seeded with
            this value, so the same seed always yields the same dataset no
            matter what else has used the `random` module. When None
            (default), the module-level `random` generator is used, so a
            prior `random.seed(...)` call still controls the output.

    Returns:
        pandas DataFrame with columns Resume_ID, Sex, Employment_Gaps,
        College_Club, Resume_Keywords, Education_Level, Years_Experience,
        Skills, Certifications, Programming_Languages, GPA, Hired.
    """
    # The `random` module exposes the same method names as a Random instance,
    # so either object can serve as the source of randomness below.
    rng = random if seed is None else random.Random(seed)

    columns = [
        "Resume_ID", "Sex", "Employment_Gaps", "College_Club",
        "Resume_Keywords", "Education_Level", "Years_Experience",
        "Skills", "Certifications", "Programming_Languages",
        "GPA", "Hired"
    ]

    sexes = ["Male", "Female"]

    college_clubs = [
        "Tech Club", "Women in Tech", "Engineering Society",
        "Chess Club", "Debate Team", "Student Government",
        "Robotics Club", "Coding Club", "Entrepreneurship Club",
        "Data Science Club", "IEEE Student Branch", "ACM Chapter", "None"
    ]

    resume_keywords = [
        "led", "managed", "developed", "created", "organized",
        "implemented", "designed", "coordinated", "achieved",
        "improved", "optimized", "collaborated"
    ]

    education_levels = ["Bachelor's", "Master's", "PhD"]
    education_bonus = {"Bachelor's": 0, "Master's": 0.05, "PhD": 0.1}

    skills = [
        "Machine Learning", "Data Analysis", "Python", "SQL",
        "Project Management", "Agile", "Leadership",
        "Communication", "Problem Solving"
    ]

    certifications = [
        "AWS Certified", "PMP", "Scrum Master",
        "Google Analytics", "Microsoft Certified", "None"
    ]

    programming_languages = [
        "Python", "Java", "C++", "JavaScript",
        "R", "SQL", "Go"
    ]

    # Loop-invariant candidate pools, built once instead of once per resume.
    # "Women in Tech" is only ever assigned through the explicit branch below,
    # and "None" is only used as the placeholder for an empty certification list.
    other_clubs = [c for c in college_clubs if c != "Women in Tech"]
    real_certifications = [c for c in certifications if c != "None"]

    records = []

    # NOTE: the order of the random draws below is kept stable on purpose, so
    # a given seed keeps producing the same dataset.
    for i in range(n_resumes):
        sex = rng.choice(sexes)
        years_exp = rng.randint(0, 17)
        gaps = rng.randint(0, 2)
        gpa = round(rng.uniform(2.0, 4.0), 2)

        # Female resumes get "Women in Tech" with probability 0.3; everyone
        # else draws uniformly from the remaining clubs.
        if sex == "Female" and rng.random() < 0.3:
            club = "Women in Tech"
        else:
            club = rng.choice(other_clubs)

        keywords = rng.sample(resume_keywords, rng.randint(3, 7))
        resume_skills = rng.sample(skills, rng.randint(3, 6))
        langs = rng.sample(programming_languages, rng.randint(1, 4))
        certs = rng.sample(real_certifications, rng.randint(0, 2))

        education = rng.choice(education_levels)

        # ---- Hiring logic ----
        hire_prob = 0.35

        hire_prob += 0.15 if sex == "Male" else 0
        hire_prob += 0.1 if any(k in keywords for k in ["led", "managed"]) else 0
        hire_prob += education_bonus[education]
        hire_prob += 0.1 if years_exp >= 5 else 0
        hire_prob -= 0.1 if gaps > 0 else 0
        hire_prob += 0.05 if gpa > 3.5 else 0
        hire_prob -= 0.1 if club == "Women in Tech" else 0

        # Small jitter, then clamp so the value is a valid probability.
        hire_prob += rng.uniform(-0.03, 0.03)
        hire_prob = min(max(hire_prob, 0), 1)

        hired = int(rng.random() < hire_prob)

        records.append({
            "Resume_ID": f"R{i+1:04d}",
            "Sex": sex,
            "Employment_Gaps": gaps,
            "College_Club": club,
            "Resume_Keywords": ", ".join(keywords),
            "Education_Level": education,
            "Years_Experience": years_exp,
            "Skills": ", ".join(resume_skills),
            "Certifications": ", ".join(certs) if certs else "None",
            "Programming_Languages": ", ".join(langs),
            "GPA": gpa,
            "Hired": hired
        })

    # Passing the column list keeps the schema intact even when no rows were
    # generated.
    return pd.DataFrame(records, columns=columns)


In [5]:
def save_dataset(df, filename='resumes.csv'):
    """
    Write a DataFrame to disk as a CSV file, leaving out the row index.

    Parameters:
        df : pandas DataFrame to write
        filename : str, destination path of the CSV file
    """
    # index=False keeps the row index out of the file
    csv_options = {"index": False}
    df.to_csv(filename, **csv_options)