In [9]:
pip install faker




In [10]:
cd /content/sample_data

/content/sample_data


In [11]:

from faker import Faker
import pandas as pd
import random

faker = Faker()

# Constants
DEPARTMENTS = ['Legal', 'Marketing', 'Administrative', 'Operations', 'Sales', 'Finance', 'I/T', 'Product', 'Human Resource']
DEPARTMENT_PERCENTAGES = {
    'Legal': 0.05, 'Marketing': 0.10, 'Administrative': 0.10, 'Operations': 0.20,
    'Sales': 0.10, 'Finance': 0.05, 'I/T': 0.10, 'Product': 0.20, 'Human Resource': 0.10
}
GENDERS = ['Male', 'Female']
LANGUAGES = ['Spanish', 'Chinese', 'Hindi', 'French', 'German', 'Japanese', 'Korean']
SALARY_RANGES = {
    'Legal': (100000, 200000),
    'Marketing': (55000, 90000),
    'Administrative': (35000, 60000),
    'Operations': (60000, 120000),
    'Sales': (50000, 110000),
    'Finance': (70000, 150000),
    'I/T': (75000, 140000),
    'Product': (80000, 180000),
    'Human Resource': (50000, 90000)
}

# H-1B Petition percentages for different countries
COUNTRY_PERCENTAGES = {
    'India': 0.745, 'China': 0.118, 'Canada': 0.010, 'South Korea': 0.009,
    'Philippines': 0.006, 'Taiwan': 0.006, 'Mexico': 0.006
}
def generate_employee_data(num_employees):
    data = []
    for _ in range(num_employees):
        first_name = faker.first_name()
        last_name = faker.last_name()
        email = faker.email()
        phone = faker.phone_number()
        gender = random.choice(GENDERS)
        age = random.randint(22, 65)
        years_of_experience = random.randint(0, 40)
        department = random.choices(DEPARTMENTS, weights=[DEPARTMENT_PERCENTAGES[dept] for dept in DEPARTMENTS])[0]
        salary = random.randint(*SALARY_RANGES[department])
        ssn = faker.unique.ssn()
        non_us = random.random() < 0.4
        country = 'USA' if not non_us else random.choices(list(COUNTRY_PERCENTAGES.keys()), weights=list(COUNTRY_PERCENTAGES.values()))[0]
        languages_spoken = random.sample(LANGUAGES, random.randint(0, 2))

        employee = {
            'First Name': first_name,
            'Last Name': last_name,
            'Email': email,
            'Phone': phone,
            'Gender': gender,
            'Age': age,
            'Job Title': faker.job(),
            'Years Of Experience': years_of_experience,
            'Salary': salary,
            'Department': department,
            'SSN': ssn,
            'Country': country,
            'Languages Spoken': languages_spoken if languages_spoken else ['None']  # Assign 'None' if no language spoken
        }
        data.append(employee)
    return data

employees = generate_employee_data(10000)
def balance_gender(data):
    male_count = len([d for d in data if d['Gender'] == 'Male'])
    female_count = len(data) - male_count
    if male_count > female_count:
        for employee in data:
            if employee['Gender'] == 'Male' and male_count > female_count:
                employee['Gender'] = 'Female'
                male_count -= 1
                female_count += 1
    else:
        for employee in data:
            if employee['Gender'] == 'Female' and female_count > male_count:
                employee['Gender'] = 'Male'
                female_count -= 1
                male_count += 1
    return data

balanced_employees = balance_gender(employees)
df = pd.DataFrame(balanced_employees)
df.head(10)
df.to_csv('balanced_synthetic_employees.csv', index=False)


In [12]:
import pandas as pd
from collections import defaultdict


def analyze_employee_data(data):
    # How many men vs. women will we need to hire in each department?
    gender_department_counts = defaultdict(lambda: defaultdict(int))
    for index, employee in data.iterrows():
        gender_department_counts[employee['Gender']][employee['Department']] += 1

    # How much will this new company pay in yearly payroll?
    total_payroll = data['Salary'].sum()

    # Other than hiring from non-US countries, how else might the company grow quickly from size=320 to size=10000?
    # Assuming linear growth, calculating the additional hires needed
    additional_hires_needed = 10000 - len(data)

    # How much office space will this company require?
    # Assuming each employee requires 100 sq ft of office space
    office_space_required = len(data) * 100

    # Does this new dataset preserve the privacy of the original employees listed in employees.csv?
    # Since we balanced the gender, the new dataset does not preserve the original privacy of employees
    return gender_department_counts, total_payroll, additional_hires_needed, office_space_required

gender_department_counts, total_payroll, additional_hires_needed, office_space_required = analyze_employee_data(df)

print("Men vs. Women to hire in each department:")
for gender, department_counts in gender_department_counts.items():
    print(f"{gender}:")
    for department, count in department_counts.items():
        print(f"    {department}: {count}")

print(f"\nYearly Payroll: ${total_payroll}")

print(f"\nAdditional Hires Needed for Company Growth: {additional_hires_needed}")

print(f"\nOffice Space Required: {office_space_required} sq ft")


Men vs. Women to hire in each department:
Male:
    Marketing: 496
    Product: 995
    Administrative: 517
    I/T: 488
    Operations: 972
    Human Resource: 524
    Finance: 249
    Legal: 255
    Sales: 504
Female:
    Human Resource: 474
    Legal: 227
    Finance: 252
    Product: 1054
    I/T: 508
    Sales: 495
    Operations: 1036
    Administrative: 483
    Marketing: 471

Yearly Payroll: $947644482

Additional Hires Needed for Company Growth: 0

Office Space Required: 1000000 sq ft


In [13]:
pip install ydata-profiling




In [14]:
import pandas as pd
from pandas_profiling import ProfileReport

# Read the CSV file into a DataFrame
df = pd.read_csv('balanced_synthetic_employees.csv')

# Create a profile report
profile = ProfileReport(df, title="Employee Dataset Profiling")

# Save the report to an HTML file
profile.to_file("employee_dataset_profile.html")


  from pandas_profiling import ProfileReport


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
import pandas as pd
from pandas_profiling import ProfileReport

# Read the CSV file into a DataFrame
df = pd.read_csv('employees.csv')

# Create a profile report
profile = ProfileReport(df, title="Original Employee Dataset Profiling")

# Save the report to an HTML file
profile.to_file("original_employee_dataset_profile.html")




Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
import pandas as pd

df = pd.read_csv('balanced_synthetic_employees.csv')
# Assuming df is your DataFrame containing the employee data
# Create weights based on age ranges
weights = df['Age'].apply(lambda x: 3 if 40 <= x < 50 else 1)

# Use the sample method with the weights parameter
biased_sample = df.sample(n=20, weights=weights, random_state=42)

print(biased_sample)


       First Name Last Name                        Email  \
3736  Christopher    Holmes          paullee@example.net   
9501       Dennis    Thomas          julie41@example.com   
7286       Denise    Garcia  nelsonelizabeth@example.net   
5961        James   Higgins       dawnwright@example.net   
1551       Sandra      Wang        kathryn92@example.com   
569        Tamara     Russo        umartinez@example.org   
8633       Latoya    Tucker            pam30@example.com   
5984      Felicia     Smith           dmills@example.com   
7049         Erin    Mercer     josephbowman@example.org   
200       Candice    Fowler    danielgilbert@example.net   
9700        Donna     Smith   barnettmichael@example.net   
8281      Antonio     White         cheryl66@example.org   
2115         Lisa    Wilson          susan21@example.org   
1815    Alexander    Ingram           dleach@example.com   
1830       Joseph     Welch    stantontaylor@example.org   
3019       Rachel  Phillips      lucassh

In [18]:
import pandas as pd

df = pd.read_csv('employees.csv')

# Create weights based on age ranges
weights = df['Age'].apply(lambda x: 3 if 40 <= x < 50 else 1)

# Use the sample method with the weights parameter
biased_sample = df.sample(n=20, weights=weights, random_state=42)

print(biased_sample)


    First Name Last Name                                  Email  \
113     Nicole     Davis     nicoledavis113588@slingacademy.com   
302     Steven  Peterson  stevenpeterson302672@slingacademy.com   
230    Zachary     Scott    zacharyscott230288@slingacademy.com   
185    Michael    Galvan   michaelgalvan185989@slingacademy.com   
50     Bradley    Garcia    bradleygarcia50896@slingacademy.com   
16      Nicole     Smith      nicolesmith16360@slingacademy.com   
277      Peter   Johnson    peterjohnson277327@slingacademy.com   
186    Michael    Warner   michaelwarner186194@slingacademy.com   
223    Jessica      West     jessicawest223110@slingacademy.com   
7         John      Tate          johntate7881@slingacademy.com   
309    Vanessa    Carter   vanessacarter309183@slingacademy.com   
265      Megan     Brown      meganbrown265895@slingacademy.com   
65      Kristy    Rhodes     kristyrhodes65269@slingacademy.com   
57        Jodi    Harris       jodiharris57145@slingacademy.co

In [19]:
import pandas as pd
import faker

# Read the CSV file into a DataFrame
df = pd.read_csv('employees.csv')

# Create a Faker generator
fake = faker.Faker()

# Anonymize name, email, and phone number columns
df['First Name'] = df['First Name'].apply(lambda x: fake.first_name())
df['Last Name'] = df['Last Name'].apply(lambda x: fake.last_name())
df['Email'] = df['Email'].apply(lambda x: fake.email())
df['Phone'] = df['Phone'].apply(lambda x: fake.phone_number())

# Save the anonymized DataFrame to a new CSV file
df.to_csv('anon_employees.csv', index=False)


In [20]:
import pandas as pd
import numpy as np

import faker

# Read the CSV file into a DataFrame
df = pd.read_csv('employees.csv')


# Calculating the range of each attribute
age_range = df['Age'].max() - df['Age'].min()
salary_range = df['Salary'].max() - df['Salary'].min()
exp_range = df['Years Of Experience'].max() - df['Years Of Experience'].min()

# Choosing standard deviations as a fraction of the ranges
age_std = 0.1 * age_range
salary_std = 0.2 * salary_range  # Larger standard deviation for salary
exp_std = 0.05 * exp_range

# Perturbing the attributes using Gaussian noise
df['Age'] = df['Age'] + np.random.normal(0, age_std, len(df))
df['Salary'] = df['Salary'] + np.random.normal(0, salary_std, len(df))
df['Years Of Experience'] = df['Years Of Experience'] + np.random.normal(0, exp_std, len(df))
df.head(10)

Unnamed: 0,First Name,Last Name,Email,Phone,Gender,Age,Job Title,Years Of Experience,Salary,Department
0,Jose,Lopez,joselopez0944@slingacademy.com,+1-971-533-4552x1542,male,22.13061,Project Manager,1.714893,7152.753066,Product
1,Diane,Carter,dianecarter1228@slingacademy.com,881.633.0107,female,23.013334,Machine Learning Engineer,2.476383,849.381637,Product
2,Shawn,Foster,shawnfoster2695@slingacademy.com,001-966-861-0065x493,male,36.893923,Project Manager,13.603205,17319.411493,Product
3,Brenda,Fisher,brendafisher3185@slingacademy.com,001-574-564-4648,female,30.079106,Web Developer,8.457965,10854.703877,Product
4,Sean,Hunter,seanhunter4753@slingacademy.com,5838355842,male,33.856432,Project Manager,9.453071,18084.466744,Product
5,Joshua,Jacobs,joshuajacobs5904@slingacademy.com,053-913-2609,male,25.657384,Project Manager,3.107051,8583.58254,Product
6,Brianna,Marshall,briannamarshall6438@slingacademy.com,701-932-8553,female,36.783965,Machine Learning Engineer,10.417165,13284.359636,Product
7,John,Tate,johntate7881@slingacademy.com,001-889-992-5260x62725,male,33.664283,Mobile Developer,9.773572,7194.061235,Product
8,Jillian,Byrd,jillianbyrd8170@slingacademy.com,077-635-0084x1647,female,34.938506,Web Developer,9.258896,12403.961237,Product
9,Melanie,Sharp,melaniesharp9256@slingacademy.com,(848)212-0230,female,41.21699,Web Developer,14.621676,19784.696878,Product
