In [3]:
# imports
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import warnings
warnings.filterwarnings('ignore')

In [4]:
# set random seed for reproducibility
np.random.seed(1234)
fake = Faker()
Faker.seed(1234)

In [5]:
# load existing data
customers_orig = pd.read_excel('./data_original/Customers.xlsx')
orderinfo_orig = pd.read_excel('./data_original/OrderInfo.xlsx')

# making sure dates show up as date not datetime
customers_orig['DOB'] = customers_orig['DOB'].dt.date
orderinfo_orig['Date'] = orderinfo_orig['Date'].dt.date

print(f"Customers: {len(customers_orig)} records")
print(f"OrderInfo: {len(orderinfo_orig)} records")

Customers: 18907 records
OrderInfo: 28862 records


In [6]:
# loading employees data
employees_prof_orig = pd.read_excel('./data_original/Employees.xlsx')


employees_dates_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmployeeID', 'StartDate', 'TerminationDate', 'LocationID'])

employees_skills_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmpID', 'Date', 'Salesmanship', 'ProductKnowledge', 'TeamPlayer', 'Innovator', 'Satisfaction'])

employees_term_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmployeeID', 'Reason'])


employees_prof_orig['DOB'] = employees_prof_orig['DOB'].dt.date
employees_dates_orig['StartDate'] = employees_dates_orig['StartDate'].dt.date
employees_dates_orig['TerminationDate'] = employees_dates_orig['TerminationDate'].dt.date
employees_skills_orig['Date'] = employees_skills_orig['Date'].dt.date

print(f"Employee Profiles: {len(employees_prof_orig)} records")
print(f"Employees Dates: {len(employees_dates_orig)} records")
print(f"Employees Skills: {len(employees_skills_orig)} records")
print(f"Employees Reasons: {len(employees_term_orig)} records")


Employee Profiles: 55 records
Employees Dates: 3539 records
Employees Skills: 3539 records
Employees Reasons: 3539 records


In [7]:
employees_orig = employees_prof_orig.merge(
    employees_dates_orig,
    left_on='Employee Number',
    right_on='EmployeeID',
    how='left'
)

# employees_orig = employees_orig.rename(columns={'Employee Number': 'EmployeeID'})
employees_orig = employees_orig.drop(columns=['EmployeeID']).rename(columns={'Employee Number': 'EmployeeID'})

employees_orig

Unnamed: 0,EmployeeID,FName,LName,gender,SkillsTraining,SalesmanshipTraining,ProductTraining,DOB,StartDate,TerminationDate,LocationID
0,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,L01
1,2,Tulley,Cockroft,Male,True,True,True,1997-08-04,2017-01-05,NaT,L01
2,3,Pip,Cuming,Male,False,True,False,1993-02-05,2017-01-02,2017-06-17,L01
3,4,Birdie,McOrkill,Female,True,True,False,1990-12-29,2017-06-18,NaT,L01
4,5,Reeta,Doyley,Female,False,True,True,1996-08-05,2019-11-05,2020-10-18,L01
5,6,Bruno,Brigstock,Male,False,False,False,1990-06-12,2020-10-19,NaT,L01
6,7,Glennis,Bond,Female,False,True,False,1992-09-04,2018-01-01,2018-12-25,L02
7,8,Tani,Beagles,Female,True,False,False,1992-10-26,2018-01-07,NaT,L02
8,9,Cecile,Cohani,Other,False,True,False,1995-03-27,2018-01-01,2018-11-13,L02
9,10,Oralle,Scay,Female,False,False,False,1993-10-17,2019-01-06,2019-06-30,L02


In [8]:
# ========================
# starting with joining customers + orderinfo @ customerID
# ========================

customer_order_df = customers_orig.merge(
    orderinfo_orig,
    left_on='id',
    right_on='CustomerID',
    how='left'
)

customer_order_df = customer_order_df.drop(columns={'CustomerID'}).rename(columns={'id': 'CustomerID'})

customer_order_df

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
0,1,Eveleen,Erat,F,1977-02-01,1.0,1.0,Newspaper,L01,2017-07-22,14:36:00,2,426
1,2,Micheil,Fransseni,M,1980-06-17,1.0,0.0,Social,L01,2017-03-23,15:29:00,2,117
2,2,Micheil,Fransseni,M,1980-06-17,1.0,0.0,Social,L01,2017-08-11,15:00:00,2,405
3,4,Carin,Oulett,F,1973-03-14,1.0,0.0,Referral,L01,2017-04-16,12:11:00,3,101
4,8,Mallory,McShane,F,1968-03-24,0.0,1.0,Social,L01,2017-07-05,13:24:00,2,443
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28857,57003,Muire,Limpenny,F,1969-08-07,1.0,1.0,WalkIn,L11,2022-11-12,09:28:00,54,18513
28858,57003,Muire,Limpenny,F,1969-08-07,1.0,1.0,WalkIn,L11,2021-06-19,16:41:00,55,28612
28859,57008,Ali,Dewdney,M,1994-07-09,0.0,0.0,WalkIn,L11,2022-12-17,09:42:00,55,18545
28860,57008,Ali,Dewdney,M,1994-07-09,0.0,0.0,WalkIn,L11,2022-11-06,16:51:00,53,18554


In [9]:
next_order_id = customer_order_df['OrderID'].max() + 1
next_order_id

np.int64(28863)

In [10]:
next_customer_id = customer_order_df['CustomerID'].max() + 1
next_customer_id # should be 57009

np.int64(57009)

In [11]:
baseline_year = 2022

customers_2022 = len(customer_order_df[customer_order_df['Date'] > date(2021, 12, 31)])
print('Baseline customers in 2022: ', customers_2022)

Baseline customers in 2022:  8489


In [12]:
# define new store openings
new_stores = {
    2022: {'location': 'L012', 'employees_needed': 5},
    2023: {'location': 'L013', 'employees_needed': 5},
    2024: {'location': 'L014', 'employees_needed': 5},
    2025: {'location': 'L015', 'employees_needed': 6}
}

# existing locations
existing_locations = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 
                      'L07', 'L08', 'L09', 'L10', 'L11', 'L012']

# source distribution from historical data
source_options = ['Newspaper', 'Social', 'Referral', 'WalkIn', 'Online', 'Advertisement']
source_weights = [0.10, 0.25, 0.15, 0.30, 0.15, 0.05]

# gender distribution
gender_options = ['M', 'F', 'X']
gender_weights = [0.48, 0.50, 0.02]

# Training probabilities (most new hires start with False)
training_prob = 0.15 # 15% of new hires have prior training

existing locs = customers exist; only need 10% growth to show new customers
new locs = cusotmers don't exist; take avg of other locations and add

In [13]:
np.random.seed(1234)
Faker.seed(1234)

def generate_new_customers(year, num_customers, next_customer_id, next_order_id, all_locations): 
    """Generate new customers for a given year"""

    new_customers = []

    # determine if there's a new store this year
    new_store = new_stores.get(year)

    #if new store opened, allocate some customers to it (let's say 20% of new customers)
    if new_store:
        new_store_customers = int(num_customers * 0.20)
        regular_customers = num_customers - new_store_customers
    else:
        new_store_customers = 0
        regular_customers = num_customers
    
    # generate regular customers distributed across existing locations
    for i in range(regular_customers):
        # random date within the year
        day_of_year = np.random.randint(1, 366)
        customer_date = datetime(year, 1, 1) + timedelta(days=day_of_year - 1)

        # random time (business hours = 9am to 6pm)
        hour = np.random.randint(9, 18)
        minute = np.random.randint(0, 60)
        second = np.random.randint(0, 60)
        customer_time = f"{hour:02d}:{minute:02d}:{second:02d}"
        
        # generate DOB (age between 18-70)
        # age = np.random.randint(18, 71)
        # dob = customer_date - timedelta(days=age * 365)
        dob = fake.date_of_birth(minimum_age=18, maximum_age=71)
        
        # gender
        gender = np.random.choice(gender_options, p=gender_weights)
        
        # name based on gender
        if gender == 'M':
            first_name = fake.first_name_male()
        elif gender == 'F':
            first_name = fake.first_name_female()
        else:
            first_name = fake.first_name_nonbinary()
        last_name = fake.last_name()
        
        # loyalty member (30-40% are members)
        loyalty_member = np.random.choice([0.0, 1.0], p=[0.65, 0.35])

        # email list (correlated with loyalty)
        if loyalty_member == 1.0:
            email_list = np.random.choice([0.0, 1.0], p=[0.30, 0.70])
        else:
            email_list = np.random.choice([0.0, 1.0], p=[0.70, 0.30])
        
        # source
        source = np.random.choice(source_options, p=source_weights)

        # location (exclude new store if it exists)
        new_locations = [item['location'] for item in new_stores.values()]
        available_locs = [loc for loc in all_locations if loc != new_locations]
        location = np.random.choice(available_locs)
        
        # Employee ID (blank for now)
        employee_id = None  # Will be filled when generating orders
        
        # Order ID (blank for now)
        # order_id = None  # Will be filled when generating orders
        
        new_customer = {
            'CustomerID': next_customer_id,
            'first_name': first_name,
            'last_name': last_name,
            'gender': gender,
            'DOB': dob.strftime('%Y-%m-%d'),
            'LoyaltyMember': loyalty_member,
            'EmailList': email_list,
            'Source': source,
            'LocationID': location,
            'Date': customer_date.strftime('%Y-%m-%d'),
            'Time': customer_time,
            'EmployeeID': employee_id,
            'OrderID': next_order_id
        }
        
        new_customers.append(new_customer)
        next_customer_id += 1
        next_order_id += 1
        
    # generate new store customers (if applicable)
    if new_store and new_store_customers > 0:
        # concentrate new store customers in first 3 months (opening spike)
        for i in range(new_store_customers):
            # 70% in first 3 months, 30% rest of year
            if i < int(new_store_customers * 0.70):
                month = np.random.randint(1, 4)  # Jan-Mar
            else:
                month = np.random.randint(4, 13)  # Apr-Dec
            
            day = np.random.randint(1, 29)
            customer_date = datetime(year, month, day)
            
            # random time
            hour = np.random.randint(9, 21)
            minute = np.random.randint(0, 60)
            second = np.random.randint(0, 60)
            customer_time = f"{hour:02d}:{minute:02d}:{second:02d}"
            
            # generate DOB
            # age = np.random.randint(18, 71)
            # dob = customer_date - timedelta(days=age * 365)
            dob = fake.date_of_birth(minimum_age=18, maximum_age=71)

            
            # gender
            gender = np.random.choice(gender_options, p=gender_weights)
            
            # name
            if gender == 'M':
                first_name = fake.first_name_male()
            elif gender == 'F':
                first_name = fake.first_name_female()
            else:
                first_name = fake.first_name_nonbinary()
            last_name = fake.last_name()
            
            # higher loyalty rate for new store (grand opening promotion)
            loyalty_member = np.random.choice([0.0, 1.0], p=[0.50, 0.50])
            
            if loyalty_member == 1.0:
                email_list = np.random.choice([0.0, 1.0], p=[0.20, 0.80])
            else:
                email_list = np.random.choice([0.0, 1.0], p=[0.60, 0.40])
            
            # source (more WalkIn and Advertisement for new store)
            new_store_sources = ['Newspaper', 'Social', 'Referral', 'WalkIn', 'Online', 'Advertisement']
            new_store_weights = [0.05, 0.20, 0.10, 0.35, 0.10, 0.20]
            source = np.random.choice(new_store_sources, p=new_store_weights)
            
            # EmployeeID
            employee_id = None 
            
            new_customer = {
                'CustomerID': next_customer_id,
                'first_name': first_name,
                'last_name': last_name,
                'gender': gender,
                'DOB': dob.strftime('%Y-%m-%d'),
                'LoyaltyMember': loyalty_member,
                'EmailList': email_list,
                'Source': source,
                'LocationID': new_store,
                'Date': customer_date.strftime('%Y-%m-%d'),
                'Time': customer_time,
                'EmployeeID': employee_id,
                'OrderID': next_order_id
            }
            
            new_customers.append(new_customer)
            next_customer_id += 1
            next_order_id += 1
    
    return new_customers, next_customer_id, next_order_id


In [14]:
np.random.seed(1234)
Faker.seed(1234)

def generate_duplicate_customers(year, num_customers, next_order_id, DF):
    """Generate repeat orders for existing customers"""
    
    new_customers = []
    
    # Generate orders distributed across the year
    for i in range(num_customers):
        # Sample a random existing customer
        sampled_row = DF.sample(n=1).iloc[0]
        
        # Generate new date within the specified year
        day_of_year = np.random.randint(1, 366)
        customer_date = datetime(year, 1, 1) + timedelta(days=day_of_year - 1)
        
        # Random time (business hours: 9 AM - 8 PM)
        hour = np.random.randint(9, 21)
        minute = np.random.randint(0, 60)
        second = np.random.randint(0, 60)
        customer_time = f"{hour:02d}:{minute:02d}:{second:02d}"
        
        # Employee ID (blank for now)
        employee_id = None
        
        new_customer = {
            'CustomerID': sampled_row['CustomerID'],
            'first_name': sampled_row['first_name'],
            'last_name': sampled_row['last_name'],
            'gender': sampled_row['gender'],
            'DOB': sampled_row['DOB'],
            'LoyaltyMember': sampled_row['LoyaltyMember'],
            'EmailList': sampled_row['EmailList'],
            'Source': sampled_row['Source'],
            'LocationID': sampled_row['LocationID'],  # Keep their usual location
            'Date': customer_date.strftime('%Y-%m-%d'),
            'Time': customer_time,
            'EmployeeID': employee_id,
            'OrderID': next_order_id
        }
        
        new_customers.append(new_customer)
        next_order_id += 1
        
    return new_customers, next_order_id

In [15]:
# now generate new customers for each year using above function
all_new_customers = []
current_baseline = customers_2022

for year in [2022, 2023, 2024, 2025]:

    # calc 10% growth
    new_customer_count = int(current_baseline * 0.095)

    # update locations list to include new stores up to this year
    current_locations = existing_locations.copy()
    for y in range(2023, year + 1):
        if y in new_stores:
            current_locations.append(new_stores[y]['location'])
    
    print(f"Year {year}: ")
    print(f"    - Target new customers: {new_customer_count}")
    if year in new_stores:
        print(f"    - New store opening: {new_stores[year]}")
        print(f"    - New store customers: {int(new_customer_count * 0.20)}")
        print(f"    - Regular customers: {int(new_customer_count * 0.80)}")

    # generate customers
    year_customers, next_customer_id, next_order_id = generate_new_customers(
        year,
        new_customer_count,
        next_customer_id,
        next_order_id,
        current_locations
    )

    all_new_customers.extend(year_customers)

    # update baseline for next year
    current_baseline += new_customer_count

    print(f"    ! Generate {len(year_customers)} new customers")

Year 2022: 
    - Target new customers: 806
    - New store opening: {'location': 'L012', 'employees_needed': 5}
    - New store customers: 161
    - Regular customers: 644
    ! Generate 806 new customers
Year 2023: 
    - Target new customers: 883
    - New store opening: {'location': 'L013', 'employees_needed': 5}
    - New store customers: 176
    - Regular customers: 706
    ! Generate 883 new customers
Year 2024: 
    - Target new customers: 966
    - New store opening: {'location': 'L014', 'employees_needed': 5}
    - New store customers: 193
    - Regular customers: 772
    ! Generate 966 new customers
Year 2025: 
    - Target new customers: 1058
    - New store opening: {'location': 'L015', 'employees_needed': 6}
    - New store customers: 211
    - Regular customers: 846
    ! Generate 1058 new customers


In [16]:
# Initialize
current_baseline = customers_2022

DF = pd.DataFrame(all_new_customers) # Use your full existing customer dataframe

for year in [2023, 2024, 2025]:  # Start from 2023, not 2022
    
    # Calculate 10% growth (or 5% if you prefer)
    new_customer_count = int(current_baseline * 0.005)
    
    print(f"\nYear {year}: Generating {new_customer_count} repeat orders")
    
    # Generate duplicate customers (repeat orders)
    year_customers, next_order_id = generate_duplicate_customers(
        year, 
        new_customer_count, 
        next_order_id, 
        DF
    )
    
    all_new_customers.extend(year_customers)
    

print(f"\nTotal repeat orders generated: {len(all_new_customers)}")


Year 2023: Generating 42 repeat orders

Year 2024: Generating 42 repeat orders

Year 2025: Generating 42 repeat orders

Total repeat orders generated: 3839


In [17]:
# create DataFrame
new_customers_df = pd.DataFrame(all_new_customers)
new_customers_df

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
0,57009,Jeremiah,Smith,M,2006-03-31,1.0,1.0,Social,L11,2022-10-31,12:38:53,,28863
1,57010,Teresa,Aguilar,F,2003-04-02,1.0,1.0,Referral,L03,2022-01-31,15:58:56,,28864
2,57011,Angela,Miranda,F,1990-04-28,0.0,0.0,Newspaper,L04,2022-08-29,14:34:38,,28865
3,57012,Andre,Mccormick,M,1966-11-05,0.0,1.0,Referral,L11,2022-09-16,12:12:01,,28866
4,57013,Lisa,Gilbert,F,1972-09-29,0.0,0.0,Social,L10,2022-04-18,10:23:13,,28867
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3834,58154,Laura,Reynolds,F,1976-08-06,0.0,0.0,WalkIn,L05,2025-10-03,09:10:04,,32697
3835,57382,Lisa,Duran,F,1978-03-22,0.0,0.0,Social,L06,2025-06-15,09:05:38,,32698
3836,59474,Jonathan,Romero,M,1969-11-05,1.0,1.0,WalkIn,"{'location': 'L014', 'employees_needed': 5}",2025-12-01,16:37:02,,32699
3837,59421,Elijah,Barker,M,2001-06-03,0.0,0.0,WalkIn,L012,2025-10-21,20:53:52,,32700


In [18]:
new_customers_df.loc[new_customers_df["CustomerID"] == 60150]

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
3141,60150,Tiffany,Cole,F,1997-08-19,0.0,1.0,Social,L015,2025-08-14,09:11:59,,32004
3838,60150,Tiffany,Cole,F,1997-08-19,0.0,1.0,Social,L015,2025-08-22,11:43:37,,32701


---
# Employees Data

In [19]:
# ========================
# now joining employees + orderinfo @ EmployeeID
# ========================

employees_order_df = employees_orig.merge(
    orderinfo_orig,
    left_on='EmployeeID',
    right_on='EmployeeID',
    how='left'
)

employees_order_df = employees_order_df.drop(columns={'LocationID_x'}).rename(columns={'LocationID_y': 'LocationID'})

employees_order_df

Unnamed: 0,EmployeeID,FName,LName,gender,SkillsTraining,SalesmanshipTraining,ProductTraining,DOB,StartDate,TerminationDate,CustomerID,LocationID,Date,Time,OrderID
0,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,428,L01,2017-01-19,14:57:00,4
1,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,450,L01,2017-01-25,13:05:00,8
2,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,419,L01,2017-02-20,09:27:00,9
3,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,426,L01,2017-01-09,13:38:00,12
4,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,455,L01,2017-01-23,16:24:00,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28857,55,Creigh,Desson,Male,False,False,False,1993-01-28,2021-01-05,NaT,55958,L11,2022-09-16,15:56:00,28850
28858,55,Creigh,Desson,Male,False,False,False,1993-01-28,2021-01-05,NaT,54820,L11,2022-09-16,13:10:00,28851
28859,55,Creigh,Desson,Male,False,False,False,1993-01-28,2021-01-05,NaT,55832,L11,2022-09-20,17:36:00,28855
28860,55,Creigh,Desson,Male,False,False,False,1993-01-28,2021-01-05,NaT,55462,L11,2022-09-21,15:20:00,28857


In [20]:
employees_order_df.groupby("LocationID")["EmployeeID"].nunique()

LocationID
L01    6
L02    7
L03    7
L04    5
L05    6
L06    4
L07    6
L08    5
L09    3
L10    4
L11    3
Name: EmployeeID, dtype: int64

In [21]:
next_employee_id = employees_order_df['EmployeeID'].max() + 1
next_employee_id # should be 56

np.int64(56)

In [22]:
np.random.seed(1234)
Faker.seed(1234)


def generate_employee(employee_id, hire_date, location, is_new_store=False):
    """Generate a single employee record"""
    
    # Gender
    gender = np.random.choice(gender_options, p=gender_weights)

    # name
    if gender == 'M':
        fname = fake.first_name_male()
    elif gender == 'F':
        fname = fake.first_name_female()
    else:
        fname = fake.first_name_nonbinary()
    lname = fake.last_name()
    
    # DOB (age between 20-60 at hire)
    # age_at_hire = np.random.randint(20, 61)
    # dob = hire_date - timedelta(days=age_at_hire * 365)
    dob = fake.date_of_birth(minimum_age=20, maximum_age=61)

    # Training (new store employees slightly more likely to have training)
    training_rate = 0.25 if is_new_store else training_prob
    
    skills_training = np.random.choice([True, False], p=[training_rate, 1-training_rate])
    salesmanship_training = np.random.choice([True, False], p=[training_rate, 1-training_rate])
    product_training = np.random.choice([True, False], p=[training_rate, 1-training_rate])
    
    employee = {
        'EmployeeID': employee_id,
        'FName': fname,
        'LName': lname,
        'gender': gender,
        'SkillsTraining': skills_training,
        'SalesmanshipTraining': salesmanship_training,
        'ProductTraining': product_training,
        'DOB': dob.strftime('%Y-%m-%d'),
        'StartDate': hire_date.strftime('%Y-%m-%d'),
        'TerminationDate': None,  # active employee
        'LocationID': location
    }
    
    return employee


In [23]:
np.random.seed(1234)
Faker.seed(1234)

def generate_terminated_employees(year, num_terminations, start_employee_id):
    """Generate employees who were hired and terminated in the same year"""
    
    terminated_employees = []
    employee_id = start_employee_id
    
    for i in range(num_terminations):
        # hire date (early in the year)
        # hire_month = np.random.randint(1, 7)  # Jan-Jun
        # hire_day = np.random.randint(1, 29)
        # hire_date = datetime(year, hire_month, hire_day)
        hire_date = fake.date_between(
            start_date=date(year, 1, 1),
            end_date=date(year, 7, 30)
        )
        
        # termination date (2-6 months after hire)
        # months_employed = np.random.randint(2, 7)
        # termination_date = hire_date + timedelta(days=months_employed * 30)
        termination_date = fake.date_between(
            start_date=hire_date + timedelta(days=60),   # ~2 months
            end_date=hire_date + timedelta(days=180)     # ~6 months
        )
        
        # Make sure termination is within the year
        if termination_date.year > year:
            # termination_date = datetime(year, 12, np.random.randint(15, 29))
            termination_date = fake.date_between(
                start_date=date(year, 12, 15),
                end_date=date(year, 12, 31)
            )
        
        # Random location
        location = np.random.choice(existing_locations)
        
        # Gender
        gender = np.random.choice(gender_options, p=gender_weights)
        
        # Name
        # name
        if gender == 'M':
            fname = fake.first_name_male()
        elif gender == 'F':
            fname = fake.first_name_female()
        else:
            fname = fake.first_name_nonbinary()
        lname = fake.last_name()
        
        # DOB
        # age_at_hire = np.random.randint(20, 61)
        # dob = hire_date - timedelta(days=age_at_hire * 365)
        dob = fake.date_of_birth(minimum_age=20, maximum_age=61)

        
        terminated_emp = {
            'EmployeeID': employee_id,
            'FName': fname,
            'LName': lname,
            'gender': gender,
            'SkillsTraining': np.random.choice([True, False], p=[0.20, 0.80]),
            'SalesmanshipTraining': np.random.choice([True, False], p=[0.20, 0.80]),
            'ProductTraining': np.random.choice([True, False], p=[0.20, 0.80]),
            'DOB': dob.strftime('%Y-%m-%d'),
            'StartDate': hire_date.strftime('%Y-%m-%d'),
            'TerminationDate': termination_date.strftime('%Y-%m-%d'),
            'LocationID': location
        }
        
        terminated_employees.append(terminated_emp)
        employee_id += 1
    
    return terminated_employees, employee_id


In [24]:
np.random.seed(1234)
Faker.seed(1234)

# generate new employees

all_new_employees = []
current_employee_id = next_employee_id

for year in [2022, 2023, 2024, 2025]:
    print(f"Year {year}: ")
    year_employees = []

    # 1. New store opening employees
    if year in new_stores:
        store_info = new_stores[year]
        location = store_info['location']
        num_emps = store_info['employees_needed']

        print(f"    - New Store: {location}")
        print(f"    - # Employees Hiring: {num_emps}")

        for i in range(num_emps):
            # hire date (mostly in Jan-Feb for new store openings)
            if i < num_emps - 1:
                hire_month = np.random.choice([1, 2], p=[0.70, 0.30]) # mostly in Jan
            else:
                hire_month = np.random.randint(1, 4) 
            
            hire_day = np.random.randint(1, 29)
            hire_date = datetime(year, hire_month, hire_day)

            employee = generate_employee(
                current_employee_id,
                hire_date,
                location,
                is_new_store=True
            )

            year_employees.append(employee)
            current_employee_id += 1
        
        print(f"    ! Generated {num_emps} new store employees")
    
    # 2. Replacement hires (1-2 per year for existing locations)
    num_replacements = np.random.randint(1, 3)
    print(f"    - Replacement Hires: {num_replacements}")

    for i in range(num_replacements):
        # random month throughout the year
        hire_date = fake.date_between(
            start_date=date(year, 1, 1),
            end_date=date(year, 12, 31)
        )

        # random existing location
        location = np.random.choice(existing_locations)

        employee = generate_employee(
            current_employee_id,
            hire_date,
            location,
            is_new_store=False
        )

        year_employees.append(employee)
        current_employee_id += 1

    print(f"    ! Generated {num_replacements} replacement employees")

    # 3. Turnover employees (hired and terminated in same year)
    num_turnover = np.random.randint(1, 3)
    print(f"    - Turnover (hired & terminated): {num_turnover}")
    
    turnover_emps, current_employee_id = generate_terminated_employees(
        year, 
        num_turnover,
        current_employee_id
    )
    year_employees.extend(turnover_emps)

    # add all year's employees to master list
    all_new_employees.extend(year_employees)

    print(f"    - Total for {year}: {len(year_employees)} employees")


Year 2022: 
    - New Store: L012
    - # Employees Hiring: 5
    ! Generated 5 new store employees
    - Replacement Hires: 2
    ! Generated 2 replacement employees
    - Turnover (hired & terminated): 2
    - Total for 2022: 9 employees
Year 2023: 
    - New Store: L013
    - # Employees Hiring: 5
    ! Generated 5 new store employees
    - Replacement Hires: 1
    ! Generated 1 replacement employees
    - Turnover (hired & terminated): 2
    - Total for 2023: 8 employees
Year 2024: 
    - New Store: L014
    - # Employees Hiring: 5
    ! Generated 5 new store employees
    - Replacement Hires: 2
    ! Generated 2 replacement employees
    - Turnover (hired & terminated): 2
    - Total for 2024: 9 employees
Year 2025: 
    - New Store: L015
    - # Employees Hiring: 6
    ! Generated 6 new store employees
    - Replacement Hires: 2
    ! Generated 2 replacement employees
    - Turnover (hired & terminated): 1
    - Total for 2025: 9 employees


In [25]:
new_employees_df = pd.DataFrame(all_new_employees)
new_employees_df

Unnamed: 0,EmployeeID,FName,LName,gender,SkillsTraining,SalesmanshipTraining,ProductTraining,DOB,StartDate,TerminationDate,LocationID
0,56,Tracy,Alexander,F,False,False,False,1964-05-15,2022-01-07,,L012
1,57,Susan,Crosby,F,True,True,True,1988-07-05,2022-01-12,,L012
2,58,Megan,Miller,F,False,False,False,1996-04-01,2022-01-17,,L012
3,59,David,Johnson,M,True,False,False,1997-03-09,2022-01-03,,L012
4,60,Gregory,Hanson,M,False,False,False,1989-12-03,2022-01-04,,L012
5,61,Catherine,Wilson,F,False,False,False,1964-09-02,2022-02-24,,L04
6,62,Traci,Brown,F,False,False,False,1986-10-13,2022-06-27,,L05
7,63,Brandon,Riley,M,False,False,False,1975-05-19,2022-04-08,2022-08-18,L10
8,64,Karen,Kelly,F,False,True,False,1974-07-10,2022-04-27,2022-09-11,L09
9,65,Taylor,Chavez,F,True,False,True,1986-04-29,2023-01-06,,L013
