In [13]:
# imports
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import warnings
warnings.filterwarnings('ignore')

In [14]:
# set random seed for reproducibility
np.random.seed(1234)
fake = Faker()
Faker.seed(1234)

In [15]:
# load existing data
customers_orig = pd.read_excel('./data_original/Customers.xlsx')
orderinfo_orig = pd.read_excel('./data_original/OrderInfo.xlsx')

# making sure dates show up as date not datetime
customers_orig['DOB'] = customers_orig['DOB'].dt.date
orderinfo_orig['Date'] = orderinfo_orig['Date'].dt.date

print(f"Customers: {len(customers_orig)} records")
print(f"OrderInfo: {len(orderinfo_orig)} records")

Customers: 18907 records
OrderInfo: 28862 records


In [16]:
# loading employees data
employees_prof_orig = pd.read_excel('./data_original/Employees.xlsx')


employees_dates_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmployeeID', 'StartDate', 'TerminationDate', 'LocationID'])

employees_skills_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmpID', 'Date', 'Salesmanship', 'ProductKnowledge', 'TeamPlayer', 'Innovator', 'Satisfaction'])

employees_term_orig = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['EmployeeID', 'Reason'])


employees_prof_orig['DOB'] = employees_prof_orig['DOB'].dt.date
employees_dates_orig['StartDate'] = employees_dates_orig['StartDate'].dt.date
employees_dates_orig['TerminationDate'] = employees_dates_orig['TerminationDate'].dt.date
employees_skills_orig['Date'] = employees_skills_orig['Date'].dt.date

print(f"Employee Profiles: {len(employees_prof_orig)} records")
print(f"Employees Dates: {len(employees_dates_orig)} records")
print(f"Employees Skills: {len(employees_skills_orig)} records")
print(f"Employees Reasons: {len(employees_term_orig)} records")


Employee Profiles: 55 records
Employees Dates: 3539 records
Employees Skills: 3539 records
Employees Reasons: 3539 records


In [17]:
employees_orig = employees_prof_orig.merge(
    employees_dates_orig,
    left_on='Employee Number',
    right_on='EmployeeID',
    how='left'
)

# employees_orig = employees_orig.rename(columns={'Employee Number': 'EmployeeID'})
employees_orig = employees_orig.drop(columns=['EmployeeID']).rename(columns={'Employee Number': 'EmployeeID'})

employees_orig

Unnamed: 0,EmployeeID,FName,LName,gender,SkillsTraining,SalesmanshipTraining,ProductTraining,DOB,StartDate,TerminationDate,LocationID
0,1,Warden,Oylett,Male,False,False,False,1996-08-21,2017-01-02,2019-11-04,L01
1,2,Tulley,Cockroft,Male,True,True,True,1997-08-04,2017-01-05,NaT,L01
2,3,Pip,Cuming,Male,False,True,False,1993-02-05,2017-01-02,2017-06-17,L01
3,4,Birdie,McOrkill,Female,True,True,False,1990-12-29,2017-06-18,NaT,L01
4,5,Reeta,Doyley,Female,False,True,True,1996-08-05,2019-11-05,2020-10-18,L01
5,6,Bruno,Brigstock,Male,False,False,False,1990-06-12,2020-10-19,NaT,L01
6,7,Glennis,Bond,Female,False,True,False,1992-09-04,2018-01-01,2018-12-25,L02
7,8,Tani,Beagles,Female,True,False,False,1992-10-26,2018-01-07,NaT,L02
8,9,Cecile,Cohani,Other,False,True,False,1995-03-27,2018-01-01,2018-11-13,L02
9,10,Oralle,Scay,Female,False,False,False,1993-10-17,2019-01-06,2019-06-30,L02


In [18]:
# ========================
# starting with joining customers + orderinfo @ customerID
# ========================

customer_order_df = customers_orig.merge(
    orderinfo_orig,
    left_on='id',
    right_on='CustomerID',
    how='left'
)

customer_order_df = customer_order_df.drop(columns={'CustomerID'}).rename(columns={'id': 'CustomerID'})

customer_order_df

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
0,1,Eveleen,Erat,F,1977-02-01,1.0,1.0,Newspaper,L01,2017-07-22,14:36:00,2,426
1,2,Micheil,Fransseni,M,1980-06-17,1.0,0.0,Social,L01,2017-03-23,15:29:00,2,117
2,2,Micheil,Fransseni,M,1980-06-17,1.0,0.0,Social,L01,2017-08-11,15:00:00,2,405
3,4,Carin,Oulett,F,1973-03-14,1.0,0.0,Referral,L01,2017-04-16,12:11:00,3,101
4,8,Mallory,McShane,F,1968-03-24,0.0,1.0,Social,L01,2017-07-05,13:24:00,2,443
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28857,57003,Muire,Limpenny,F,1969-08-07,1.0,1.0,WalkIn,L11,2022-11-12,09:28:00,54,18513
28858,57003,Muire,Limpenny,F,1969-08-07,1.0,1.0,WalkIn,L11,2021-06-19,16:41:00,55,28612
28859,57008,Ali,Dewdney,M,1994-07-09,0.0,0.0,WalkIn,L11,2022-12-17,09:42:00,55,18545
28860,57008,Ali,Dewdney,M,1994-07-09,0.0,0.0,WalkIn,L11,2022-11-06,16:51:00,53,18554


In [19]:
next_customer_id = customer_order_df['CustomerID'].max() + 1
next_customer_id # should be 57009

np.int64(57009)

In [20]:
baseline_year = 2022

customers_2022 = len(customer_order_df[customer_order_df['Date'] > date(2021, 12, 31)])
print('Baseline customers in 2022: ', customers_2022)

Baseline customers in 2022:  8489


In [21]:
# define new store openings
new_stores = {
    2022: 'L012',
    2023: 'L013',
    2024: 'L014',
    2025: 'L015'
}

# existing locations
existing_locations = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09', 'L10', 'L11', 'L012']

# source distribution from historical data
source_options = ['Newspaper', 'Social', 'Referral', 'WalkIn', 'Online', 'Advertisement']
source_weights = [0.10, 0.25, 0.15, 0.30, 0.15, 0.05]

# gender distribution
gender_options = ['M', 'F', 'X']
gender_weights = [0.48, 0.50, 0.02]

In [22]:
np.random.seed(1234)
Faker.seed(1234)

def generate_new_customers(year, num_customers, next_customer_id, all_locations): 
    """Generate new customers for a given year"""

    new_customers = []

    # determine if there's a new store this year
    new_store = new_stores.get(year)

    #if new store opened, allocate some customers to it (let's say 20% of new customers)
    if new_store:
        new_store_customers = int(num_customers * 0.20)
        regular_customers = num_customers - new_store_customers
    else:
        new_store_customers = 0
        regular_customers = num_customers
    
    # generate regular customers distributed across existing locations
    for i in range(regular_customers):
        # random date within the year
        day_of_year = np.random.randint(1, 366)
        customer_date = datetime(year, 1, 1) + timedelta(days=day_of_year - 1)

        # random time (business hours = 9am to 6pm)
        hour = np.random.randint(9, 18)
        minute = np.random.randint(0, 60)
        second = np.random.randint(0, 60)
        customer_time = f"{hour:02d}:{minute:02d}:{second:02d}"
        
        # generate DOB (age between 18-70)
        # age = np.random.randint(18, 71)
        # dob = customer_date - timedelta(days=age * 365)
        dob = fake.date_of_birth(minimum_age=18, maximum_age=71)
        
        # gender
        gender = np.random.choice(gender_options, p=gender_weights)
        
        # name based on gender
        if gender == 'M':
            first_name = fake.first_name_male()
        elif gender == 'F':
            first_name = fake.first_name_female()
        else:
            first_name = fake.first_name_nonbinary()
        last_name = fake.last_name()
        
        # loyalty member (30-40% are members)
        loyalty_member = np.random.choice([0.0, 1.0], p=[0.65, 0.35])

        # email list (correlated with loyalty)
        if loyalty_member == 1.0:
            email_list = np.random.choice([0.0, 1.0], p=[0.30, 0.70])
        else:
            email_list = np.random.choice([0.0, 1.0], p=[0.70, 0.30])
        
        # source
        source = np.random.choice(source_options, p=source_weights)

        # location (exclude new store if it exists)
        available_locs = [loc for loc in all_locations if loc != new_store]
        location = np.random.choice(available_locs)
        
        # Employee ID (blank for now)
        employee_id = None  # Will be filled when generating orders
        
        # Order ID (blank for now)
        order_id = None  # Will be filled when generating orders
        
        new_customer = {
            'CustomerID': next_customer_id,
            'first_name': first_name,
            'last_name': last_name,
            'gender': gender,
            'DOB': dob.strftime('%Y-%m-%d'),
            'LoyaltyMember': loyalty_member,
            'EmailList': email_list,
            'Source': source,
            'LocationID': location,
            'Date': customer_date.strftime('%Y-%m-%d'),
            'Time': customer_time,
            'EmployeeID': employee_id,
            'OrderID': order_id
        }
        
        new_customers.append(new_customer)
        next_customer_id += 1
        
    # generate new store customers (if applicable)
    if new_store and new_store_customers > 0:
        # concentrate new store customers in first 3 months (opening spike)
        for i in range(new_store_customers):
            # 70% in first 3 months, 30% rest of year
            if i < int(new_store_customers * 0.70):
                month = np.random.randint(1, 4)  # Jan-Mar
            else:
                month = np.random.randint(4, 13)  # Apr-Dec
            
            day = np.random.randint(1, 29)
            customer_date = datetime(year, month, day)
            
            # random time
            hour = np.random.randint(9, 21)
            minute = np.random.randint(0, 60)
            second = np.random.randint(0, 60)
            customer_time = f"{hour:02d}:{minute:02d}:{second:02d}"
            
            # generate DOB
            # age = np.random.randint(18, 71)
            # dob = customer_date - timedelta(days=age * 365)
            dob = fake.date_of_birth(minimum_age=18, maximum_age=71)

            
            # gender
            gender = np.random.choice(gender_options, p=gender_weights)
            
            # name
            if gender == 'M':
                first_name = fake.first_name_male()
            elif gender == 'F':
                first_name = fake.first_name_female()
            else:
                first_name = fake.first_name_nonbinary()
            last_name = fake.last_name()
            
            # higher loyalty rate for new store (grand opening promotion)
            loyalty_member = np.random.choice([0.0, 1.0], p=[0.50, 0.50])
            
            if loyalty_member == 1.0:
                email_list = np.random.choice([0.0, 1.0], p=[0.20, 0.80])
            else:
                email_list = np.random.choice([0.0, 1.0], p=[0.60, 0.40])
            
            # source (more WalkIn and Advertisement for new store)
            new_store_sources = ['Newspaper', 'Social', 'Referral', 'WalkIn', 'Online', 'Advertisement']
            new_store_weights = [0.05, 0.20, 0.10, 0.35, 0.10, 0.20]
            source = np.random.choice(new_store_sources, p=new_store_weights)
            
            # EmployeeID
            employee_id = np.random.randint(1, 61)
            
            new_customer = {
                'CustomerID': next_customer_id,
                'first_name': first_name,
                'last_name': last_name,
                'gender': gender,
                'DOB': dob.strftime('%Y-%m-%d'),
                'LoyaltyMember': loyalty_member,
                'EmailList': email_list,
                'Source': source,
                'LocationID': new_store,
                'Date': customer_date.strftime('%Y-%m-%d'),
                'Time': customer_time,
                'EmployeeID': employee_id,
                'OrderID': order_id
            }
            
            new_customers.append(new_customer)
            next_customer_id += 1
    
    return new_customers, next_customer_id


In [23]:
# now generate new customers for each year using above function
all_new_customers = []
current_baseline = customers_2022

for year in [2022, 2023, 2024, 2025]:

    # calc 10% growth
    new_customer_count = int(current_baseline * 0.10)

    # update locations list to include new stores up to this year
    current_locations = existing_locations.copy()
    for y in range(2023, year + 1):
        if y in new_stores:
            current_locations.append(new_stores[y])
    
    print(f"Year {year}: ")
    print(f"    - Target new customers: {new_customer_count}")
    if year in new_stores:
        print(f"    - New store opening: {new_stores[year]}")
        print(f"    - New store customers: {int(new_customer_count * 0.20)}")
        print(f"    - Regular customers: {int(new_customer_count * 0.80)}")

    # generate customers
    year_customers, next_customer_id = generate_new_customers(
        year,
        new_customer_count,
        next_customer_id,
        current_locations
    )

    all_new_customers.extend(year_customers)

    # update baseline for next year
    current_baseline += new_customer_count

    print(f"    ! Generate {len(year_customers)} new customers")

Year 2022: 
    - Target new customers: 848
    - New store opening: L012
    - New store customers: 169
    - Regular customers: 678
    ! Generate 848 new customers
Year 2023: 
    - Target new customers: 933
    - New store opening: L013
    - New store customers: 186
    - Regular customers: 746
    ! Generate 933 new customers
Year 2024: 
    - Target new customers: 1027
    - New store opening: L014
    - New store customers: 205
    - Regular customers: 821
    ! Generate 1027 new customers
Year 2025: 
    - Target new customers: 1129
    - New store opening: L015
    - New store customers: 225
    - Regular customers: 903
    ! Generate 1129 new customers


In [24]:
# create DataFrame
new_customers_df = pd.DataFrame(all_new_customers)
new_customers_df

Unnamed: 0,CustomerID,first_name,last_name,gender,DOB,LoyaltyMember,EmailList,Source,LocationID,Date,Time,EmployeeID,OrderID
0,57009,Jeremiah,Smith,M,2006-03-30,1.0,1.0,Social,L11,2022-10-31,12:38:53,,
1,57010,Teresa,Aguilar,F,2003-04-01,1.0,1.0,Referral,L03,2022-01-31,15:58:56,,
2,57011,Angela,Miranda,F,1990-04-27,0.0,0.0,Newspaper,L04,2022-08-29,14:34:38,,
3,57012,Andre,Mccormick,M,1966-11-04,0.0,1.0,Referral,L11,2022-09-16,12:12:01,,
4,57013,Lisa,Gilbert,F,1972-09-28,0.0,0.0,Social,L10,2022-04-18,10:23:13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3932,60941,Tammy,Hicks,F,1979-02-17,0.0,0.0,Newspaper,L015,2025-06-12,13:53:13,34.0,
3933,60942,Jeffrey,Barajas,M,1971-08-21,1.0,1.0,Online,L015,2025-06-23,16:14:50,59.0,
3934,60943,Gregory,Beasley,M,1980-06-12,1.0,1.0,Referral,L015,2025-12-25,09:30:52,4.0,
3935,60944,Sara,Mendez,F,1999-05-30,0.0,0.0,Advertisement,L015,2025-06-11,17:40:48,47.0,
