In [1]:
# # Install missing package
# %pip install faker

# Import necessary libraries
import os
import numpy as np
import pandas as pd
from faker import Faker
import random

In [2]:
# Initialize Faker and set random seed
fake = Faker()

random.seed(42)

np.random.default_rng(42)

Generator(PCG64) at 0x219B0376180

In [3]:
# Define static Lookup data

sampling_Types = ["Open Market", "Traffic", "Trade", "Third Space", "Institutional"]
institution_types = ["Church", "Mosque"]
regions_districts = {
    "Greater Accra": ["La Nkwantanang", "Ablekuma", "Madina", "Adenta"],
    "Ashanti": ["Kumasi Metro", "Ejisu", "Obuasi"],
    "Central": ["Cape Coast", "Kasoa", "Mankessim"]
}
age_ranges = ["18–24", "25–34", "35–44", "45–54", "55+"]
toothpaste_brands = ["Pepsodent","Kel","Colgate", "Close-Up", "Oral-B", "Sensodyne"]

In [4]:
# List of reasons for participating
reasons = [
    "Curious about the brand",
    "Referred by friend",
    "Free product sample",
    "Interested in survey",
    "Enjoy trying new products",
    "Promoter was convincing",
    "Happened to be available"
]

In [5]:
# Generate Dummy Area Table Data 
area_record = []

areaID = 1001

for region, districts in regions_districts.items():
    for district in districts:
        for i in range(2):  # Assuming 3 areas per district
            area_record.append({
                "areaID": f"A {areaID}",
                "AreaName": f"{district} Area {i+1}",
                "region": region,
                "district": district
            })
            areaID += 1
    

In [6]:
area_df = pd.DataFrame(area_record)

In [7]:
area_df

Unnamed: 0,areaID,AreaName,region,district
0,A 1001,La Nkwantanang Area 1,Greater Accra,La Nkwantanang
1,A 1002,La Nkwantanang Area 2,Greater Accra,La Nkwantanang
2,A 1003,Ablekuma Area 1,Greater Accra,Ablekuma
3,A 1004,Ablekuma Area 2,Greater Accra,Ablekuma
4,A 1005,Madina Area 1,Greater Accra,Madina
5,A 1006,Madina Area 2,Greater Accra,Madina
6,A 1007,Adenta Area 1,Greater Accra,Adenta
7,A 1008,Adenta Area 2,Greater Accra,Adenta
8,A 1009,Kumasi Metro Area 1,Ashanti,Kumasi Metro
9,A 1010,Kumasi Metro Area 2,Ashanti,Kumasi Metro


In [8]:
# Generate Dummy Promoter Table Data
Promoters = [{
    "promoterID": f"P{1000 + i}",
    "name": fake.name(),
    "contact": fake.phone_number(),
} for i in range(5)]


In [43]:
sampling_type_df = pd.DataFrame(sampling_Types)
sampling_type_df.reset_index(drop=True, inplace=True)

sampling_type_df.index = [f'ST{i}' for i in range(1, len(sampling_type_df) + 1)]

In [9]:
# convert Promoters to DataFrame
promoter_df = pd.DataFrame(Promoters)
promoter_df

Unnamed: 0,promoterID,name,contact
0,P1000,Dalton White,253-936-0255x625
1,P1001,Laura Moreno,9856542381
2,P1002,Christina Sutton,+1-854-668-7820x808
3,P1003,Dr. Susan Cortez DDS,(689)789-9912x3985
4,P1004,Jenny Martinez,001-934-621-8300x876


In [65]:
sampling_type_df


Unnamed: 0,0
ST1,Open Market
ST2,Traffic
ST3,Trade
ST4,Third Space
ST5,Institutional


In [87]:
# Generate Sampling Fact Table Data
sampling_facts = []
samplingID=1000
for i in range(6):  # Generate 500 records
    area = area_df.sample(1).iloc[0]
    promoter = promoter_df.sample(1).iloc[0]
    sampling_type = random.choice(sampling_type_df.index)
    
    institution_type = random.choice(institution_types) if sampling_type in "ST5" else None
    age_range = random.choice(age_ranges)
    target = random.randint(100, 200)  # Random target between 100 and 500
    passengers = random.randint(5, 10)  if sampling_type == "ST2" else None # Random passengers between 5 and 10
    start_date = fake.date_between(start_date='-1y', end_date='today')
    end_date = start_date + pd.Timedelta(days=30)
    sampling_facts.append({
        "samplingID": f"S{samplingID + i}",
        "areaID": area["areaID"],
        "promoterID": promoter["promoterID"],
        "samplingType": sampling_type,
        "institutionType": institution_type,
        "target": target,
        "passengers": passengers,
        "toothpasteBrand": random.choice(toothpaste_brands),
        "startDate": start_date,
        "endDate": end_date,
    })

In [88]:
sampling_fact_df = pd.DataFrame(sampling_facts)
sampling_fact_df

Unnamed: 0,samplingID,areaID,promoterID,samplingType,institutionType,target,passengers,toothpasteBrand,startDate,endDate
0,S1000,A 1020,P1003,ST3,,145,,Oral-B,2024-12-10,2025-01-09
1,S1001,A 1012,P1000,ST2,,188,5.0,Pepsodent,2025-06-16,2025-07-16
2,S1002,A 1006,P1004,ST4,,127,,Pepsodent,2025-04-01,2025-05-01
3,S1003,A 1018,P1004,ST2,,149,6.0,Close-Up,2024-10-21,2024-11-20
4,S1004,A 1010,P1000,ST1,,126,,Oral-B,2024-09-26,2024-10-26
5,S1005,A 1008,P1004,ST5,Mosque,190,,Close-Up,2025-04-03,2025-05-03


In [90]:
respondents = []
respondentID = 1000
for _,sampling in sampling_fact_df.iterrows():
    # for _ in range(sampling_fact_df.shape[0]):  # Assuming you want to generate respondents for each sampling
    #     # Generate a random number of respondents between 50 and the target for each sampling
    actual = random.randint(100,sampling['target'])
    print(actual)

    for _ in range(actual):
        respondents.append({
            "respondentID": f"R{respondentID}",
            "samplingID": sampling['samplingID'],
            "fullName": fake.name(),
            "ageRange": random.choice(age_ranges),
            "contact": fake.phone_number(),
            "toothpasteBrand": sampling['toothpasteBrand'],
            "perferredBrand": random.choice(toothpaste_brands),
            "areaID": sampling['areaID'],
            "residenceArea": area_df['AreaName'].sample(1).values[0],
            "reason": random.choice(reasons),
            "optInOtherProducts": random.choice(["Yes", "No"]),
            "dateOfSubmistion": random.choice(pd.date_range(sampling['startDate'], sampling['endDate'])),   
        })
        respondentID += 1


122
141
122
110
115
109


In [91]:
respondents_df= pd.DataFrame(respondents)
respondents_df.head()


Unnamed: 0,respondentID,samplingID,fullName,ageRange,contact,toothpasteBrand,perferredBrand,areaID,residenceArea,reason,optInOtherProducts,dateOfSubmistion
0,R1000,S1000,Wanda Wade,18–24,618.613.8875,Oral-B,Close-Up,A 1020,Madina Area 1,Happened to be available,Yes,2025-01-06
1,R1001,S1000,Ryan Oconnell,55+,540.674.8561x21222,Oral-B,Pepsodent,A 1020,La Nkwantanang Area 1,Promoter was convincing,No,2025-01-08
2,R1002,S1000,Rodney Brown,55+,001-817-380-3638x4036,Oral-B,Oral-B,A 1020,Cape Coast Area 1,Enjoy trying new products,No,2025-01-03
3,R1003,S1000,Allison Wade,55+,748.821.5472x12070,Oral-B,Sensodyne,A 1020,Mankessim Area 1,Referred by friend,No,2024-12-20
4,R1004,S1000,Aimee Morales,25–34,+1-407-522-8864x37698,Oral-B,Close-Up,A 1020,Madina Area 2,Happened to be available,Yes,2024-12-19


In [64]:
respondents_df.shape

(684, 12)

In [92]:
# export the DataFrames to xlsx files

with pd.ExcelWriter('market_sampling_dummy_data.xlsx') as writer:
    area_df.to_excel(writer, sheet_name='Area', index=False)
    promoter_df.to_excel(writer, sheet_name='Promoter', index=False)
    sampling_fact_df.to_excel(writer, sheet_name='SamplingFact', index=False)
    respondents_df.to_excel(writer, sheet_name='Respondents', index=False)
    sampling_type_df.to_excel(writer, sheet_name='SamplingType', index=True)

print("Dummy data generation completed and saved to 'market_sampling_dummy_data.xlsx'.")

Dummy data generation completed and saved to 'market_sampling_dummy_data.xlsx'.
