### Import libraries

In [12]:
import pandas as pd
import numpy as np
import datetime
from itertools import cycle
import geopandas as gpd
from shapely.geometry import Point
import random

### Load Geodata

In [13]:
# Load the sample polygon(s) that bound every simulated coordinate.
# The path is relative to the notebook's working directory.
polygon = gpd.read_file("Gurugram_sample_Polygon.geojson")
# Preview the first rows of the loaded GeoDataFrame
print(polygon.head())

                                            geometry
0  POLYGON ((77.05980 28.46807, 77.03740 28.44785...


In [14]:
# List the columns read from the GeoJSON file
polygon.columns

Index(['geometry'], dtype='object')

### Set constants

In [15]:
# Service tiers; the same labels are used for phlebotomist skill categories
# (create_phleb_df) and for the services requested in an order (create_orders_df)
skills = ['regular', 'premium', 'special']

# price charged per service tier (currency is not stated in this notebook)
service_pricing_dict = {
    'regular': 200,
    'premium': 300,
    'special': 400,
}

# phlebotomist hiring cost, keyed by the phlebotomist's skill category
phlebotomist_cost_dict = {
    'regular': 800,
    'premium': 900,
    'special': 1000,
}

# time taken to carry out each service
# NOTE(review): presumably minutes, like the order buffer -- confirm
service_duration_dict = {
    'regular': 15,
    'premium': 15,
    'special': 15,
}

# number of rows to simulate for each dataset; their sum is the default
# number of coordinates drawn by generate_coords
phleb_size = 30
order_size = 100
catchment_size = 5

# Seed for the NumPy generators in the create_*_df functions; it is also
# embedded in the output CSV file names.
# To draw a fresh seed instead, swap the two lines below:
# seed = random.randint(0, 9999)
seed = 1576

### Functions

In [16]:
def generate_coords(size=phleb_size+order_size+catchment_size, seed=None):
    """Draw `size` unique random points that fall inside the loaded polygon(s).

    Points are rejection-sampled: a candidate is drawn uniformly from the
    bounding box of the module-level `polygon` GeoDataFrame and kept only if
    at least one polygon row contains it and the exact (x, y) pair has not
    been drawn before.

    Parameters
    ----------
    size : int
        Number of points to return. Defaults to the combined number of
        phlebotomists, orders and catchment areas.
    seed : int or None
        When given, a private `random.Random(seed)` is used so the result is
        reproducible. When None (default) the module-level `random` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns 'x' and 'y' (used downstream as longitude and latitude), one
        row per point, with no duplicated coordinate pairs.

    Notes
    -----
    The loop does not terminate if the polygon has no area that can contain
    a point, because no candidate would ever be accepted.
    """
    sampler = random if seed is None else random.Random(seed)

    # bounds of geodata
    x_min, y_min, x_max, y_max = polygon.total_bounds

    accepted = []       # (x, y) pairs in draw order
    seen = set()        # same pairs, for O(1) duplicate rejection

    while len(accepted) < size:
        # generate a random candidate within the bounds (x first, then y)
        x = sampler.uniform(x_min, x_max)
        y = sampler.uniform(y_min, y_max)
        if (x, y) in seen:
            # exact duplicate: discard and redraw instead of repairing later
            continue
        if polygon.contains(Point(x, y)).any():
            seen.add((x, y))
            accepted.append((x, y))

    coords_df = pd.DataFrame(accepted, columns=['x', 'y'])

    # Sanity report; always False because duplicates are rejected while sampling
    has_duplicates = bool(coords_df.duplicated(subset=['x', 'y'], keep='first').any())
    print(f"Presence of duplicates: {has_duplicates}")

    return coords_df

In [17]:
# Draw one pool of points, then carve it into three consecutive,
# non-overlapping positional slices: orders, phlebotomist homes, catchments.
coords_df = generate_coords()
order_coords = coords_df.iloc[:order_size]
phleb_home_coords = coords_df.iloc[order_size:order_size + phleb_size]
catchment_coords = coords_df.iloc[order_size + phleb_size:]

Presence of duplicates: False


In [18]:
def create_phleb_df(size=phleb_size, seed=seed):
    """Simulate the phlebotomist roster and write it to CSV.

    Parameters
    ----------
    size : int
        Number of phlebotomists. Must equal the number of rows in the
        module-level `phleb_home_coords`; otherwise pandas raises a
        ValueError when the coordinates are assigned.
    seed : int
        Seed for the NumPy random generator; also embedded in the output
        file name.

    Returns
    -------
    pandas.DataFrame
        One row per phlebotomist with columns shift_start, break_start,
        shift_end, cost, expertise_premium, expertise_regular,
        expertise_special, capacity, service_rating, long, lat, gender and
        phleb_id.

    Side effects
    ------------
    Writes "../Simulated Data/phleb_data_{seed}.csv" (the directory must
    already exist).
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()
    # start shift either at 6 or 7
    df['shift_start'] = rng.integers(6, 8, size)

    # break starts after 4 hours of work
    df['break_start'] = df['shift_start'] + 4

    # shift ends after 8 hours
    df['shift_end'] = df['shift_start'] + 8

    # choose one skill category per phlebotomist
    skill_cat = rng.choice(skills, size)
    # cost of hiring depends on the category
    df['cost'] = [phlebotomist_cost_dict[cat] for cat in skill_cat.tolist()]

    # One-hot encode the category. The Categorical carries every skill, so all
    # expertise_* columns exist even if a skill was never drawn, and dtype=int
    # keeps 0/1 integers on pandas versions where get_dummies defaults to bool.
    expertise = pd.get_dummies(
        pd.Categorical(skill_cat, categories=sorted(skills)),
        prefix='expertise',
        dtype=int,
    )
    df = pd.concat([df, expertise], axis=1)

    # expertise is cumulative: special implies premium, and either implies regular
    df.loc[df['expertise_special'] == 1, 'expertise_premium'] = 1
    df.loc[(df['expertise_special'] == 1) | (df['expertise_premium'] == 1), 'expertise_regular'] = 1

    # carrying capacity
    df['capacity'] = 20

    # service rating: log-normal draws rescaled so the highest rating is 5.0,
    # then rounded to one decimal place
    df['service_rating'] = rng.lognormal(mean=4.5, sigma=0.1, size=size)
    df['service_rating'] = df['service_rating'] / df['service_rating'].max() * 5.0
    df['service_rating'] = df['service_rating'].round(decimals=1)

    # coordinates of phlebo's home
    df['long'] = phleb_home_coords['x'].to_numpy()
    df['lat'] = phleb_home_coords['y'].to_numpy()

    # gender, male = 0, female = 1
    df['gender'] = rng.choice(a=[0, 1], size=size, p=[0.7, 0.3])  # 70% of phlebotomists are males

    # set phleb id
    df['phleb_id'] = df.index

    df.to_csv(f"../Simulated Data/phleb_data_{seed}.csv", index=False)
    return df

In [19]:
def create_catchment_df(size=catchment_size, seed=seed):
    """Build the catchment-area table from the pre-drawn coordinates and write it to CSV.

    Parameters
    ----------
    size : int
        Accepted for symmetry with the other create_*_df functions but not
        used: the number of rows is whatever the module-level
        `catchment_coords` contains.
    seed : int
        Only used to name the output file; nothing random is drawn here.

    Returns
    -------
    pandas.DataFrame
        Columns 'long' and 'lat', one row per catchment area.

    Side effects
    ------------
    Writes "../Simulated Data/catchment_data_{seed}.csv" (the directory must
    already exist).
    """
    # coordinates
    df = pd.DataFrame({
        'long': catchment_coords['x'].to_numpy(),
        'lat': catchment_coords['y'].to_numpy(),
    })

    df.to_csv(f"../Simulated Data/catchment_data_{seed}.csv", index=False)
    return df

In [20]:
def create_orders_df(size=order_size, seed=seed):
    """Simulate customer orders and write them to CSV.

    Parameters
    ----------
    size : int
        Number of orders. Must equal the number of rows in the module-level
        `order_coords`; otherwise pandas raises a ValueError when the
        coordinates are assigned.
    seed : int
        Seed for the NumPy random generator; also embedded in the output
        file name.

    Returns
    -------
    pandas.DataFrame
        One row per order with columns order_start, service_premium,
        service_regular, service_special, duration, price, buffer,
        capacity_needed, long, lat, order_id and requested_female.

    Side effects
    ------------
    Writes "../Simulated Data/order_data_{seed}.csv" (the directory must
    already exist).
    """
    # random generator
    rng = np.random.default_rng(seed)

    df = pd.DataFrame()

    # array of hours to choose from
    hours = np.arange(6, 15)  # order start hours 6 through 14 (6am to 2pm)
    # relative weight of each hour: hours 6-10 are favoured over 11-14
    p_hour = np.array([0.6, 0.6, 0.6, 0.6, 0.6, 0.4, 0.4, 0.4, 0.4])
    # the weights are scaled so that they sum to 1
    p_hour /= p_hour.sum()
    df['order_start'] = rng.choice(hours, size=size, p=p_hour)

    # number of distinct services per order; each order has at least one
    num_services = rng.integers(1, len(skills) + 1, size)
    # pick that many distinct services for each order, one draw per row in order
    chosen = [
        set(rng.choice(skills, n, replace=False, shuffle=True).tolist())
        for n in num_services
    ]
    # One-hot encode directly from the picks. Every service_* column is always
    # created (alphabetical order), even for a service no order happened to pick.
    for skill in sorted(skills):
        df[f"service_{skill}"] = [int(skill in picked) for picked in chosen]

    # calculate order duration and price as sums over the selected services
    df['duration'] = 0  # initialise column
    df['price'] = 0  # initialise column
    for skill in skills:
        df['duration'] += service_duration_dict[skill] * df[f"service_{skill}"]
        df['price'] += service_pricing_dict[skill] * df[f"service_{skill}"]

    # buffer time between 10 and 15mins
    df['buffer'] = rng.integers(10, 16, size)

    # one unit of carrying capacity per requested service
    df['capacity_needed'] = num_services

    # coordinates
    df['long'] = order_coords['x'].to_numpy()
    df['lat'] = order_coords['y'].to_numpy()

    df['order_id'] = df.index

    # 1 = customer requested a female phlebotomist (20% of orders), 0 = no preference stated
    df['requested_female'] = rng.choice(a=[0, 1], size=size, p=[0.8, 0.2])

    # NOTE: a last-minute order-cancellation simulation was drafted here
    # earlier as commented-out code; it is not part of the generated data.

    df.to_csv(f"../Simulated Data/order_data_{seed}.csv", index=False)
    return df

### Generate CSV files

In [21]:
# Run the three generators; each one writes its own CSV (named after the seed)
# and returns the DataFrame it wrote.
print(f"Seed: {seed}")
phleb = create_phleb_df()
orders = create_orders_df()
catchment_areas = create_catchment_df()

Seed: 1576


In [22]:
# Fraction (0-1, not a percentage) of orders whose start hour is 10 or earlier,
# i.e. the hours given the higher weight in create_orders_df
len(orders[orders['order_start'] <= 10]) / len(orders)

0.74

In [23]:
# Display the simulated orders table
orders

Unnamed: 0,order_start,service_premium,service_regular,service_special,duration,price,buffer,capacity_needed,long,lat,order_id,requested_female
0,14,1,0,1,30,700,11,2,77.113403,28.427224,0,0
1,9,0,1,1,30,600,13,2,77.077945,28.438187,1,0
2,8,0,1,0,15,200,11,1,77.071593,28.420298,2,0
3,14,0,1,1,30,600,12,2,77.121457,28.433804,3,0
4,14,1,0,1,30,700,10,2,77.055444,28.444915,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,8,1,0,0,15,300,11,1,77.083691,28.493998,95,0
96,9,1,0,0,15,300,11,1,77.129749,28.477212,96,0
97,12,0,1,0,15,200,12,1,77.141962,28.459857,97,0
98,13,1,1,1,45,900,11,3,77.116182,28.424851,98,0


In [24]:
# Display the simulated catchment-area coordinates
catchment_areas

Unnamed: 0,long,lat
0,77.119802,28.46164
1,77.107803,28.445355
2,77.075972,28.4875
3,77.046794,28.453942
4,77.061358,28.462278


In [25]:
# Display the simulated phlebotomist roster
phleb

Unnamed: 0,shift_start,break_start,shift_end,cost,expertise_premium,expertise_regular,expertise_special,capacity,service_rating,long,lat,gender,phleb_id
0,7,11,15,1000,1,1,1,20,3.5,77.113253,28.431043,0,0
1,7,11,15,800,0,1,0,20,4.0,77.095634,28.438047,0,1
2,7,11,15,800,0,1,0,20,4.1,77.102681,28.429497,0,2
3,6,10,14,1000,1,1,1,20,4.2,77.090672,28.43631,0,3
4,6,10,14,1000,1,1,1,20,4.6,77.08525,28.421829,0,4
5,6,10,14,1000,1,1,1,20,4.1,77.109977,28.438662,0,5
6,6,10,14,1000,1,1,1,20,4.3,77.081469,28.481428,0,6
7,7,11,15,800,0,1,0,20,4.2,77.073236,28.429363,0,7
8,7,11,15,800,0,1,0,20,4.5,77.129152,28.480519,0,8
9,7,11,15,800,0,1,0,20,4.3,77.117988,28.450556,1,9


In [26]:
from shapely.geometry import Point

In [27]:
# Sanity check: count how many generated points fall inside / outside the
# loaded polygon(s). Every point is expected to be inside.
n_true = 0
n_false = 0

for x, y in zip(coords_df['x'], coords_df['y']):
    # .any() matches the acceptance test in generate_coords: a point counts as
    # inside when any polygon row contains it, not only the row labelled 0.
    if polygon.contains(Point(x, y)).any():
        n_true += 1
    else:
        n_false += 1

print(n_true, n_false)

135 0
