In [1]:
# !git clone https://github.com/dsa-playground/tvt2024.git
# %cd /content/tvt2024/
# !git pull
# !pip install -r requirements.txt -t "tvt2024"
# !pip install pyaml-env

In [1]:
# Imports
from faker import Faker
import pandas as pd
import numpy as np
import scipy.stats as stats
import plotly.graph_objects as go
import random

In [2]:
# Functions
def create_norm_data(lower_limit, upper_limit, mu, sigma, sample_size, random_state=None):
    """Draw integer samples from a truncated normal distribution.

    Parameters
    ----------
    lower_limit, upper_limit : float
        Bounds of the truncation interval; ``lower_limit`` must be smaller
        than ``upper_limit``.
    mu : float
        Mean of the underlying (untruncated) normal distribution.
    sigma : float
        Standard deviation of the underlying normal distribution; must be > 0.
    sample_size : int or tuple of int or None
        Number (or shape) of samples, forwarded as ``size`` to
        ``scipy.stats.truncnorm.rvs``.
    random_state : optional
        Forwarded to ``scipy.stats.truncnorm.rvs``. Pass an int or a NumPy
        generator for a reproducible draw; ``None`` keeps the previous
        behaviour of using NumPy's global random state.

    Returns
    -------
    list
        The samples cast to integers (fractional part dropped).

    Raises
    ------
    ValueError
        If ``sigma`` is not positive or the interval is empty.
    """
    # Example values: lower_limit=5, upper_limit=25, mu=24, sigma=10, sample_size=10000
    if sigma <= 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")
    if lower_limit >= upper_limit:
        raise ValueError(
            f"lower_limit ({lower_limit!r}) must be smaller than upper_limit ({upper_limit!r})"
        )

    # truncnorm expects its bounds expressed in standard deviations from the mean
    a, b = (lower_limit - mu) / sigma, (upper_limit - mu) / sigma

    # Generate the random samples
    random_samples = stats.truncnorm.rvs(
        a, b, loc=mu, scale=sigma, size=sample_size, random_state=random_state
    )

    # Convert to integers in one vectorized cast; atleast_1d keeps a scalar
    # draw (sample_size=None) iterable so a list is always returned.
    return list(np.atleast_1d(random_samples).astype("int"))



def get_random_time(row, times_lookup=None):
    """Pick a random duration for the employee/task combination in ``row``.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'employee'`` and ``'task'`` (e.g. a DataFrame
        row passed by ``df.apply(..., axis=1)``).
    times_lookup : dict, optional
        Nested mapping ``{employee: {task: sequence_of_times}}``. When omitted,
        the module-level ``times`` table is used, so ``times`` must be defined
        before the first call.

    Returns
    -------
    One element of the matching time sequence, chosen with ``random.choice``.

    Raises
    ------
    IndexError
        If no time samples exist for the employee/task combination.
    """
    if times_lookup is None:
        times_lookup = times

    employee = row['employee']
    task = row['task']
    candidates = times_lookup.get(employee, {}).get(task, [])

    # random.choice on an empty sequence fails with a message that hides which
    # combination is missing; raise the same exception type with context.
    if len(candidates) == 0:
        raise IndexError(
            f"No time samples configured for employee {employee!r} and task {task!r}"
        )
    return random.choice(candidates)


In [3]:
# Settings
# Seed every random source used in this notebook so that
# "Restart Kernel -> Run All" regenerates exactly the same data set.
SEED = 42
random.seed(SEED)     # random.choice in get_random_time
np.random.seed(SEED)  # scipy's truncnorm.rvs uses NumPy's global state by default
Faker.seed(SEED)      # shared generator behind every Faker instance
fake = Faker('nl_NL')

# Each client is served during one contiguous period; both employees share them.
SERVICE_PERIOD_CLIENT1 = pd.date_range(start='2022-12-20', end='2023-06-06')
SERVICE_PERIOD_CLIENT2 = pd.date_range(start='2023-06-07', end='2023-12-20')

# Define the rules:
# employee -> working days, and per client which task happens on which weekday
# (English day names) during that client's service period.
rules = {
    'employee1': {
        'work_days': ['Monday', 'Tuesday'],
        'clients': {
            'client1': {
                'service_days': {
                    'wash': ['Monday', 'Tuesday'],
                    'medicine': ['Tuesday']
                },
                'service_period': SERVICE_PERIOD_CLIENT1
            },
            'client2': {
                'service_days': {
                    'wash': ['Monday', 'Tuesday'],
                    'medicine': ['Monday']
                },
                'service_period': SERVICE_PERIOD_CLIENT2
            }
        }
    },
    'employee2': {
        'work_days': ['Wednesday', 'Friday'],
        'clients': {
            'client1': {
                'service_days': {
                    'wash': ['Wednesday', 'Friday'],
                    'medicine': ['Wednesday']
                },
                'service_period': SERVICE_PERIOD_CLIENT1
            },
            'client2': {
                'service_days': {
                    'wash': ['Wednesday', 'Friday'],
                    'medicine': ['Friday']
                },
                'service_period': SERVICE_PERIOD_CLIENT2
            }
        }
    }

}

# Pools of task durations per employee and task, drawn from truncated normal
# distributions; get_random_time later picks one value per generated record.
# NOTE(review): employee1_medicine uses sigma=10 on a 1-10 interval while
# employee2_medicine uses sigma=2 -- confirm the wide spread is intended.
employee1_washing = create_norm_data(lower_limit=5, upper_limit=25, mu=24, sigma=10, sample_size=10000)
employee1_medicine = create_norm_data(lower_limit=1, upper_limit=10, mu=5, sigma=10, sample_size=10000)
employee2_washing = create_norm_data(lower_limit=15, upper_limit=25, mu=20, sigma=10, sample_size=10000)
employee2_medicine = create_norm_data(lower_limit=2, upper_limit=10, mu=5, sigma=2, sample_size=10000)

# Lookup table: employee -> task -> pool of possible durations
times = {
    'employee1': {
        'wash': employee1_washing,
        'medicine': employee1_medicine
    },
    'employee2': {
        'wash': employee2_washing,
        'medicine': employee2_medicine
    }
    # Add more employees and tasks as needed
}

In [4]:
# df_target = pd.DataFrame(
#     {"Client":[],
#      "Medewerker": [],
#      "Datum": [],
#      "Taak": [],
#      "Duur": [],
#      "Locatie": [],
#      }
# )
# df_target

In [5]:
# # Generate data
# list_client_names = ["Meneer "+fake.last_name() for i in range(0,3)] + ["Mevrouw "+fake.last_name() for i in range(0,3)]
# list_employee_names = [fake.first_name_female() for i in range(0,2)] + [fake.first_name_male() for i in range(0,1)]
# dict_tasks = {"Huishouden": {0: "Maandag",
#                             1: "Maandag",
#                             2: "Dinsdag",
#                             3: "Dinsdag",
#                             4: "Woensdag",
#                             5: "Woensdag"}, 
#               "Medicijnen toedienen": {0: "Maandag",
#                             1: "Maandag",
#                             2: "Dinsdag",
#                             3: "Dinsdag",
#                             4: "Woensdag",
#                             5: "Woensdag"}, 
#               "Wassen": {0: "Maandag",
#                             1: "Maandag",
#                             2: "Dinsdag",
#                             3: "Dinsdag",
#                             4: "Woensdag",
#                             5: "Woensdag"}}
# list_locations = ["Verpleeghuis Den Bosch", "Thuiszorg"]

# # Generate dates
# display([fake.date_between(start_date='-3y', end_date='today') for i in range(0,10)])
# # Easier via pandas? pd.date_range(start='2020-01-01', end='2020-12-31', freq='D')

# # 

# # Generate times for tasks
# ## Use statsmodels?
# # Difference between locations
# # Tasks on Monday morning take longer
# # Trend over time: steadily shorter
# # Seasonal pattern
# # Insert outliers



In [7]:
# client_names

In [8]:
# # Create a histogram
# histogram = go.Histogram(
#     x=random_samples,
#     name='Histogram',
#     opacity=0.7,
#     nbinsx=30,  # you can change the number of bins as needed
#     histnorm='probability density'
# )

# # Create a trace (line) for the normal distribution
# mu, sigma = stats.norm.fit(random_samples)  # get mean and standard deviation
# x = np.linspace(min(random_samples), max(random_samples), 10000)
# y = stats.norm.pdf(x, mu, sigma)

# trace = go.Scatter(
#     x=x,
#     y=y,
#     mode='lines',
#     name='Normal Distribution'
# )

# # Create a layout
# layout = go.Layout(
#     title='Histogram with Normal Distribution',
#     xaxis=dict(title='Value'),
#     yaxis=dict(title='Density'),
#     bargap=0.2,
#     bargroupgap=0.1
# )

# fig = go.Figure(data=[histogram, trace], layout=layout)

# fig.show()

In [6]:
# Generate the data: one record per (employee, client, date, task) that the
# rules allow.
data = []
for employee, employee_info in rules.items():
    work_days = employee_info['work_days']
    for client, client_info in employee_info['clients'].items():
        service_days = client_info.get('service_days', {})
        for date in client_info['service_period']:
            # day_name() returns English names by default, matching the names
            # used in `rules`; strftime('%A') depends on the active locale.
            weekday = date.day_name()
            if weekday not in work_days:
                continue
            # Iterate over every configured task instead of hard-coding
            # 'wash' and 'medicine', so new tasks in `rules` are picked up.
            for task, task_days in service_days.items():
                if weekday in task_days:
                    data.append({
                        'employee': employee,
                        'client': client,
                        'date': date,
                        'task': task
                    })

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Add times according to distribution (one random draw per record)
df['time'] = df.apply(get_random_time, axis=1)

In [7]:
# Display the generated records in chronological order; sort_values returns a
# new frame, so `df` itself keeps its original row order.
df.sort_values(by="date")

Unnamed: 0,employee,client,date,task,time
0,employee1,client1,2022-12-20,wash,7
1,employee1,client1,2022-12-20,medicine,6
159,employee2,client1,2022-12-21,medicine,4
158,employee2,client1,2022-12-21,wash,20
160,employee2,client1,2022-12-23,wash,21
...,...,...,...,...,...
313,employee2,client2,2023-12-15,medicine,5
155,employee1,client2,2023-12-18,wash,23
156,employee1,client2,2023-12-18,medicine,8
157,employee1,client2,2023-12-19,wash,6
