In [1]:
# Installs pandas into the kernel's environment. The version is not pinned;
# the recorded output of this cell shows pandas 2.1.4 being installed.
%pip install pandas

Collecting pandas
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.3/12.3 MB 3.9 MB/s eta 0:00:00
Collecting tzdata>=2022.1
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting pytz>=2020.1
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)
Collecting numpy<2,>=1.22.4
  Using cached numpy-1.26.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.2 pandas-2.1.4 pytz-2023.3.post1 tzdata-2023.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from datetime import datetime, timedelta
import random

In [3]:
def generate_sample_event_log(num_cases, seed=None):
    """Generate a synthetic, strictly linear event log and save it as CSV.

    Every case runs through the same five activities in the same order
    (Start, TaskA, TaskB, TaskC, End). The first activity of a case is
    stamped with the wall-clock time at which the case is generated, and
    each following activity occurs a random whole number of hours (1-5)
    after the previous one.

    The log is written to ``sample_event_log_{num_cases}_cases.csv`` in the
    current working directory with the columns ``CaseID``, ``Activity`` and
    ``Timestamp``, and a confirmation message is printed.

    Parameters
    ----------
    num_cases : int
        Number of cases to generate; cases are named ``Case1`` ..
        ``Case{num_cases}``. A value below 1 yields a header-only file.
    seed : int, optional
        Seed for a private random generator so that the hour gaps between
        activities are reproducible across runs. When omitted, the
        module-level ``random`` generator is used, exactly as before.
        Absolute timestamps still depend on the time of the call.

    Returns
    -------
    None
    """
    # A private generator keeps seeded runs isolated from global random state.
    rng = random if seed is None else random.Random(seed)

    activities = ['Start', 'TaskA', 'TaskB', 'TaskC', 'End']

    records = []
    for case_number in range(1, num_cases + 1):
        case_id = f'Case{case_number}'
        current_time = datetime.now()
        for activity in activities:
            records.append((case_id, activity, current_time))
            # The gap drawn here separates this activity from the next one.
            current_time += timedelta(hours=rng.randint(1, 5))

    # Build the frame once from the collected records
    columns = ['CaseID', 'Activity', 'Timestamp']
    df = pd.DataFrame(records, columns=columns)

    # Save the event log to a CSV file
    filename = f'sample_event_log_{num_cases}_cases.csv'
    df.to_csv(filename, index=False)
    print(f"Event log with {num_cases} cases generated and saved as '{filename}'.")


In [4]:
# Example: generate an event log with 1000 cases.
# The call writes 'sample_event_log_1000_cases.csv' to the current working directory.
num_cases = 1000
generate_sample_event_log(num_cases)

Event log with 1000 cases generated and saved as 'sample_event_log_1000_cases.csv'.


In [5]:
def generate_complex_event_log(num_cases, seed=None):
    """Generate a synthetic event log with a decision step and save it as CSV.

    Every case runs through Start, TaskA, TaskB, TaskC, Decision, TaskD,
    TaskE and End in that order. Directly after ``Decision`` an extra event
    ``Decision_Option1`` or ``Decision_Option2`` (chosen at random) is
    logged, so each case contributes nine rows.

    The first activity of a case is stamped with the wall-clock time at
    which the case is generated. After every activity except ``Decision``
    the clock advances by a random whole number of hours (1-5).

    NOTE: the decision is instantaneous. ``Decision``, its outcome event
    and the following ``TaskD`` all share one timestamp, so ordering by
    timestamp alone cannot separate them. The outcome also does not change
    the path: TaskD and TaskE are always both executed.

    The log is written to ``complex_event_log_{num_cases}_cases.csv`` in the
    current working directory with the columns ``CaseID``, ``Activity`` and
    ``Timestamp``, and a confirmation message is printed.

    Parameters
    ----------
    num_cases : int
        Number of cases to generate; cases are named ``Case1`` ..
        ``Case{num_cases}``. A value below 1 yields a header-only file.
    seed : int, optional
        Seed for a private random generator so that the decision outcomes
        and hour gaps are reproducible across runs. When omitted, the
        module-level ``random`` generator is used, exactly as before.
        Absolute timestamps still depend on the time of the call.

    Returns
    -------
    None
    """
    # A private generator keeps seeded runs isolated from global random state.
    rng = random if seed is None else random.Random(seed)

    activities = ['Start', 'TaskA', 'TaskB', 'TaskC', 'Decision', 'TaskD', 'TaskE', 'End']

    records = []
    for case_number in range(1, num_cases + 1):
        case_id = f'Case{case_number}'
        current_time = datetime.now()
        for activity in activities:
            records.append((case_id, activity, current_time))
            if activity == 'Decision':
                # Log the outcome as its own event at the same instant;
                # the clock is deliberately left untouched here.
                outcome = rng.choice(['Option1', 'Option2'])
                records.append((case_id, f'Decision_{outcome}', current_time))
            else:
                # Every other activity (including Start and End) is followed
                # by a random 1-5 hour gap.
                current_time += timedelta(hours=rng.randint(1, 5))

    # Build the frame once from the collected records
    columns = ['CaseID', 'Activity', 'Timestamp']
    df = pd.DataFrame(records, columns=columns)

    # Save the event log to a CSV file
    filename = f'complex_event_log_{num_cases}_cases.csv'
    df.to_csv(filename, index=False)
    print(f"Event log with {num_cases} cases generated and saved as '{filename}'.")



In [6]:
# Example: generate a more complex event log with 1000 cases.
# The call writes 'complex_event_log_1000_cases.csv' to the current working directory.
num_cases=1000
generate_complex_event_log(num_cases)

Event log with 1000 cases generated and saved as 'complex_event_log_1000_cases.csv'.
