In [None]:
# needed: Python 3.10-3.11

# requirements:
## pm4py>=2.7.11.13
## duckdb
## openai
## crewai
## crewai[tools]
## langchain
## langchain-openai

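# you can install them with the command (the quotes keep the shell from treating '>=' as an output redirect and '[tools]' as a glob pattern): pip install -U "pm4py>=2.7.11.13" duckdb openai crewai "crewai[tools]" langchain langchain-openai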

# in Colab, you can uncomment the following line:
# !pip install "pm4py>=2.7.11.13" duckdb openai crewai "crewai[tools]" langchain langchain-openai

In [None]:
# imports the necessary requirements
from crewai import Crew, Agent, Task
from langchain_openai import ChatOpenAI
import pm4py
import duckdb
import os
from crewai_tools import tool
import pickle

In [None]:
# sets the API key (always needed!)
# The key is read from a local text file. The context manager closes the file
# handle right away, and strip() removes the trailing newline that most editors
# append, which would otherwise become part of the key value.
with open("api_key.txt", "r") as api_key_file:
    os.environ["OPENAI_API_KEY"] = api_key_file.read().strip()

# sets the address of the APIs and the required model
# (exactly one of the 'llm = ...' lines below should be active)

# DeepInfra Qwen2-72B-Instruct
llm = ChatOpenAI(model="Qwen/Qwen2-72B-Instruct", base_url="https://api.deepinfra.com/v1/openai/")

# OpenAI's GPT-4o
#llm = ChatOpenAI(model="gpt-4o", base_url="https://api.openai.com/v1")

# OpenAI's GPT-3.5-turbo
#llm = ChatOpenAI(model="gpt-3.5-turbo", base_url="https://api.openai.com/v1")

In [None]:
# Shared state for the tool functions defined below: a plain dictionary that
# the tools read from and write to (event log, protected / non-protected groups).

global_variables = dict()

In [None]:
# define the tool that returns the prompt asking for the SQL query to filter the 'protected' group (more subject to discrimination)

@tool("Prompt containing the description of the attributes of event log.")
def get_prompt_base_description_of_event_log() -> str:
    """
    Returns the prompt that can be used to get the base description of the attributes of the event log.

    Returns
    --------------
    base_description_prompt
        Base description prompt
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; crewai's
    # @tool decorator presumably uses it as the tool description shown to the
    # agent, so rewording it would change runtime behavior. Confirm before editing.

    # Snapshot of the shared state, written on every tool call. The context
    # manager guarantees the file handle is closed even if pickling fails.
    with open("global_variables.dump", "wb") as dump_file:
        pickle.dump(global_variables, dump_file)

    # The event log must already be stored under 'dataframe' (KeyError otherwise).
    event_log = global_variables['dataframe']

    # The prompt is assembled from two textual abstractions of the log
    # (directly-follows graph, attribute list) followed by the actual request.
    prompt = "Given an event log describing a process with the following directly-follows graph:\n\n"
    prompt += pm4py.llm.abstract_dfg(event_log, max_len=5000, response_header=False)
    prompt += "\n\nand the following attributes:\n\n"
    prompt += pm4py.llm.abstract_log_attributes(event_log, max_len=5000)
    prompt += "\n\nCould you make an hypothesis and provide me a DuckDB SQL query that I can execute to filter the dataframe on the cases in which unfairness is likely to happen? It can be an OR query (several different conditions could signal unfairness) The dataframe is called 'dataframe'. Please only a single query. The single query should only look at the case attributes, not the activities in a case. Quote the names of the attributes with a \" at start and at the end."
    # The ```sql fence requested here is what execute_sql_query splits on.
    prompt += "\nThe SQL query has to be contained between the tags ```sql and ```"
    return prompt

In [None]:
# define the tool that can be used to execute the SQL query. The 'protected' and 'non-protected' cases are identified and saved.

@tool("Executes the provided SQL query.")
def execute_sql_query(sql_query: str):
    """
    Executes the given SQL query.

    Parameters
    ----------------
    sql_query
        SQL query
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; crewai's
    # @tool decorator presumably uses it as the tool description shown to the
    # agent, so rewording it would change runtime behavior. Confirm before editing.

    # Snapshot of the shared state, written on every tool call. The context
    # manager guarantees the file handle is closed even if pickling fails.
    with open("global_variables.dump", "wb") as dump_file:
        pickle.dump(global_variables, dump_file)

    # Keep the raw input received from the agent on disk for later inspection.
    with open("sql.txt", "w") as sql_file:
        sql_file.write(str(sql_query))

    # The agent may wrap the query in a ```sql ... ``` fence, possibly with text
    # around it: keep only the fenced part. Without a fence the whole input is used.
    sql_query = str(sql_query).split("```sql")[-1].split("```")[0].strip()

    # DuckDB resolves the table name 'dataframe' used inside the query by looking
    # up a DataFrame variable with that name in the Python scope, so this local
    # variable must keep exactly this name even though it looks unused.
    dataframe = global_variables['dataframe']
    query_result = duckdb.sql(sql_query).to_df()

    if "case:concept:name" not in query_result.columns:
        # Same exception type as the plain column lookup would raise, but with a
        # message the agent can act on.
        raise KeyError("the result of the SQL query must contain the column \"case:concept:name\"")

    # Only the case identifiers are taken from the query result. Both groups are
    # then cut from the full event log, so each case keeps all of its events and
    # columns even if the query projected columns or matched single events, and
    # the two groups are an exact partition of the log by case.
    cases = query_result["case:concept:name"].unique()
    is_protected = dataframe["case:concept:name"].isin(cases)
    global_variables['dataframe_prot'] = dataframe[is_protected]
    global_variables['dataframe_nonprot'] = dataframe[~is_protected]

In [None]:
# define the tool that can be used to obtain the prompt to compare the behavior in the two groups

@tool("Prompt for process comparison between two groups.")
def get_prompt_for_process_comparison() -> str:
    """
    Gets the prompt that should be used for process comparison.

    Returns
    --------------
    process_comparison_prompt
        Process comparison prompt
    """
    # NOTE(review): the docstring above is kept verbatim on purpose; crewai's
    # @tool decorator presumably uses it as the tool description shown to the
    # agent, so rewording it would change runtime behavior. Confirm before editing.

    # Snapshot of the shared state, written on every tool call. The context
    # manager guarantees the file handle is closed even if pickling fails.
    with open("global_variables.dump", "wb") as dump_file:
        pickle.dump(global_variables, dump_file)

    # Both groups are stored by execute_sql_query. If that tool has not run yet,
    # fail with the same exception type as the plain lookup, but with a message
    # that says what is missing.
    if 'dataframe_prot' not in global_variables or 'dataframe_nonprot' not in global_variables:
        raise KeyError("the protected and non-protected groups are not available: execute the SQL query first")

    dataframe_prot = global_variables['dataframe_prot']
    dataframe_nonprot = global_variables['dataframe_nonprot']

    # The prompt lists the process variants of each group, one group after the other.
    prompt = "I want to identify the unfair differences between the treatment of the 'protected' group (first) and the 'unprotected' group (second). I report the process variants. Each process variant is also reported with its execution time."
    prompt += "\n\nProcess variants of the protected group:"
    prompt += pm4py.llm.abstract_variants(dataframe_prot, max_len=5000, response_header=False)
    prompt += "\n\nProcess variants of the unprotected group:"
    prompt += pm4py.llm.abstract_variants(dataframe_nonprot, max_len=5000, response_header=False)
    prompt += "\n\nwhich are the main differences? use your domain knowledge."
    return prompt

In [None]:
# loads the XES event log with pm4py and stores it in the shared dictionary
# under the 'dataframe' key, where the tools look it up
global_variables.update({'dataframe': pm4py.read_xes("renting_log_high.xes.gz")})

In [None]:
# Two agents take part in the analysis:
# - one specialised in identifying the protected group
# - one specialised in process comparison
# Each agent is configured with a 'role', a 'goal', a 'backstory' and the LLM defined above.

pm_protected_group_identification = Agent(
    role="protected_group_identification",
    goal="Identifying the protected group.",
    backstory="Fairness expert.",
    llm=llm,
)
pm_comparison_expert = Agent(
    role="comparison_expert",
    goal="Compare the behavior of two groups of cases.",
    backstory="Comparison expert.",
    llm=llm,
)

In [None]:
# Two tasks, one per agent:
# - identification of the protected group, which has access to two tools
#   (the prompt for the SQL query and the execution of that query)
# - comparison between the 'protected' and the 'non-protected' group, which has
#   access to the remaining tool (the process comparison prompt)
protected_group_identification = Task(
    description="Retrieve a SQL query to identify the protected cases and executes that against the event log.",
    expected_output="No output",
    agent=pm_protected_group_identification,
    tools=[
        get_prompt_base_description_of_event_log,
        execute_sql_query,
    ],
)
comparison_protected_nonprotected = Task(
    description="Comparing the behavior of the protected group against the nonprotected group.",
    expected_output="A list of discriminations",
    agent=pm_comparison_expert,
    tools=[get_prompt_for_process_comparison],
)


In [None]:
# defines the crew: both agents, with the tasks listed in the order
# identification first, comparison second

# NOTE: the API key is deliberately not printed here. Cell outputs are saved
# together with the notebook, so printing the key would expose the secret to
# anyone the file is shared with.
crew = Crew(
    agents=[pm_protected_group_identification, pm_comparison_expert],
    tasks=[protected_group_identification, comparison_protected_nonprotected],
)


In [None]:
# starts the crew
# The call is the last expression of the cell, so the notebook displays
# whatever kickoff() returns.

crew.kickoff()