#### End-to-end workflow for AI-assisted requirement reviews and revisions

In [1]:
from dotenv import dotenv_values
from src import utils

# Load config settings
DOT_ENV = dotenv_values("../.env")
config = utils.load_config("../config.yaml")

# Create a unique run-id folder to store outputs
config["FILE_LOCATIONS"]["MAIN_DATA_FOLDER"] = "../src/data"
output_directory = utils.make_output_directory(config["FILE_LOCATIONS"])

import pandas as pd
# Load requirements
df = pd.read_excel('../src/data/demo_dataset.xlsx')
requirement_col = 'requirement'
id_col = 'requirement_#'
requirements = list(df[requirement_col].values)
ids = list(df[id_col].values)
print(ids)
print(requirements)


In [3]:
from src.components import prompteval as pe
## Run evaluations
# Functions currently requiring remediation
exclude_funcs = [
    'eval_explicit_enumeration',
    'eval_follows_style_guide',
    'eval_has_correct_grammar',
    'eval_has_supporting_diagram_or_model_reference',
    'eval_is_structured_set',
    'eval_is_unique_expression',
    'eval_has_explicit_conditions_for_single_action',
    'eval_is_structured_statement'
]
# Make evaluation function config
eval_config = pe.make_eval_config(pe, exclude_funcs=exclude_funcs)
# Call the evaluations on the dataframe 
eval_df = pe.call_evals(df, col=requirement_col, eval_config=eval_config)
# Get list of failed eval functions
eval_df = pe.get_failed_evals(eval_df)
# Map the failed eval functions to rule groups (as defined in the config.yaml file)
eval_df = pe.map_failed_eval_col_to_rule_group(eval_df, eval_to_rule_map=config["SECTION_4_RULE_GROUPS"], failed_eval_col='failed_evals')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roam

In [4]:
eval_df

Unnamed: 0.1,Unnamed: 0,requirement_#,requirement,eval_acronym_consistency,eval_avoid_pronouns,eval_avoids_absolutes,eval_avoids_parentheses,eval_avoids_purpose_phrases,eval_avoids_vague_terms,eval_consistent_terms_and_units,...,eval_is_singular_statement,eval_is_solution_free,eval_logical_expressions,eval_no_oblique_symbol,eval_terms_are_defined,eval_uses_abbreviations,eval_uses_not,eval_uses_universal_qualification,failed_evals,failed_evals_rule_ids
0,0,0,The Disputes System shall record the name of t...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Abstraction, Accuracy,..."
1,1,1,The WCS system shall use appropriate nomenclat...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,"[eval_acronym_consistency, eval_avoid_pronouns...","[Uniformity_Of_Language, Completeness, Realism..."
2,2,2,The system will notify affected parties when ...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,"[eval_avoids_purpose_phrases, eval_consistent_...","[Abstraction, Quantification, Uniformity_Of_La..."
3,3,3,Application testability DESC: Test environment...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,"[eval_acronym_consistency, eval_avoids_absolut...","[Uniformity_Of_Language, Realism, Abstraction,..."
4,4,4,The product shall be platform independent.The ...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Abstraction, Accuracy,..."


In [None]:
import ast
import asyncio
from pydantic import BaseModel, Field, SecretStr
from src.components.promptrunner import RateLimitOpenAIClient
# Run prompt for requirements containing failed evaluations (asychronously with retry and backoff to overcome rate limit errors)
openai_api_key = SecretStr(str(DOT_ENV['OPENAI_API_KEY']))
openai_client = RateLimitOpenAIClient(api_key=openai_api_key.get_secret_value())
# Define data validation model
class Revision(BaseModel):
    requirement_id: int = Field(description="The original requirement id provided by the user")
    requirement: str = Field(description="The original requirement provided by the user")
    revision: str = Field(description="The revised AI-generated output requirement")
    review: str = Field(description="A summary of the thought process used to generate the revision and why it was chosen") 

# Example usage
async def run_reviewer_prompts(requirements, ids, model: str = 'gpt-4o-mini', ):
       
    req_tups = zip(requirements, ids)
    # Example of making multiple concurrent requests
    tasks = []
    for i, req_tup in enumerate(req_tups):
        tasks.append(
            openai_client.chat_completion_parse(
                model=model,
                messages=[
                    {"role":"system", "content": 'You are a meticulous requirements analyst tasked with verifying the quality and clarity of a given requirement based on a detailed checklist.'},
                    {"role":"user", "content": f'Given the following id and requirement:\n\n"""\nrequirement_id:{req_tup[1]} requirement:{req_tup[0]}\n"""\n\nPlease systematically assess the requirement against each of the following criteria, providing a clear answer (Yes/No) and a concise explanation for each:\n\n1. Is the requirement clearly stated, avoiding ambiguous or vague terms?\n2. Can all readers (technical and non-technical stakeholders) understand the requirement without confusion?\n3. Is it written in plain, simple language with no jargon or undefined acronyms?\n4. Does the requirement use active voice and a positive statement (e.g., \'The system shall…\')?\n5. Does it avoid subjective words such as \'user-friendly\', \'fast\', or \'optimal\'?\n6. Is the requirement phrased as a single, atomic statement (not combining multiple requirements)?\n7. Does the requirement address a single capability or attribute?\n8. Avoid compound requirements that include more than one functionality or condition.\n9. Split complex requirements into multiple focused ones if necessary.\n10. Is the requirement stated in such a way that it can be verified through test, inspection, or analysis?\n11. Are the acceptance criteria or measurable parameters clearly indicated or implied?\n12. Could a tester or analyst objectively determine if the requirement is met or not?\n13. Does the requirement use terminology consistent with the rest of the documentation?\n14. Are key terms defined somewhere in a glossary or within the document?\n15. Is there any conflicting wording when compared to other requirements?\n16. Is the requirement traceable back to a stakeholder need, higher-level system requirement, or project objective?\n17. Does the requirement add value to the system, or is it redundant or unnecessary?\n18. Is the rationale for the requirement clear or documented (if applicable)?\n19. Does the requirement cover all relevant conditions and constraints (e.g., operating environment, performance parameters)?\n20. Is it realistically implementable within project constraints (time, cost, technology)?\n\nProvide your answer in a numbered list with each item corresponding to the checklist criteria.'}
                ],
                response_format=Revision
            )
        )
    
    # Wait for all requests to complete
    responses = await asyncio.gather(*tasks)
    
    # Process responses
    processed_responses = []
    for i, response in enumerate(responses):
        
        output = dict(response)
        message = response.choices[0].message

        if getattr(response, "usage"):
            
            output.update(
                dict(response.usage)
            )
            
            output.update(
                dict(response.usage.completion_tokens_details)
            )
            output.update(
                dict(response.usage.prompt_tokens_details)
            )

        if getattr(message, "parsed"):
            output.update(
                dict(message.parsed)
            )
        
        print(f"\nResponse {i+1}:")
        print(output)
        processed_responses.append(output)
        
    #return processed_responses
    return processed_responses


Response 1:
{'id': 'chatcmpl-CFQTm4lqFKEQPcrFQ1kFvGCtbIFSb', 'choices': [ParsedChoice[Revision](finish_reason='stop', index=0, logprobs=None, message=ParsedChatCompletionMessage[Revision](content='{"requirement_id":0,"requirement":"The Disputes System shall record the name of the user and the date for any activity that creates or modifies the disputes case in the system. A detailed history of the actions taken on the case including the date and the user that performed the action must be maintained for auditing purposes.","revision":"The Disputes System shall record the following for each activity that creates or modifies a disputes case: the name of the user, the date of the action, and a detailed history of all actions taken on the case, including the date and the user responsible for each action. This history must be maintained for auditing purposes.","review":"The original requirement was restructured for clarity and to ensure it is atomic. The revision breaks down the requirement 

In [None]:
import pandas as pd
iternum=3
id_col = 'requirement_#'

for i in range(iternum):
    if i == 0:
        # Load requirements
        df = pd.read_excel('../src/data/demo_dataset.xlsx')
        requirement_col = 'requirement'
        # Call the evaluations on the dataframe 
        df = pe.call_evals(df, col=requirement_col, eval_config=eval_config)
        # Get list of failed eval functions
        df = pe.get_failed_evals(df)
        # Map the failed eval functions to rule groups (as defined in the config.yaml file)
        df = pe.map_failed_eval_col_to_rule_group(df, eval_to_rule_map=config["SECTION_4_RULE_GROUPS"], failed_eval_col='failed_evals')   
        # Drop requirements which pass acceptance criteria
        # At present, the criteria is len(failed_evals_rule_ids) == 0
        df = df.loc[df['failed_evals_rule_ids'].str.len() > 0]
        df.to_excel(f"{output_directory}/eval_df_initial.xlsx")
    else:
        df = revisions_df.copy()
        requirement_col = 'revision'

    requirements = list(df[requirement_col].values)
    ids = list(df[id_col].values)     
    revisions = asyncio.run(run_reviewer_prompts(requirements, ids))
    revisions_df = pd.DataFrame(revisions)
    # Call the evaluations on the dataframe 
    eval_df = pe.call_evals(revisions_df, col=requirement_col, eval_config=eval_config)
    # Get list of failed eval functions
    eval_df = pe.get_failed_evals(eval_df)
    # Map the failed eval functions to rule groups (as defined in the config.yaml file)
    eval_df = pe.map_failed_eval_col_to_rule_group(eval_df, eval_to_rule_map=config["SECTION_4_RULE_GROUPS"], failed_eval_col='failed_evals')
    # Drop requirements which pass acceptance criteria
    # At present, the criteria is len(failed_evals_rule_ids) == 0
    eval_df = eval_df.loc[eval_df['failed_evals_rule_ids'].str.len() > 0]
    eval_df.to_excel(f"{output_directory}/eval_df_iter_{i+1}.xlsx")

In [11]:
# Call the evaluations on the dataframe 
eval_df = pe.call_evals(revisions_df, col='revision', eval_config=eval_config)
# Get list of failed eval functions
eval_df = pe.get_failed_evals(eval_df)
# Map the failed eval functions to rule groups (as defined in the config.yaml file)
eval_df = pe.map_failed_eval_col_to_rule_group(eval_df, eval_to_rule_map=config["SECTION_4_RULE_GROUPS"], failed_eval_col='failed_evals')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_dat

In [12]:
eval_df

Unnamed: 0,id,choices,created,model,object,service_tier,system_fingerprint,usage,completion_tokens,prompt_tokens,...,eval_is_singular_statement,eval_is_solution_free,eval_logical_expressions,eval_no_oblique_symbol,eval_terms_are_defined,eval_uses_abbreviations,eval_uses_not,eval_uses_universal_qualification,failed_evals,failed_evals_rule_ids
0,chatcmpl-CFQTm4lqFKEQPcrFQ1kFvGCtbIFSb,"[ParsedChoice[Revision](finish_reason='stop', ...",1757792742,gpt-4o-mini-2024-07-18,chat.completion,default,fp_560af6e559,"CompletionUsage(completion_tokens=209, prompt_...",209,611,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Abstraction, Quantific..."
1,chatcmpl-CFQTm0x7TD4hGS8AIKXxyAks2ptg8,"[ParsedChoice[Revision](finish_reason='stop', ...",1757792742,gpt-4o-mini-2024-07-18,chat.completion,default,fp_560af6e559,"CompletionUsage(completion_tokens=470, prompt_...",470,585,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,"[eval_acronym_consistency, eval_avoid_pronouns...","[Uniformity_Of_Language, Completeness, Realism..."
2,chatcmpl-CFQTlfJz17czJjm3SpLQB29eT8MYl,"[ParsedChoice[Revision](finish_reason='stop', ...",1757792741,gpt-4o-mini-2024-07-18,chat.completion,default,fp_560af6e559,"CompletionUsage(completion_tokens=175, prompt_...",175,582,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Quantification, Unifor..."
3,chatcmpl-CFQTl173N1ooshvmu1Yu0t50h1V5B,"[ParsedChoice[Revision](finish_reason='stop', ...",1757792741,gpt-4o-mini-2024-07-18,chat.completion,default,fp_560af6e559,"CompletionUsage(completion_tokens=117, prompt_...",117,576,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Quantification, Modula..."
4,chatcmpl-CFQTl0vx5Twe0yYh21nuHX4skQeXR,"[ParsedChoice[Revision](finish_reason='stop', ...",1757792741,gpt-4o-mini-2024-07-18,chat.completion,default,fp_560af6e559,"CompletionUsage(completion_tokens=384, prompt_...",384,573,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[eval_avoid_pronouns, eval_avoids_absolutes, e...","[Completeness, Realism, Abstraction, Quantific..."
