In [85]:
pip install ollama


Note: you may need to restart the kernel to use updated packages.


# Load input file

In [86]:
import os

# Human-readable title for every known project key. The "*_old" keys map to
# the same title as their current counterpart.
project_title_map = {
    "archives_space": "Archives Space Project",
    "archives_space_old": "Archives Space Project",
    "neurohub": "NeuroHub Project",
    "open_spending": "Open Spending Project",
    "open_spending_old": "Open Spending Project",
    "planning_poker": "Planning Poker Project",
    "recycling": "Recycling Project",
    "color_ide": "ColorIDE Project"
}
projects = ["archives_space", "neurohub", "open_spending", "planning_poker", "recycling", "color_ide"]

# The project is taken from the "project" environment variable when it is set.
selected_project = os.environ.get("project")
if selected_project is None:
    selected_project = projects[0] # if not running through the bash script, you can choose the project here

# Resolve the title before touching the file system so that a mistyped project
# key fails immediately with a message that lists the valid keys.
if selected_project not in project_title_map:
    raise KeyError(
        f"Unknown project {selected_project!r}; expected one of {sorted(project_title_map)}"
    )
project_title = project_title_map[selected_project]

file_name = "user_stories_{}.json".format(selected_project)
project_path = "./input_files/{}".format(file_name)
# Explicit UTF-8: without it the decoding depends on the platform's default
# locale encoding. The raw text (not the parsed JSON) is kept because it is
# pasted verbatim into the prompts.
with open(project_path, 'r', encoding='utf-8') as file:
    project_content = file.read()

# Load model and configure model parameters

In [87]:
import ollama
from ollama import Client

# Model tag sent to Ollama. Alternative tag kept for reference:
#   "llama3.1:8b-instruct-fp16"
model_name = model_version = "llama3.1:70b"

# Generation settings.
num_ctx = 20480  # context length is higher because of the refinement process
temperature = 0  # should be kept 0 for deterministic results, default value 0.8
top_k = 0  # NOTE: defined here but not part of `options` below

# Options dictionary handed to every chat call that uses `options`.
options = dict(num_ctx=num_ctx, temperature=temperature)

# Prompt-construction switches read by the following cells.
one_shot = False
add_pattern_descriptions = True

# Load the one-shot example (if enabled) and the pattern descriptions

In [88]:
import json
# Optional one-shot example: when `one_shot` is set, the stored example run is
# wrapped in a prompt fragment; otherwise the fragment is an empty string.
if one_shot:
    # Explicit UTF-8 so the file is decoded the same way on every platform.
    with open('example_reasoning_1.txt', 'r', encoding='utf-8') as file:
        example_run_text = file.read()
    # The indentation inside the literal is part of the prompt text and is kept as is.
    example_run_prompt = f"""
        I will give you an example run for another software project just to show you the reasoning process and the output format. 
        VERY IMPORTANT NOTE: "NEVER COPY THE REASONING GIVEN IN THE EXAMPLE RUN! This Example is for Reference Only, Come Up With Your Own Reasoning for The User Input!"
        - EXAMPLE RUN START - 

        {example_run_text}
        
        - EXAMPLE RUN END -

        """
else:
    example_run_prompt = ""

# Optional pattern descriptions: loaded verbatim when `add_pattern_descriptions`
# is set, otherwise an empty string.
if add_pattern_descriptions:
    with open('pattern_descriptions.txt', 'r', encoding='utf-8') as file:
        pattern_descriptions_text = file.read()
else:
    pattern_descriptions_text = ""
# Last expression of the cell: displays the loaded descriptions.
pattern_descriptions_text


'Architecture Pattern Descriptions:\n\nLayered Architecture: Layered Architecture organizes the system into a set of layers, where each layer has a distinct responsibility and communicates only with its adjacent layers. This separation of concerns simplifies maintenance and testing.\nEvent-Driven Architecture: Event-Driven Architecture centers on the production, detection, and consumption of events, enabling asynchronous communication among decoupled components.\nMicrokernel Architecture: Microkernel Architecture features a minimal core system that handles essential services, while additional functionalities are provided through plug-in modules.\nMicroservices Architecture: Microservices Architecture decomposes an application into a suite of small, independently deployable services, each responsible for a specific business function.\nSpace-Based Architecture: Space-Based Architecture uses shared memory spaces to support communication and coordination among distributed processes. It is 

# Get first assessment from model

In [89]:
import time

# Ollama client created with an empty set of extra headers.
client = Client(headers={})

# System prompt of the generator: task description, the seven patterns, the
# optional pattern descriptions, the score scale, the required JSON output
# format and the optional one-shot example.
generator_system_prompt = f"""
You are a software architect. Your task is to get a list of categorized user stories with a description, analyze them in detail and
assign a score for each architecture pattern depending on their relevance to the project and 
if it would prove useful in the implementation of the described project.
- Go over all of the user stories thoroughly, think step by step and explain your though process clearly. 
- Mention all specific user stories that helped you to make a decision.
- Be as objective as possible during your scoring. The decisions need to be deterministic and reproducible.

Here are the architecture patterns you will score:
*Layered Architecture 
*Event-Driven Architecture
*Microkernel Architecture 
*Microservices Architecture 
*Space-Based Architecture
*Pipeline Architecture
*Client-Server Architecture

{pattern_descriptions_text}

Here are the score options for the applicability of a pattern, ordered from lowest to highest:
- "Completely unsuitable"
- "Partially suitable"
- "Moderately applicable"
- "Well suited"
- "Perfectly aligned"

IMPORTANT: First explain your reasoning for each pattern, and then print the final scores in this json format:

{{
  "Layered Architecture": "score option",
  "Event-Driven Architecture": "score option",
  "Microkernel Architecture": "score option",
  "Microservices Architecture": "score option",
  "Space-Based Architecture": "score option",
  "Pipeline Architecture": "score option",
  "Client-Server Architecture": "score option"
}}

{example_run_prompt}

"""
generator_system_message = {'role': 'system', 'content': generator_system_prompt}

# User prompt: the project title and the raw categorized user stories.
generator_user_prompt = f"""
I will give you a list of categorized user stories and a description created for a software project titled {project_title}. Please analyze them in detail and give me the scoring for each architecture pattern.
The final scores must be given in json format after the detailed reasoning for each architecture pattern:

{{
  "Layered Architecture": "score option",
  "Event-Driven Architecture": "score option",
  "Microkernel Architecture": "score option",
  "Microservices Architecture": "score option",
  "Space-Based Architecture": "score option",
  "Pipeline Architecture": "score option",
  "Client-Server Architecture": "score option"
}}

Project Title: {project_title}

Categorized User Stories:

{project_content}

"""
generator_user_message = {'role': 'user', 'content': generator_user_prompt}

# Conversation so far: system prompt at index 0, user prompt at index 1.
messages = [generator_system_message, generator_user_message]

# Time the single generation call.
start_time = time.time()
response = client.chat(model=model_name, messages=messages, options=options)
end_time = time.time()
score_generation_duration = end_time - start_time

# The model's reply becomes index 2 of `messages`.
message = response['message']
messages.append(message)
print(message['content'])


After analyzing the user stories in detail, here's my reasoning for each architecture pattern:

**Layered Architecture**: This pattern is suitable for the Archives Space Project because it organizes the system into distinct layers, each with a specific responsibility. The project requires various components to work together seamlessly, such as data import/export, processing, and storage. A layered architecture would simplify maintenance and testing by separating concerns.

Score: 4 (Well suited)

**Event-Driven Architecture**: This pattern is partially suitable for the project because it involves asynchronous communication among decoupled components. The project requires handling large-scale data imports, which could be processed in stages using event-driven architecture. However, not all user stories explicitly require event-driven interactions.

Score: 2 (Partially suitable)

**Microkernel Architecture**: This pattern is not well suited for the project because it features a minimal c

# Self-refinement Process

In [90]:

import re

# Evaluator/refiner loop: an evaluator reviews the current answer against the
# user stories, and (unless it ends with NO_REFINEMENT) a refiner rewrites the
# answer using that feedback. At most MAX_REFINEMENT_ITERATIONS rounds are run.

# Let's capture the model's initial answer in a variable.
# It is read from `messages[2]` (the reply appended by the first-assessment
# cell, and the same entry the summary cell reads) rather than from `message`,
# because `message` is reassigned by the summary cell and would be stale if
# this cell is executed again afterwards.
current_answer = messages[2]['content']  # e.g. the text with reasoning + final scores

# Matches the two verdict markers as whole words. "REFINE" inside
# "NO_REFINEMENT" or "REFINEMENT" is not matched on its own.
_VERDICT_MARKER = re.compile(r"\b(NO_REFINEMENT|REFINE)\b")


def _evaluator_requests_no_refinement(feedback):
    """Return True when the last verdict marker in `feedback` is NO_REFINEMENT.

    The evaluator is told to END its response with exactly one marker, so the
    last marker is the verdict. A plain substring test would also stop the
    loop when the evaluator merely mentions NO_REFINEMENT in its explanation
    and then finishes with REFINE.

    Args:
        feedback (str): The evaluator's response text.

    Returns:
        bool: True if a marker is present and the final one is NO_REFINEMENT;
        False otherwise (including when no marker is present at all).
    """
    markers = _VERDICT_MARKER.findall(feedback)
    return bool(markers) and markers[-1] == "NO_REFINEMENT"


evaluator_context = [] # this context will be used in evaluator messages
eval_system_message = {
        "role": "system",
        "content": f"""You are the Evaluator. 
Your job: Go over the user stories thoroughly and evaluate the given reasoning and scores for each architecture pattern and provide score refinement suggestions if you are confident that the current score for any architecture pattern should be adjusted upward or downward.
Mention all specific user stories that are relevant to your refinement suggestions.

Here are the architecture patterns you will score:
*Layered Architecture 
*Event-Driven Architecture
*Microkernel Architecture 
*Microservices Architecture 
*Space-Based Architecture
*Pipeline Architecture
*Client-Server Architecture

{pattern_descriptions_text}

Here are the score options and their corresponding meaning:
1: "Completely unsuitable"
2: "Partially suitable"
3: "Moderately applicable"
4: "Well suited"
5: "Perfectly aligned"

- Do not assume any new information about the project. The description and the user stories for the project remain the same. Do not suggest refinements for possible future changes.
- Only suggest a refinement when you are absolutely certain that the assigned score is incorrect and needs adjustment. IMPORTANT: If the current score is already reasonable DO NOT make a refinement suggestion.
- Also, review your previous evaluations to ensure you don’t repeat the same refinement suggestion for an architecture pattern for the same reason.
- Be as objective as possible during your scoring. The decisions need to be deterministic and reproducible.

After your refinement suggestions, on a NEW line, end your response with exactly one of the two markers:
- REFINE (if refinements are needed)
- NO_REFINEMENT (if no refinements are needed)

Do not include any additional text after that marker.
Do not wrap it in quotes.

Important:
- Score can only be assigned one of the integer values that are given as the score options, no float values.
- If you provide refinement for any of the pattern scores, do not use NO_REFINEMENT.
Because this will stop the whole process and the current scores will be left unrefined.
- Use NO_REFINEMENT only if you have zero refinements to suggest.
"""
}
evaluator_context.append(eval_system_message)

refiner_context = [] # this context will be used in refiner messages

refine_system_message = {
    "role": "system",
    "content": f"""You are the Refiner. 
You will be given the user stories and the latest reasoning/scores for the project and the evaluator's feedback for the latest reasoning/scores.
Make changes to the current scores according to the provided refinement suggestions.
- Be as objective as possible during your scoring. The decisions need to be deterministic and reproducible.
- Do not assume any new information about the project. The description and the user stories for the project remain the same.
- IMPORTANT:  ONLY make a change for a score when the refinement suggestion for that score is reasonable and the assigned score must change. Otherwise keep it unchanged.

Here are the architecture patterns you will score:
*Layered Architecture 
*Event-Driven Architecture
*Microkernel Architecture 
*Microservices Architecture 
*Space-Based Architecture
*Pipeline Architecture
*Client-Server Architecture

{pattern_descriptions_text}

Here are the score options for the applicability of a pattern, ordered from lowest to highest:
- "Completely unsuitable"
- "Partially suitable"
- "Moderately applicable"
- "Well suited"
- "Perfectly aligned"

IMPORTANT: When returning the refined scores, keep the same format in the updated response (reasoning and then the updated scores in json format).

"""
    }
refiner_context.append(refine_system_message)
# We'll define how many refinement loops to allow:
MAX_REFINEMENT_ITERATIONS = 2

start_time = time.time()
refinement_iterations = MAX_REFINEMENT_ITERATIONS # if not set again in the loop, it means the model went through max iterations
for i in range(MAX_REFINEMENT_ITERATIONS):
    # 1) Evaluate the current answer
    
    evaluate_prompt = {
        "role": "user",
        "content": f"""
            Evaluate the current reasoning and scores by taking the original categorized user stories into account and make refinement suggestions for the current answer:

            Project Title: {project_title}

            Categorized User Stories:

            {project_content}

            Current Answer:
            {current_answer}

            Refinement Suggestions:
            """
    }
    # The evaluator keeps its whole history (prompts and replies) across iterations.
    evaluator_context.append(evaluate_prompt)
    eval_response = client.chat(model=model_name, messages=evaluator_context, options=options)
    eval_message = eval_response['message']
    eval_feedback = eval_message['content'].strip()
    evaluator_context.append(eval_message)
    
    print(f"\n=== Evaluator Feedback (Iteration {i+1}) ===\n{eval_feedback}\n")
    
    # 2) If the evaluator's final marker says no changes are needed, break out
    if _evaluator_requests_no_refinement(eval_feedback):
        print("Evaluator says no changes are needed. Stopping refinement.")
        # `i` equals the number of refinement rounds completed before this one.
        refinement_iterations = i
        break
    
    # 3) Otherwise, refine
    refine_prompt = {
        "role": "user",
        "content": f"""
            Analyze the previous answer and the feedback of the evaluator, and then refine your previous answer by taking the feedback of the evaluator into account. 
            Keep the same format (reasoning and then the final scores in json format).
            
            Project Title: {project_title}

            Categorized User Stories:

            {project_content}
            
            Previous Answer:
            {current_answer}

            Evaluator Feedback:
            {eval_feedback}

            Revised Answer:
            """
    }
    refiner_context.append(refine_prompt)
    
    refine_response = client.chat(model=model_name, messages=refiner_context, options=options)
    refine_message = refine_response['message']
    revised_answer = refine_message['content'].strip()
    refiner_context.append(refine_message)
    
    print(f"=== Refined Answer (Iteration {i+1}) ===\n{revised_answer}\n")
    
    # Update current_answer for potential next iteration
    current_answer = revised_answer

end_time = time.time()
refinement_duration = end_time - start_time
# After the loop, current_answer holds the final refined output:
print("=== Final Refined Scores & Reasoning ===\n", current_answer)


=== Evaluator Feedback (Iteration 1) ===
After re-evaluating the user stories, I suggest the following refinements:

* **Event-Driven Architecture**: Upon closer inspection, I realize that several user stories (e.g., "As an Archivist, I want to import Accessions data in CSV.", "As an Archivist, I want to export agent records as EAC-CPF.") imply asynchronous communication among decoupled components. This pattern is more suitable than initially thought.

Score refinement: 3 (Moderately applicable)

* **Microkernel Architecture**: While still not the best fit, some user stories (e.g., "As a Developer, I want to configure processing stages through external settings...") suggest a need for modularity and extensibility. Microkernel architecture might be more suitable than initially thought.

Score refinement: 2 (Partially suitable)

* **Space-Based Architecture**: Upon re-evaluation, I realize that some user stories (e.g., "As an Archivist, I want large-scale data imports to be handled in s

# Summarize the evaluation result

In [91]:
# Ask the model for a compact before/after table of the scores.
# `messages[2]` is the third entry of the generator conversation (the reply
# appended after the system and user prompts); `current_answer` is whatever the
# refinement loop left as the latest answer.
first_assessment = messages[2]["content"].strip()
last_assessment = current_answer.strip()

# Plain (non-f) string: the braces below are literal JSON braces.
summary_system_message = {'role': 'system', 'content': """
Ok now the evaluation process is finished. take the first assessment and the last assessment from the user. 
And just return the final scores before and after the evaluation-refinement process
following this json format below:

{
  "Layered Architecture": {"before": "score option", "after": "score option"},
  "Event-Driven Architecture": {"before": "score option", "after": "score option"},
  "Microkernel Architecture": {"before": "score option", "after": "score option"},
  "Microservices Architecture": {"before": "score option", "after": "score option"},
  "Space-Based Architecture": {"before": "score option", "after": "score option"},
  "Pipeline Architecture": {"before": "score option", "after": "score option"},
  "Client-Server Architecture": {"before": "score option", "after": "score option"}
}
"""}

# f-string: doubled braces render as single literal braces.
summary_user_message = {'role': 'user', 'content': f"""
read the first assessment and the last assessment, and return the final scores before and after the evaluation-refinement process
following this json format below:

{{
  "Layered Architecture": {{"before": "score option", "after": "score option"}},
  "Event-Driven Architecture": {{"before": "score option", "after": "score option"}},
  "Microkernel Architecture": {{"before": "score option", "after": "score option"}},
  "Microservices Architecture": {{"before": "score option", "after": "score option"}},
  "Space-Based Architecture": {{"before": "score option", "after": "score option"}},
  "Pipeline Architecture": {{"before": "score option", "after": "score option"}},
  "Client-Server Architecture": {{"before": "score option", "after": "score option"}}
}}


first assessment:
{first_assessment}

last assessment:
{last_assessment}
"""}

# The summary request is appended to the refiner's conversation.
refiner_context.extend([summary_system_message, summary_user_message])

summary_options = {"num_ctx": num_ctx, "temperature": 0} # need deterministic answer
response = client.chat(model=model_name, messages=refiner_context, options=summary_options)
message = response['message']
# The summary reply becomes the last entry of `refiner_context`.
refiner_context.append(message)
print(message["content"])


Here are the final scores before and after the evaluation-refinement process in JSON format:

```
{
  "Layered Architecture": {"before": 4, "after": 4},
  "Event-Driven Architecture": {"before": 2, "after": 4},
  "Microkernel Architecture": {"before": 1, "after": 3},
  "Microservices Architecture": {"before": 4, "after": 4},
  "Space-Based Architecture": {"before": 1, "after": 3},
  "Pipeline Architecture": {"before": 4, "after": 4},
  "Client-Server Architecture": {"before": 2, "after": 4}
}
```

Note that the scores have changed for Event-Driven Architecture, Microkernel Architecture, Space-Based Architecture, and Client-Server Architecture after considering the evaluator's feedback.


In [92]:
import json

def parse_json_in_string(s):
    """
    Parses and returns the first decodable JSON object found in the input string.

    Every '{' in the string is tried, in order, as the possible start of a JSON
    object. This lets the function skip over braces that appear in free text
    before the actual JSON (for example "scores {see below}: {...}").

    Args:
        s (str): The input string that may contain a JSON object, possibly
            surrounded by other text.

    Returns:
        object: The Python representation of the first JSON object that decodes
        successfully, or None when `s` is not a string or contains no decodable
        JSON object. No exception is raised for undecodable input.
    """
    if not isinstance(s, str):
        return None

    decoder = json.JSONDecoder()
    start = s.find('{')
    while start != -1:
        try:
            # raw_decode tolerates trailing text after the JSON value.
            obj, _ = decoder.raw_decode(s, idx=start)
            return obj
        except json.JSONDecodeError:
            # Not valid JSON from this brace; move on to the next candidate.
            start = s.find('{', start + 1)
    return None

# Calculate weighted Cohen's kappa scores against the expert answers

In [None]:
from sklearn.metrics import cohen_kappa_score
# Compare the LLM's scores (before and after refinement) with the expert
# scores for the selected project using quadratically weighted Cohen's kappa.

# Patterns in the order used to build the score vectors below.
architecture_patterns = ["Layered Architecture", "Event-Driven Architecture",
 "Microkernel Architecture", "Microservices Architecture", "Space-Based Architecture", "Pipeline Architecture", "Client-Server Architecture"]

# Entries that are not plain dicts are converted with .dict() so every entry
# can be indexed the same way (presumably these are the client's response
# message objects - verify against the installed ollama version).
refiner_context = [message if isinstance(message, dict) else message.dict() for message in refiner_context]

scores = {"Completely unsuitable": 1, "Partially suitable": 2, "Moderately applicable": 3, "Well suited": 4, "Perfectly aligned": 5}


def _to_numeric_score(raw_score):
    """Map a score reported by the LLM onto the 1-5 integer scale.

    The summary prompt asks for the textual score options, but the model may
    answer with the numeric values instead (e.g. {"before": 4, "after": 4}),
    so labels, integers and digit strings are all accepted.

    Args:
        raw_score: A key of `scores`, an integer 1-5, or a string holding such
            an integer.

    Returns:
        int: The numeric score between 1 and 5.

    Raises:
        ValueError: If the value cannot be mapped onto the scale.
    """
    valid_values = set(scores.values())
    # bool is a subclass of int; it is never a legitimate score.
    if isinstance(raw_score, int) and not isinstance(raw_score, bool):
        if raw_score in valid_values:
            return raw_score
    elif isinstance(raw_score, str):
        text = raw_score.strip()
        if text in scores:
            return scores[text]
        if text.isdigit() and int(text) in valid_values:
            return int(text)
    raise ValueError(
        f"Unrecognised score {raw_score!r}; expected one of {list(scores)} or an integer from 1 to 5"
    )


# Explicit UTF-8 so the JSON file is decoded the same way on every platform.
with open('expert_answers.json', encoding='utf-8') as f:
    expert_answers = json.load(f)

# The last entry of the refiner conversation is expected to hold the
# before/after summary produced by the summary cell.
llm_answers = parse_json_in_string(refiner_context[-1]["content"])
if llm_answers is None:
    raise ValueError("No JSON score summary could be parsed from the last refiner message")

answers_before = [_to_numeric_score(llm_answers[pattern]["before"]) for pattern in architecture_patterns]
answers_after = [_to_numeric_score(llm_answers[pattern]["after"]) for pattern in architecture_patterns]
# `labels` fixes the full 1-5 scale so the weighting does not depend on which
# scores happen to occur in this particular run.
score_before_refinement = cohen_kappa_score(expert_answers[selected_project], answers_before, labels=[1, 2, 3, 4, 5], weights='quadratic')
score_after_refinement = cohen_kappa_score(expert_answers[selected_project], answers_after, labels=[1, 2, 3, 4, 5], weights='quadratic')

print(f"Before: {score_before_refinement}, After: {score_after_refinement}")
print(f"Before Scores: {answers_before}, After Scores: {answers_after}")


# Print log to output file

In [93]:
from datetime import datetime
import json
from pathlib import Path
# Both log directories are created up front, whichever one this run writes to.
one_shot_log_dir = "./logs/self-refinement-one-shot"
zero_shot_log_dir = "./logs/self-refinement-zero-shot"
for log_dir in (one_shot_log_dir, zero_shot_log_dir):
    Path(log_dir).mkdir(parents=True, exist_ok=True)

# Values derived once and reused in the record and in the file name.
run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
parsed_summary = parse_json_in_string(refiner_context[-1]["content"])
conversation_texts = [entry["content"] for entry in refiner_context]

# 1) Collect the run configuration, scores, timings and the refiner conversation
final_data = {
    "modelName": model_name,
    "temperature": temperature,
    "context_limit": num_ctx,
    "options": options,
    "projectTitle": project_title,
    "file_name": file_name,
    "selfRefinement": True,
    "oneShot": one_shot,
    "wckScoreBeforeRefinement": score_before_refinement,
    "wckScoreAfterRefinement": score_after_refinement,
    "patternDescriptionsAdded": add_pattern_descriptions,
    "timestamp": run_timestamp,
    "json_output_parsed": parsed_summary,
    "numberOfIterations": refinement_iterations,
    "maxIterationsAllowed": MAX_REFINEMENT_ITERATIONS,
    "messages": conversation_texts,
    "scoreGenerationDuration": score_generation_duration,
    "refinementDuration": refinement_duration,
}

# 2) Generate a filename based on model name and current timestamp
target_dir = one_shot_log_dir if one_shot else zero_shot_log_dir
filename = f"{target_dir}/log_{model_version}_{final_data['timestamp']}.json"

# 3) Write the record to a JSON file
with open(filename, "w", encoding="utf-8") as f:
    json.dump(final_data, f, indent=2, ensure_ascii=False)

print(f"Assesment complete. The whole conversation is saved to {filename}")

Assesment complete. The whole conversation is saved to ./logs/self-refinement-zero-shot/log_llama3.1:70b_20250204_194642.json
