In [48]:
import re

def get_prev_captured_length(matches, current_idx):
   captured_length = 0
   for _ in matches[:current_idx]:
      captured_length += 4
   return captured_length

def capture_labels(text):
   pattern = re.compile(r"(?<!\\)<%(.*?)(?<!\\)%>", re.DOTALL)
   matches = pattern.finditer(text)
   matches = [item for item in matches]
   labels = []
   for idx, match in enumerate(matches):
      labels.append({
         'start': match.start() + 2,
         'label_start': match.start() - get_prev_captured_length(matches, idx),
         'text': match.group(1)
      })
   return labels

def clean_labels(text):
   pattern = re.compile(r"(?<!\\)<%(.*?)(?<!\\)%>", re.DOTALL)
   result = re.sub(pattern, lambda x: x.group(0)[2:-2], text)
   return result

In [49]:
a = """a <%abc%>1 <%cd%> 2 <%f%>3 <%%>4"""
b = clean_labels(a)
labels = capture_labels(a)
         
print(f"{b=}")         

for label in labels:
   text_length = len(label['text'])
   start = label['label_start']
   s = b[start:start+text_length]      
   print(f"{s=}")

b='a abc1 cd 2 f3 4'
s='abc'
s='cd'
s='f'
s=''


In [50]:
labels = capture_labels("""Code Review Short Checklist
<%For Reviewee:
* Engage in discussion with the reviewer regarding the code and requirements.
* Take notes of feedback and improvement suggestions.
* Address identified improvements before the next review iteration.
For Reviewer:
* Offer feedback on strengths and weaknesses of the code.
* Give specific suggestions for improvement and guide on addressing issues.
* Ensure that feedback helps enhance code quality and align with standards.%>""")
labels

[{'start': 30,
  'label_start': 28,
  'text': 'For Reviewee:\n* Engage in discussion with the reviewer regarding the code and requirements.\n* Take notes of feedback and improvement suggestions.\n* Address identified improvements before the next review iteration.\nFor Reviewer:\n* Offer feedback on strengths and weaknesses of the code.\n* Give specific suggestions for improvement and guide on addressing issues.\n* Ensure that feedback helps enhance code quality and align with standards.'}]

In [51]:
def extractive_answers(text):
    labels = capture_labels(text)
    answers = [{ "text": label['text'], "answer_start": label['label_start'] } for label in labels]
    return answers

def create_qas(question, context):
    answers = extractive_answers(context)
    return {
       "question": question, 
       "answers": answers
    }
    
def create_full_qas(context, qas_list):
    return {
        "context": context,
        "qas": qas_list
    }
   
def create_qas_from_file(qas_file):
    question_pattern = re.compile(r"Question:(.*)")
    with open(qas_file, 'r', encoding='utf-8') as f:
        context = ""
        for idx, line in enumerate(f):
            if idx == 0:
                match = re.match(question_pattern, line)
                if not match:
                    return None
                question = match.group(1).lstrip()
            else:
                context += line
    qas = create_qas(question, context)
    return qas
                

In [52]:
qas = create_qas_from_file('./documents/qas.md')
qas

Code Review Short Checklist
<%For Reviewee:
* Engage in discussion with the reviewer regarding the code and requirements.
* Take notes of feedback and improvement suggestions.
* Address identified improvements before the next review iteration.
For Reviewer:
* Offer feedback on strengths and weaknesses of the code.
* Give specific suggestions for improvement and guide on addressing issues.
* Ensure that feedback helps enhance code quality and align with standards.%>


Reviewee Pre-Review Preparation: 
* Are all preconditions and prerequisites met for the review?
* Do reviewer have access to the necessary documentation and requirements?


Reviewer Understanding Requirements:
* Does the solution align with the defined requirements?
* Have Reviewee identified the target audience and their specific needs?
* Have security requirements and permission control been confirmed?
Code Quality and Readability:
* Is the code readable and understand without excessive comments?
* Are coding standards a

{'question': 'What is the checklist for a code review?',
 'answers': [{'text': 'For Reviewee:\n* Engage in discussion with the reviewer regarding the code and requirements.\n* Take notes of feedback and improvement suggestions.\n* Address identified improvements before the next review iteration.\nFor Reviewer:\n* Offer feedback on strengths and weaknesses of the code.\n* Give specific suggestions for improvement and guide on addressing issues.\n* Ensure that feedback helps enhance code quality and align with standards.',
   'answer_start': 28}]}

In [4]:
a = "Some text <%matched1%> with <%matched2%> matches."
pattern = r"<%.*?%>"  # 这个模式会匹配 "<%...%>"

result = re.sub(pattern, lambda x: x.group(0)[2:-3], a)
print(result)

Some text matched with matched matches.
