In [13]:
from bs4 import BeautifulSoup
import pandas as pd
import re

# Open the file
# The source document is read as cp949-encoded text in one go.
with open('TEST.htm', 'r', encoding='cp949') as file:
    contents = file.read()

# Create the BeautifulSoup object
# Python's built-in 'html.parser' backend is used to parse the markup.
soup = BeautifulSoup(contents, 'html.parser')

In [14]:
# Collect every <table> element of the parsed document.
tables = soup.find_all('table')

In [15]:
# Mapping reserved for question ids; nothing in the visible cells reads or fills it.
qids = {}

# Items
# Line-break markup inserted into generated titles and info pages.
br = '<br/>'
# First CSS class of tables that are turned into <html> info pages / <note> tags.
html_class = 'MsoTableGrid'
# First CSS class of tables that are treated as questions.
question_class = 'MsoTableGridLight'

# [...] type markers that select a single-answer (<radio>) question.
radio = ['단수', '단수응답', '단수문항', 'SA', 'sa']
# [...] type markers that select a multiple-answer (<checkbox>) question.
checkbox = ['복수', '복수응답', '복수문항', 'MA', 'ma']

In [28]:
def suspend() :
    """Return the ``<suspend/>`` element wrapped in newlines.

    The markup is preceded by one newline and followed by a blank line.
    """
    return '\n<suspend/>\n\n'

def create_html_tag(label, txt) :
    """Build an ``<html>`` element whose body is a ``comment-box`` div.

    Args:
        label: Value written to the element's ``label`` attribute.
        txt: Content placed inside the ``<div class="comment-box">``.

    Returns:
        The element markup followed by the output of ``suspend()``.
    """
    pieces = [
        f'<html label="{label}">',
        '<div class="comment-box">',
        f'{txt}',
        '</div>',
        '</html>',
        f'{suspend()}',
    ]
    # One piece per output line; the last piece brings its own newlines.
    return '\n'.join(pieces)


def create_note_tag(txt) :
    """Wrap ``txt`` in a ``<note>`` element.

    The element is preceded by one newline and followed by a blank line.
    """
    element = f'<note>{txt}</note>'
    return '\n' + element + '\n\n'


def create_radio(label, txt, attrs) :
    """Build a ``<radio>`` (single-answer) question element.

    Args:
        label: Question label; written as-is to the ``label`` attribute.  The
            name shown in the title is the same text with every ``x``
            replaced by ``-``.
        txt: Title text placed after the ``q-name`` div.
        attrs: Markup inserted between ``<comment>`` and the closing tag
            (the row elements).

    Returns:
        The element markup, starting with a newline and followed by the
        output of ``suspend()``.
    """
    shown_name = label.replace('x', '-')

    # The leading empty string yields the newline that opens the block.
    pieces = [
        '',
        '<radio',
        f'label="{label}">',
        f'<title><div class="q-name">{shown_name}</div> {txt}</title>',
        '<comment></comment>',
        f'{attrs}',
        '</radio>',
        f'{suspend()}',
    ]
    return '\n'.join(pieces)


def create_checkbox(label, txt, attrs) :
    """Build a ``<checkbox>`` (multiple-answer) question element.

    The element always carries ``atleast="1"``.

    Args:
        label: Question label; written as-is to the ``label`` attribute.  The
            name shown in the title is the same text with every ``x``
            replaced by ``-``.
        txt: Title text placed after the ``q-name`` div.
        attrs: Markup inserted between ``<comment>`` and the closing tag
            (the row elements).

    Returns:
        The element markup, starting with a newline and followed by the
        output of ``suspend()``.
    """
    shown_name = label.replace('x', '-')

    # The leading empty string yields the newline that opens the block.
    pieces = [
        '',
        '<checkbox',
        f'label="{label}"',
        'atleast="1">',
        f'<title><div class="q-name">{shown_name}</div> {txt}</title>',
        '<comment></comment>',
        f'{attrs}',
        '</checkbox>',
        f'{suspend()}',
    ]
    return '\n'.join(pieces)


def create_rows(txt, code, default_label='r') :
    """Build one ``<row>`` element terminated by a newline.

    A row is marked open-ended (``open="1" openSize="25" randomize="0"``)
    when its text, compared without spaces and case-insensitively, contains
    '구체적으로' or 'specify'.

    Args:
        txt: Row text.  A trailing ``:`` + whitespace + ``)`` is tightened
            to ``:)`` before it is written out.
        code: Row code; used for ``value`` and appended to the label prefix.
        default_label: Prefix of the ``label`` attribute.

    Returns:
        The ``<row>`` markup as a string.
    """
    # Keyword detection works on a space-free, lower-cased copy of the text.
    squeezed = txt.replace(' ', '').lower()
    is_open_ended = any(
        keyword.lower() in squeezed for keyword in ('구체적으로', 'Specify')
    )

    open_attrs = ''
    if is_open_ended :
        open_attrs = ' open="1" openSize="25" randomize="0"'

    txt = re.sub(r'(:\s*\))', ':)', txt)

    return f'<row label="{default_label}{code}" value="{code}"{open_attrs}>{txt}</row>\n'

In [32]:
def question_title_type(soup) :
    """Read the question id, title and answer type from a question table.

    Only the first ``<tr>`` is inspected: its first ``<td>`` holds the
    question id and its second ``<td>`` holds the title, which may carry
    ``[...]`` markers.  The text of the last marker is taken as the answer
    type; all markers are removed from the returned title.

    Args:
        soup: BeautifulSoup tag of the question table.

    Returns:
        dict with the keys

        * ``'qname'`` - id with ``-``, ``_`` and ``X`` replaced by ``x``,
          or None when the cell is blank.
        * ``'title'`` - title with bold fragments wrapped in
          ``<span class="f-highlight">`` and ``<br/>`` inserted after every
          ``?`` and ``.``, or None when nothing is left.
        * ``'type'`` - ``'sa'`` / ``'ma'`` when the marker is listed in the
          module-level ``radio`` / ``checkbox`` lists, otherwise the marker
          text without spaces; None when the title has no marker or the
          marker is empty.

    Raises:
        IndexError: if the table has no ``<tr>`` or its first row has fewer
            than two ``<td>`` cells.
    """
    trs = soup.find_all('tr')
    heads = trs[0].find_all('td')

    # Question ID: '-', '_' and 'X' are all normalised to 'x'.
    qname = heads[0].get_text().strip()
    for sep in ('-', '_', 'X') :
        qname = qname.replace(sep, 'x')

    if qname == '' :
        qname = None

    # Question Title and Type
    title_cell = heads[1]

    # Bold fragments are joined with '/', stripped of [...] markers and split
    # again, so a marker that is itself bold does not get highlighted.
    bolds = '/'.join(b.get_text() for b in title_cell.find_all('b'))
    clean_bolds = re.sub(r'\[.*?\]', '', bolds)
    bold_txt = [c for c in clean_bolds.split('/') if c != '']

    title = title_cell.get_text().strip()
    title = title.replace('\n', '')

    # Type: the last [...] marker.  A title without any marker yields None
    # instead of raising IndexError on the empty match list.
    markers = re.findall(r'\[.*?\]', title)
    if markers :
        qtype = markers[-1][1:-1]
        qtype = qtype.strip().replace(' ', '').replace('\n', '')
    else :
        qtype = None

    if qtype in checkbox :
        qtype = 'ma'

    if qtype in radio :
        qtype = 'sa'

    if qtype == '' :
        qtype = None

    title = re.sub(r'\[.*?\]', '', title)
    for b in bold_txt :
        title = title.replace(b, f'<span class="f-highlight">{b}</span>')

    # Title: break after every '?' and '.', then trim each resulting piece.
    title = title.strip()
    title = title.replace('?', f'?{br}')
    title = title.replace('.', f'.{br}')
    title = br.join([t.strip() for t in title.split(br)])

    if title == '' :
        title = None

    return {
        'qname' : qname,
        'title' : title,
        'type' : qtype
    }


def make_simple_rows(soup) :
    """Build the ``<row>`` markup for every answer row of a question table.

    The first ``<tr>`` (the question header) is skipped.  Each remaining
    row is unpacked as four ``<td>`` cells: spacer, text, code and route.
    The route cell is currently ignored.

    Args:
        soup: BeautifulSoup tag of the question table.

    Returns:
        The concatenated output of ``create_rows`` for all rows that have a
        non-blank text cell; an empty string when there are none.

    Raises:
        ValueError: if a row does not have exactly four ``<td>`` cells.
    """
    rows = []
    for tr in soup.find_all('tr')[1:] :
        # Blank cells are represented as None so they can be told apart.
        tds = [None if td.get_text(strip=True) == '' else td for td in tr.find_all('td')]
        _empty, txt_cell, code_cell, _route = tds

        # A row without text has nothing to render; skip it rather than
        # failing on the None placeholder.
        if txt_cell is None :
            continue

        # row text
        # Bold fragments are joined with '/', stripped of [...] markers and
        # split again before being re-wrapped in <b>.
        bolds = '/'.join(b.get_text() for b in txt_cell.find_all('b'))
        clean_bolds = re.sub(r'\[.*?\]', '', bolds)
        bold_txt = [c for c in clean_bolds.split('/') if c != '']

        txt = txt_cell.get_text().strip()
        txt = txt.replace('\n', '')
        txt = re.sub(r'\[.*?\]', '', txt)
        for b in bold_txt :
            txt = txt.replace(b, f'<b>{b}</b>')
        txt = txt.strip()

        # row code
        # The None check has to happen before get_text(); a blank or
        # non-numeric code cell gives code = None.
        code = code_cell.get_text(strip=True) if code_cell is not None else None
        code = int(code) if code is not None and code.isdigit() else None

        rows.append(create_rows(txt, code))

    return ''.join(rows)

In [33]:
# Walk every table and collect the generated XML fragments in document order.
xml_txt = []
html_page_cnt = 1
for tb in tables :
    # A table without a class attribute (or with an empty one) matches
    # neither branch below and is skipped instead of raising KeyError.
    tb_classes = tb.get('class') or []
    class_name = tb_classes[0] if tb_classes else None

    # html / note tag
    if class_name == html_class :
        tx = tb.get_text().strip()

        pro = re.findall(r'\[.*?\]', tx)
        chk = re.sub(r'\[.*?\]', '', tx)
        if pro :
            # note tag : Logic Guide (only the first [...] marker is used)
            note = create_note_tag(pro[0][1:-1])
            xml_txt.append(note)

        # Emit an info page only when real text remains once the [...]
        # markers are removed; leftover whitespace does not count.
        if chk.strip() != '' :
            tx = tx.replace('[', '(')
            tx = tx.replace(']', ')')
            tx = tx.replace('?', f'?{br}\n')
            tx = tx.replace('.', f'.{br}\n')
            tx = [t.strip().replace('\n', '') for t in tx.split(br)]
            tx = f'{br}'.join(tx)
            tx = tx.replace(f'{br}{br}', f'{br}\n')
            tx = tx.replace('  ', ' ')
            tx = tx.replace(br, f'{br}\n')

            # html tag
            label = f'info_page_{html_page_cnt}'
            xml_txt.append(create_html_tag(label, tx))

            html_page_cnt += 1

    # Questions
    if class_name == question_class :
        # Grid check
        trs = tb.find_all('tr')
        # A table with no rows has no header cells and is skipped.
        heads = trs[0].find_all('td') if trs else []

        # Simple Question
        if len(heads) > 2 :
            question_info = question_title_type(tb)

            qname = question_info['qname']
            title = question_info['title']
            # Named qtype so the builtin type() stays usable in later cells.
            qtype = question_info['type']

            rows = make_simple_rows(tb)

            # Single Answer
            if qtype == 'sa' :
                xml_txt.append(create_radio(qname, title, rows))

            # Multiple Answer
            elif qtype == 'ma' :
                xml_txt.append(create_checkbox(qname, title, rows))

In [34]:
# Write all collected fragments to test.xml.
# The encoding is fixed to UTF-8: the output has no XML encoding declaration
# and may contain non-ASCII text, so the platform default is not relied on.
with open('test.xml', 'w', encoding='utf-8') as f :
    f.write(''.join(xml_txt))