In [1]:
import os
import re
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def openDoc(file):
    ''' Parse an HTML file into a BeautifulSoup tree.

    The file is read as UTF-8 inside a ``with`` block, so the handle is
    closed even when reading or decoding raises.

    @param file: file_path, str
    @return soup: document parsed with the 'lxml' parser, bs4.BeautifulSoup
    '''
    with open(file, 'r', encoding='utf-8') as htmldoc:
        htmlhandle = htmldoc.read()
    return BeautifulSoup(htmlhandle, 'lxml')

In [3]:
def splitContent(case_content):
    ''' Break a use-case text into its bullet items.

    Newlines and non-breaking spaces (U+00A0) are removed first; the
    remaining text is then cut at every U+0095 bullet character that is
    followed by at least one whitespace character.

    @param case_content: use case contents, str
    @return desc_list: text before the first bullet, then one entry per bullet, list
    '''
    strip_pattern = re.compile(r'[\n\xa0]')
    bullet_pattern = re.compile(r'\x95\s+')
    return bullet_pattern.split(strip_pattern.sub('', case_content))

In [4]:
def getUseCase(page_code, use_case):
    ''' Parse one use-case element into a header row and its child rows.

    The element text is cut at the first '#': the part before it is the
    use-case body, the part after it is the note.  The body is split into
    bullet items by splitContent(); the first item is the header (split on
    whitespace) and every following item is a child use case written as
    "<number>: <description>".

    @param page_code: page code, str
    @param use_case: use case, bs4.element.Tag (only get_text() is used)
    @return case_desc: [page_code, *header tokens, note], list
    @return child_case: rows of [page_code, case_code, number, description], list
    '''
    case_content = use_case.get_text()
    # partition() keeps the whole note even if the note itself contains '#';
    # when there is no '#', the note is the empty string.
    body, _, raw_note = case_content.partition('#')
    case_note = re.sub(r'[\n\xa0]', '', raw_note)
    desc_list = splitContent(body)

    case_desc = [page_code]
    case_desc.extend(re.split(r'\s+', desc_list[0]))
    case_desc.append(case_note)
    # re.split() always yields at least one token, so index 1 exists.
    case_code = case_desc[1]

    child_case = []
    for desc in desc_list[1:]:
        # maxsplit=1: a description may itself contain ': ' and must not be
        # truncated at the second separator.
        parts = re.split(r':\s+', desc, maxsplit=1)
        num = parts[0]
        # An item without a separator has no description; keep the row
        # instead of raising IndexError.
        name = parts[1] if len(parts) > 1 else ''
        child_case.append([page_code, case_code, num, name])

    return case_desc, child_case

In [5]:
def getPageCase(soup):
    ''' Collect the page overview and all use cases found in one page.

    Nothing is returned; rows are appended to the module-level lists
    page_list, case_list and child_case_list, which must exist before this
    function is called.  A document without a <div id="base"> is skipped.

    @param soup: beautiful soup, bs4.BeautifulSoup
    '''
    base = soup.find('div', id='base')
    if base is None:
        # No annotated content container in this document.
        return

    # Falls back to an empty code when the page has no overview block, so
    # its use cases are still collected instead of raising
    # UnboundLocalError.
    page_code = ''
    overview = base.find_all('div', attrs={'data-label': '页面说明'})
    if overview:
        overview_desc = splitContent(overview[0].get_text())
        page = re.split(r'\s+', overview_desc[0])
        page.extend(overview_desc[1:])
        page_code = page[0]
        page_list.append(page)

    # (data-label value, tag appended to each case row), in output order.
    case_groups = (
        ('普通用例', 'normal'),
        ('异常用例', 'abnormal'),
        ('数据逻辑', 'logic'),
    )
    for label, case_type in case_groups:
        for case in base.find_all('div', attrs={'data-label': label}):
            case_desc, child_case = getUseCase(page_code, case)
            case_desc.append(case_type)
            case_list.append(case_desc)
            child_case_list.extend(child_case)

In [6]:
file_list = []
# Generated pages that carry no use-case annotations and must be skipped.
ignore_list = ['index.html', 'page1.html', 'page2.html', 'page3.html', 'start.html', 'start_c_1.html', 'start_g_0.html']

# NOTE: machine-specific absolute path; adjust before running elsewhere.
file_dir = 'F:/profile/iSpider/practica/03_Axure/demo/'
file_type = '.html'
# sorted(): os.listdir() returns entries in arbitrary order, which would make
# the order of the collected rows differ between runs.
for file in sorted(os.listdir(file_dir)):
    # endswith() rather than a substring test, so names such as
    # 'page.html.bak' are not picked up.
    if file.endswith(file_type) and file not in ignore_list:
        file_list.append(os.path.join(file_dir, file))

In [7]:
# Accumulators that getPageCase() appends to; re-run this cell to reset them
# before parsing the pages again.
page_list, case_list, child_case_list = [], [], []

In [8]:
# Parse every collected page; getPageCase() appends its rows to page_list,
# case_list and child_case_list.
for file in file_list:
    getPageCase(openDoc(file))

In [9]:
# One row per page: the whitespace-split tokens of the overview header,
# followed by the overview's bullet items.
page_list

[['A-0', '页面名称', '0', '用户场景：进入该页面的场景', '功能描述：页面的功能', '补充说明：需求的补充说明']]

In [10]:
# One row per use case: page code, whitespace-split header tokens, note text,
# then the case type ('normal', 'abnormal' or 'logic').
case_list

[['A-0', 'N1', '普通用例名称', '0', '', 'normal'],
 ['A-0', 'N2', '普通用例名称', '0', ' 这里是备注', 'normal'],
 ['A-0', 'E1', '异常用例名称', '0', '', 'abnormal'],
 ['A-0', 'D1', '数据逻辑', '0', '', 'logic']]

In [11]:
# One row per child use case: page code, parent case code (first header
# token), child number, description.
child_case_list

[['A-0', 'N1', 'N1-1', '普通子用例说明'],
 ['A-0', 'N1', 'N1-2', '普通子用例说明，包括：- 这里是具体的说明- 这里是具体的说明'],
 ['A-0', 'N1', 'N1-3', '普通子用例说明'],
 ['A-0', 'N2', 'N2-1', '普通子用例说明'],
 ['A-0', 'N2', 'N2-2', '普通子用例说明，包括：- 这里是具体的说明- 这里是具体的说明'],
 ['A-0', 'N2', 'N2-3', '普通子用例说明，普通子用例说明，普通子用例说明'],
 ['A-0', 'E1', 'E1-1', '异常子用例说明'],
 ['A-0', 'E1', 'E1-2', '异常子用例说明，包括：- 这里是具体的说明- 这里是具体的说明'],
 ['A-0', 'D1', 'D1-1', '排序规则'],
 ['A-0', 'D1', 'D1-2', '状态变更规则'],
 ['A-0', 'D1', 'D1-3', '字段说明，包括：- 这里是具体说明- 这里是具体说明']]