In [1]:
import os
import re
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup

In [37]:
def openDoc(file):
    ''' Open an HTML document and parse it into a BeautifulSoup tree.

    The file is read as UTF-8 inside a ``with`` block, so the handle is
    closed even when reading or parsing raises.

    @param file: file_path, str
    @return soup: beautiful soup, bs4.BeautifulSoup
    '''
    with open(file, 'r', encoding='utf-8') as htmldoc:
        # Parse while the handle is still open, using the lxml parser.
        soup = BeautifulSoup(htmldoc.read(), 'lxml')
    return soup

In [5]:
def splitContent(use_case):
    ''' Break the text of a use-case element into its description pieces.

    Newlines and non-breaking spaces are stripped from the element text, then
    the text is cut wherever a \\x95 character followed by whitespace occurs.

    @param use_case: use case, bs4.element.Tag
    @return desc_list: use case description, list
    '''
    noise = re.compile(r'[\n\xa0]')
    bullet = re.compile(r'\x95\s+')

    flattened = noise.sub('', use_case.get_text())
    return bullet.split(flattened)

In [6]:
def getUseCase(use_case):
    ''' Parse one use-case element into a header row and its child rows.

    The element text is split with splitContent(). The first piece is the
    header, split on whitespace into fields; every following piece is a child
    entry of the form "<num>: <name>".

    NOTE: reads the module-level variable ``page_code``; it must be assigned
    before this function is called.

    @param use_case: use case, bs4.element.Tag
    @return case_info: use case info, list ([page_code, header fields...])
    @return child_case: child use case, list of [page_code, case_code, num, name]
    '''
    desc_list = splitContent(use_case)

    case_info = [page_code]
    case_info.extend(re.split(r'\s+', desc_list[0]))
    # re.split always yields at least one item, so index 1 exists.
    case_code = case_info[1]

    child_case = []
    for desc in desc_list[1:]:
        # A blank piece carries no child entry.
        if not desc.strip():
            continue
        # Split only on the first ": " so a name that itself contains ": "
        # is kept whole instead of being truncated.
        parts = re.split(r':\s+', desc, maxsplit=1)
        num = parts[0]
        # Without a separator there is no name; keep the row with an empty
        # name rather than raising IndexError.
        name = parts[1] if len(parts) > 1 else ''
        child_case.append([page_code, case_code, num, name])

    return case_info, child_case

In [None]:
def getPageCase(page):
    ''' Analyse all use cases found on one page.

    Placeholder: no logic is implemented yet, so calling it does nothing and
    returns None.

    @param page: page soup, bs4.BeautifulSoup
    '''
    return None

In [34]:
# Pages that are never analysed.
ignore_list = ['index.html', 'page1.html', 'page2.html', 'page3.html', 'start.html', 'start_c_1.html', 'start_g_0.html']

file_dir = 'F:/profile/iSpider/practica/03_Axure/demo/'
file_type = '.html'

# Keep every file in file_dir whose name ends with file_type and is not ignored.
# endswith() is used instead of a substring test so names such as
# "x.html.bak" are not picked up. os.listdir() returns entries in arbitrary
# order, so the names are sorted to make runs reproducible.
file_list = []
for file in sorted(os.listdir(file_dir)):
    if file.endswith(file_type) and file not in ignore_list:
        file_list.append(file)

In [38]:
# Print each collected file name on its own line.
for file in file_list:
    print(file)

a-0_一级页面1_0.html
a-1-1-1_____1_0_1.html
a-1-1-2_____2_0_1.html
a-1-1_二级页面1_0.html
a-1-1_二级页面2_0_1.html
a-1-1_二级页面3_0_1.html
a-2-1_二级页面4_0.html
b-0_一级页面3_0_2.html
b-1-1-1_____3_0_2.html
b-1-1-2_____4_0_2.html
b-1-1_二级页面5_0_2.html
b-1-2-1_____5_0_2.html
b-1-2_二级页面6_0_2.html


In [8]:
# Fresh, independent accumulators for page rows, use-case rows and
# child use-case rows.
page_list, case_list, child_case_list = [], [], []

In [4]:
# `soup` is not created in this cell; it must already hold a parsed page
# (e.g. the value returned by openDoc) before this cell runs.
base = soup.find('div', id='base')

# One result list per data-label value, in the order: page overview,
# normal use cases, abnormal use cases, data logic.
overview, normal, abnormal, logic = [
    base.find_all('div', attrs={'data-label': label})
    for label in ('页面说明', '普通用例', '异常用例', '数据逻辑')
]

In [7]:
# Split the first overview element into its description pieces.
overview_desc = splitContent(overview[0])

# Page row: whitespace-separated fields of the first piece, followed by the
# remaining pieces unchanged. The first field is used as the page code.
page = re.split(r'\s+', overview_desc[0]) + overview_desc[1:]
page_code = page[0]

In [13]:
# Parse every 'normal' use case, tag its row and collect it with its children.
# An empty result list simply yields no iterations.
for use_case in (normal or []):
    case_info, child_case = getUseCase(use_case)
    case_info += ['normal']
    case_list.append(case_info)
    child_case_list.extend(child_case)

In [None]:
# Parse every 'abnormal' use case, tag its row and collect it with its children.
# An empty result list simply yields no iterations.
for use_case in (abnormal or []):
    case_info, child_case = getUseCase(use_case)
    case_info += ['abnormal']
    case_list.append(case_info)
    child_case_list.extend(child_case)

In [None]:
# Parse every 'logic' use case, tag its row and collect it with its children.
# An empty result list simply yields no iterations.
for use_case in (logic or []):
    case_info, child_case = getUseCase(use_case)
    case_info += ['logic']
    case_list.append(case_info)
    child_case_list.extend(child_case)

In [11]:
# Display the child use-case rows accumulated so far.
child_case_list

[['A-0', 'N1', 'N1-1', '普通子用例说明'],
 ['A-0', 'N1', 'N1-2', '普通子用例说明，包括：- 这里是具体的说明- 这里是具体的说明'],
 ['A-0', 'N1', 'N1-3', '普通子用例说明'],
 ['A-0', 'N2', 'N2-1', '普通子用例说明'],
 ['A-0', 'N2', 'N2-2', '普通子用例说明，包括：- 这里是具体的说明- 这里是具体的说明'],
 ['A-0', 'N2', 'N2-3', '普通子用例说明，普通子用例说明，普通子用例说明']]

In [35]:
# Display the collected HTML file names.
file_list

['a-0_一级页面1_0.html',
 'a-1-1-1_____1_0_1.html',
 'a-1-1-2_____2_0_1.html',
 'a-1-1_二级页面1_0.html',
 'a-1-1_二级页面2_0_1.html',
 'a-1-1_二级页面3_0_1.html',
 'a-2-1_二级页面4_0.html',
 'b-0_一级页面3_0_2.html',
 'b-1-1-1_____3_0_2.html',
 'b-1-1-2_____4_0_2.html',
 'b-1-1_二级页面5_0_2.html',
 'b-1-2-1_____5_0_2.html',
 'b-1-2_二级页面6_0_2.html']