# Collect the metadata from AirScript Document Site

## Get the metadata

To get the page links from the [document site](https://airsheet.wps.cn/docs/), we use a workaround: download the site configuration script directly from this URL:

```bash
mkdir data
cd data
wget https://qn.cache.wpscdn.cn/asdocs/assets/chunks/metadata.cf317f8e.js
```

> The `cf317f8e` hash is likely to change; locate the current file by searching for `metadata.*`.

In [1]:
# Input: the metadata script fetched into the data/ directory.
FILE_METADATA_PATH = "data/metadata.cf317f8e.js"
# Output: JSON Lines file, one record per line, each with an empty label list.
OUTPUT_FILE_PATH = "data/not_labeled.jsonl"
# Title of the sidebar section to extract, compared against each entry's text.
API_DOC_NAME = "API文档(2.0)"
# Version string stamped on every output record.
API_DOC_VERSION = "2.0"

import re
import json

# Read the whole metadata script into memory. The encoding is given
# explicitly because the script is searched for a non-ASCII section title
# (API_DOC_NAME); relying on the platform default encoding can mis-decode
# the file or raise UnicodeDecodeError on non-UTF-8 locales.
# NOTE(review): assumes the downloaded file is UTF-8 encoded - confirm.
with open(FILE_METADATA_PATH, 'r', encoding='utf-8') as f:
    js_content = f.read()

# Accumulator for leaf sidebar entries: filled by extrac_from_apidoc() and
# consumed by output().
data = []

def extrac_from_apidoc(doc):
    """Collect the leaf entries below ``doc`` into the module-level ``data``.

    An entry that has an ``'items'`` key is treated as a group: each of its
    children is visited in order and the group itself is not collected.
    Any other entry is appended to ``data`` unchanged.

    Returns ``None``; the result is only visible through ``data``.
    """
    if 'items' not in doc:
        # Leaf entry: keep it as-is.
        data.append(doc)
        return
    for child in doc['items']:
        extrac_from_apidoc(child)

def map_to_output(doc):
    """Build the unlabeled output record for one sidebar entry.

    Copies ``text`` and ``link`` from ``doc``, stamps the record with
    ``API_DOC_VERSION`` and gives it a fresh, empty ``label`` list.

    Raises ``KeyError`` if ``doc`` has no ``'text'`` or ``'link'`` key.
    """
    record = dict(text=doc['text'], version=API_DOC_VERSION)
    record['link'] = doc['link']
    record['label'] = []
    return record

def output():
    """Write every collected entry to ``OUTPUT_FILE_PATH`` as JSON Lines.

    Each entry in the module-level ``data`` list is converted with
    ``map_to_output`` and written as one JSON object per line. Any existing
    file at that path is overwritten.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly; the platform default encoding can
    # garble the text or raise UnicodeEncodeError on non-UTF-8 locales.
    with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as f:
        for doc in data:
            json_line = json.dumps(map_to_output(doc), ensure_ascii=False)
            f.write(json_line + '\n')

# Find the argument of the JSON.parse(...) call that is assigned to
# window.__VP_SITE_DATA__ in the downloaded script.
pattern = r"window\.__VP_SITE_DATA__\s*=\s*JSON\.parse\((.*?)\);"
match = re.search(pattern, js_content)

if match:
    extracted_text = match.group(1)  # the JSON.parse(...) argument as written
    try:
        # Two decoding passes: the first turns the captured string literal
        # into the JSON text it holds, the second parses that text.
        # NOTE(review): assumes the argument is a double-quoted string
        # literal that is itself valid JSON - confirm against the script.
        json_str = json.loads(extracted_text)
        json_dict = json.loads(json_str)
        side_bar = json_dict['themeConfig']['sidebar']['/']
        # next() with a default avoids an unhandled IndexError when no
        # sidebar section carries the expected title; .get() tolerates
        # sections that have no 'text' key.
        api_doc = next(
            (section for section in side_bar
             if section.get('text') == API_DOC_NAME),
            None,
        )
        if api_doc is None:
            print("未在侧边栏中找到文档:", API_DOC_NAME)
        else:
            # extrac_from_apidoc() fills the module-level `data` list and
            # returns None, so its result is deliberately not assigned.
            extrac_from_apidoc(api_doc)
            output()
    except json.JSONDecodeError as e:
        print("JSON 解析错误:", e)
else:
    print("未找到匹配项, 请检查脚本是否存在")