2.1 按需运行下面的命令，安装所需的库

In [None]:
#!pip install tqdm
#!pip install lxml

In [1]:
def clean_html(text):
    """Strip markup tags from *text* and trim surrounding whitespace.

    Every ``<...>`` span is removed, including tags whose attributes wrap
    across several lines. Character entities such as ``&amp;`` are left
    untouched.

    Args:
        text: Markup string; may be ``None`` or empty.

    Returns:
        The text with all tags removed and leading/trailing whitespace
        stripped, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # ``[^>]*`` also crosses newlines (``.*?`` does not), so a tag broken
    # over several lines is removed as a whole instead of being left behind.
    return re.sub(r"<[^>]*>", "", text).strip()

def parse_group(file_path, fid):
    """Parse a group XML file into a flat dictionary.

    Args:
        file_path: Path of the XML file to parse.
        fid: Identifier stored under the ``"id"`` key of the result.

    Returns:
        A dict tagged ``"type": "Group"`` holding the text of selected
        child elements of the document root. ``"topics"`` is the list of
        non-empty ``topics/topic_item/name`` values and ``"description"``
        is passed through ``clean_html``.
    """
    root = etree.parse(file_path).getroot()

    def text_at(path):
        # Text of the first element matching *path* below the root.
        return root.findtext(path)

    # Keep only topic entries that actually carry a name.
    topic_names = [
        label
        for label in (
            node.findtext("name")
            for node in root.findall(".//topics/topic_item")
        )
        if label
    ]

    return {
        "type": "Group",
        "id": fid,
        "state": text_at("state"),
        "who": text_at("who"),
        "category_name": text_at("category/name"),
        # The description holds HTML markup; store plain text only.
        "description": clean_html(text_at("description")),
        "topics": topic_names,
        "name": text_at("name"),
        "organizer_name": text_at("organizer/name"),
        "country": text_at("country"),
        "city": text_at("city"),
        "link": text_at("link"),
    }


def parse_member(file_path, fid):
    """Parse a member XML file into a flat dictionary.

    Args:
        file_path: Path of the XML file to parse.
        fid: Identifier stored under the ``"id"`` key of the result.

    Returns:
        A dict tagged ``"type": "Member"`` with the member's topic names
        (non-empty ``topics/topics_item/name`` values) and the text of the
        root's ``link``, ``city``, ``name``, ``state`` and ``country``
        children.
    """
    root = etree.parse(file_path).getroot()

    # filter(None, ...) drops topic entries whose name is missing or empty.
    topic_names = list(
        filter(
            None,
            (
                node.findtext("name")
                for node in root.findall(".//topics/topics_item")
            ),
        )
    )

    member = {"type": "Member", "topics": topic_names, "id": fid}
    # Plain scalar fields, copied in the order they appear in the output.
    for field in ("link", "city", "name", "state", "country"):
        member[field] = root.findtext(field)
    return member

def parse_pastevent(file_path, fid):
    """Parse a past-event XML file into a flat dictionary.

    Args:
        file_path: Path of the XML file to parse.
        fid: Identifier stored under the ``"id"`` key of the result.

    Returns:
        A dict tagged ``"type": "PastEvent"`` with the event name and URL,
        the group name, the list of non-empty host names, the venue
        address fields, the RSVP count as text, and the description passed
        through ``clean_html``.
    """
    root = etree.parse(file_path).getroot()

    def venue(field):
        # Address details live under the <venue> child of the root.
        return root.findtext(f"venue/{field}")

    # Keep only host entries that actually carry a name.
    host_names = [
        label
        for label in (
            node.findtext("name")
            for node in root.findall(".//event_hosts/event_hosts_item")
        )
        if label
    ]

    return {
        "type": "PastEvent",
        "id": fid,
        "name": root.findtext("name"),
        "event_url": root.findtext("event_url"),
        "group_name": root.findtext("group/name"),
        "event_hosts": host_names,
        "address": venue("address_1"),
        "city": venue("city"),
        "country": venue("country"),
        "state": venue("state"),
        "yes_rsvp_count": root.findtext("yes_rsvp_count"),
        "description": clean_html(root.findtext("description")),
    }


def parse_rsvps(file_path, fid):
    """Collect the RSVP entries of one XML file.

    Args:
        file_path: Path of the XML file to parse.
        fid: Identifier stored under the ``"id"`` key of the result.

    Returns:
        ``None`` when ``parse_xml`` yields no tree; otherwise a dict tagged
        ``"type": "RSVPs"`` with the number of entries and, per entry, the
        member name, response and guest count as text.
    """
    print(f"🧾 RSVPs: {file_path}")
    # NOTE(review): parse_xml is defined outside this section; presumably it
    # returns a parsed tree, or None when the file cannot be read.
    tree = parse_xml(file_path)
    if tree is None:
        return None

    # Every <rsvp> or <item> element anywhere below the root is one entry.
    entries = [
        {
            "member": node.findtext(".//member/name"),
            "response": node.findtext(".//response"),
            "guests": node.findtext(".//guests"),
        }
        for node in tree.getroot().xpath(".//rsvp | .//item")
    ]

    return {
        "type": "RSVPs",
        "id": fid,
        "count": len(entries),
        "items": entries,
    }


In [None]:
import os
import re
from pathlib import Path
from pathlib import Path
import tqdm
import json
from lxml import etree

# Generic file-name pattern: type + optional separator (space/_/-) + id +
# optional separator + optional suffix + ".xml". "Memeber" is the spelling
# this pipeline has always matched; the correct "Member" is accepted as well
# so that properly named files are no longer reported as unmatched.
FNAME_RE = re.compile(
    r"^(Group|Memeber|Member|PastEvent|RSVPs)[ _-]?([A-Za-z0-9]+)[ _-]?([A-Za-z0-9\(\)]*)\.xml$",
    re.IGNORECASE,
)


def parse_file_by_name(file_path):
    """Pick the parser for an XML file from its file name and run it.

    The base name must match ``FNAME_RE``; the first group selects the
    record type (case-insensitive) and the second group is the record id.

    Args:
        file_path: Path of the XML file.

    Returns:
        The dict produced by ``parse_group``, ``parse_member`` or
        ``parse_pastevent``, or ``None`` when the name does not match or
        the file is an RSVPs file (those are deliberately skipped).
    """
    filename = os.path.basename(file_path)
    m = FNAME_RE.match(filename)
    if not m:
        print(f"⚠️ 未匹配: {filename}")
        return None

    ftype = m.group(1).lower()
    fid = m.group(2)

    if ftype == "group":
        return parse_group(file_path, fid)
    if ftype in ("memeber", "member"):
        return parse_member(file_path, fid)
    if ftype == "pastevent":
        return parse_pastevent(file_path, fid)
    if ftype == "rsvps":
        return None  # RSVPs files are not processed

    # Defensive fallback: only reachable if FNAME_RE gains a type that has
    # no branch above.
    print(f"⚠️ 未知类型: {filename}")
    return None


def parse_all_xml_in_folder(folder_path):
    """Convert every XML file of a folder into a JSON file.

    Each ``.xml`` file directly inside *folder_path* is handed to
    ``parse_file_by_name``; every truthy result is written to
    ``./Final_result/<original stem>.json`` as UTF-8, indented JSON.

    Args:
        folder_path: Folder holding the XML files.
    """
    source_dir = Path(folder_path)
    target_dir = Path("./Final_result")
    target_dir.mkdir(exist_ok=True)  # make sure the output folder exists

    for entry in tqdm.tqdm(source_dir.iterdir()):
        if not entry.is_file() or entry.suffix.lower() != ".xml":
            continue
        parsed = parse_file_by_name(str(entry))
        if not parsed:
            # Unmatched name or a deliberately skipped type: nothing to write.
            continue
        # The JSON file reuses the XML file's name without its suffix.
        destination = target_dir / f"{entry.stem}.json"
        with open(destination, "w", encoding="utf-8") as out:
            json.dump(parsed, out, ensure_ascii=False, indent=2)

# Convert every XML file found directly under ./All_Unpack.
parse_all_xml_in_folder("./All_Unpack")


2.2、2.3，与上面相同（按需运行下面的命令下载所需的库）

In [None]:
#!pip install spacy
#!python -m spacy download en_core_web_sm


In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from pathlib import Path
from tqdm import tqdm
import json
import re

def safe_str(x):
    """Return ``str(x)``, mapping ``None`` to the empty string."""
    if x is None:
        return ""
    return str(x)

# 1. Load the English model; the "ner" and "parser" components are disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def normalize_doc(text):
    """Normalize one text into a list of lower-cased lemmas.

    Stop words, punctuation, whitespace tokens and number-like tokens are
    dropped; each remaining token contributes its lemma, lower-cased and
    stripped, unless that leaves an empty string.

    Args:
        text: The text to run through the module-level ``nlp`` pipeline.

    Returns:
        The list of normalized lemmas in document order.
    """
    lemmas = (
        tok.lemma_.lower().strip()
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.is_space or tok.like_num)
    )
    return [lemma for lemma in lemmas if lemma]


# File-name pattern of the JSON records: type + optional separator
# (space/_/-) + id + optional separator + optional suffix + ".json".
# Compiled once here instead of on every call. "Memeber" is the spelling the
# pipeline has always matched; the correct "Member" is accepted as well.
_JSON_FNAME_RE = re.compile(
    r"^(Group|Memeber|Member|PastEvent|RSVPs)[ _-]?([A-Za-z0-9]+)[ _-]?([A-Za-z0-9\(\)]*)\.json$",
    re.IGNORECASE,
)

# Record fields that make up the text of each file type, in output order.
_MEMBER_FIELDS = ("name", "city", "topics", "joined")
_TEXT_FIELDS = {
    "group": (
        "who",
        "description",
        "category_name",
        "topics",
        "name",
        "organizer_name",
        "city",
    ),
    "memeber": _MEMBER_FIELDS,
    "member": _MEMBER_FIELDS,
    "pastevent": ("name", "description", "group_name", "address", "city"),
}


def _field_text(value):
    """Render one record field as plain text; lists are space-joined."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        # Join the items themselves rather than using str(list), which
        # would leak brackets, quotes and commas into the text.
        return " ".join(str(item) for item in value if item is not None)
    return str(value)


def process_file(file_path):
    """Read one JSON record and return ``(file stem, tokens)``.

    The record type is taken from the file name. The fields listed for that
    type in ``_TEXT_FIELDS`` are concatenated with single spaces and passed
    to ``normalize_doc``.

    Args:
        file_path: Path of the JSON file.

    Returns:
        A ``(stem, tokens)`` tuple, or ``None`` when the name does not
        match, the file is an RSVPs file (deliberately skipped), or any
        error occurs while reading or tokenizing (the error is printed).
    """
    # One bad file must not abort a batch run, so every failure is reported
    # and turned into None at this boundary.
    try:
        fname = Path(file_path).name
        m = _JSON_FNAME_RE.match(fname)
        if not m:
            print(f"⚠️ 未匹配: {fname}")
            return None

        ftype = m.group(1).lower()
        if ftype == "rsvps":
            # RSVPs files are not processed; skip before touching the disk.
            return None

        fields = _TEXT_FIELDS.get(ftype)
        if fields is None:
            print(f"⚠️ 未知类型: {file_path}")
            return None

        with open(file_path, "r", encoding="utf-8") as f:
            record = json.load(f)

        text = " ".join(_field_text(record.get(key)) for key in fields)
        tokens = normalize_doc(text)
        file_key = Path(file_path).stem  # file name without its suffix
        return file_key, tokens
    except Exception as e:
        print(f"❌ 处理 {file_path} 时出错：{e}")
        return None


def parse_folder_to_dict(folder_path, output_path=None):
    """Tokenize every ``.json`` file of a folder (single-process version).

    Args:
        folder_path: Folder whose direct ``.json`` files are processed.
        output_path: Optional path; when given, the result is also written
            there as UTF-8, indented JSON.

    Returns:
        A dict mapping each file's stem to its token list. Files for which
        ``process_file`` returns a falsy value are left out.
    """
    json_files = []
    for entry in Path(folder_path).iterdir():
        if entry.is_file() and entry.suffix.lower() == ".json":
            json_files.append(entry)

    results = {}
    for json_path in tqdm(json_files, desc="Processing files"):
        outcome = process_file(str(json_path))
        if not outcome:
            continue
        stem, tokens = outcome
        results[stem] = tokens

    # Optional: persist the mapping to disk.
    if output_path:
        with open(output_path, "w", encoding="utf-8") as out:
            json.dump(results, out, ensure_ascii=False, indent=2)

    return results





In [None]:

# Tokenize every JSON record in the folder and save the mapping to a file.
folder = "./Final_result"  # folder holding the JSON files to process
output_file = "normalized_tokens.json"
token_dict = parse_folder_to_dict(folder, output_file)