In [18]:
import os
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs,unquote
import base64
from tqdm import tqdm
import json
import openai
from openai import OpenAI
openai.api_key = os.environ['OPENAI_API_KEY']

from concurrent.futures import ThreadPoolExecutor

In [10]:
# Conferences to crawl, grouped under the tier label used as the top-level output folder.
conference_dict = dict(
    A=["ACL", "ICML", "SIGKDD", "AAAI", "NeurIPS", "IJCAI", "CVPR", "SIGMOD", "ICDE", "ICCV"],
    B=["EMNLP", "EAACL"],
)

# Years to crawl, kept as strings because they are spliced into URLs and file names.
years = list(map(str, range(2019, 2025)))

In [11]:
# Create one output folder per tier and one sub-folder per conference.
for key, conferences in conference_dict.items():
    os.makedirs(f"./{key}", exist_ok=True)
    for conference in conferences:
        os.makedirs(f"./{key}/{conference}", exist_ok=True)

# Create the scratch folders used by the intermediate pipeline stages.
for scratch_dir in ("./tmp", "./tmp/get_search_result"):
    os.makedirs(scratch_dir, exist_ok=True)

In [12]:
# parser
def parser_result_html(html_txt):
    """Extract ``[title, url]`` pairs from the HTML of a search-result page.

    The function looks at the first ``<ol>`` inside ``<main>`` and, for every
    ``<li>`` that has an ``<h2><a>`` link, records the link text together with
    the link target.

    When the link carries a ``u`` query parameter, the target is recovered by
    dropping the first two characters of that parameter and decoding the rest
    as unpadded URL-safe base64.
    NOTE(review): the two dropped characters are presumably a fixed scheme
    prefix used by the search engine's redirect links -- confirm.
    When there is no ``u`` parameter, the raw ``href`` is kept as the target
    (previously such links were recorded with an empty URL).

    Args:
        html_txt: Page source as a string.

    Returns:
        A list of ``[title, url]`` lists, in page order.

    Raises:
        ValueError: If the page has no ``<main>`` container or no ``<ol>``
            inside it (for example an error or consent page).
        Exception: Any decoding error of the ``u`` parameter is re-raised after
            printing the offending query string.
    """
    soup = BeautifulSoup(html_txt, 'html.parser')

    # Fail with an explicit message instead of an AttributeError on None.
    main_container = soup.find('main')
    if main_container is None:
        raise ValueError("No <main> container found in search result page")

    ol_container = main_container.find('ol')
    if ol_container is None:
        raise ValueError("No <ol> result list found in search result page")

    result_list = []

    for li in ol_container.find_all('li'):
        h2 = li.find('h2')
        a_tag = h2.find('a') if h2 else None
        if not a_tag:
            continue

        text = a_tag.get_text(strip=True)
        href = a_tag.get('href') or ""

        parsed_url = urlparse(href)
        token = parse_qs(parsed_url.query).get('u', [''])[0]

        if not token:
            # Link without a redirect parameter: keep the href itself.
            result_list.append([text, href])
            continue

        payload = token[2:]
        # Restore the base64 padding that the encoded parameter omits.
        payload += "=" * (-len(payload) % 4)

        try:
            decoded_url = base64.urlsafe_b64decode(payload).decode('utf-8')
        except Exception:
            # Show the query that could not be decoded, then propagate unchanged.
            print(parsed_url.query)
            raise

        result_list.append([text, decoded_url])
    return result_list

# search conferences
def search_conference(level, conference, year, driver):
    """Search for a conference's accepted papers and save the hits to disk.

    Loads the search page with ``driver``, waits five seconds, parses the page
    source with ``parser_result_html`` and writes one ``title<TAB>url`` line per
    hit to ``./tmp/get_search_result/{level}_{conference}_{year}.txt``.

    Args:
        level: Tier label, used only in the output file name.
        conference: Conference name, used in the query and the file name.
        year: Year string, used in the query and the file name.
        driver: Browser driver object providing ``get`` and ``page_source``.
    """
    driver.get(f"https://www.bing.com/search?q={year}+{conference}+accepted+paper&num=10")
    # Fixed pause before reading the page source.
    time.sleep(5)

    hits = parser_result_html(driver.page_source)

    out_path = f"./tmp/get_search_result/{level}_{conference}_{year}.txt"
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{hit[0]}\t{hit[1]}\n" for hit in hits)

In [13]:
# Configure and start the Edge driver used for the search stage.
options = Options()
for browser_flag in (
    "--headless",               # headless mode: do not open a browser window
    "--disable-gpu",            # disable GPU rendering
    "--no-sandbox",             # disable the sandbox
    "--disable-dev-shm-usage",  # avoid running out of shared memory
):
    options.add_argument(browser_flag)

service = Service(r"D:\Applications\edgedriver\msedgedriver.exe")
driver = webdriver.Edge(options=options, service=service)

In [14]:
# Build `param_list`, the ordered list of (level, conference, year) tuples that every
# later stage iterates over. Later stages store their results positionally, so the order
# of parameters already processed in an earlier run must never change between runs.

# Load the parameter order persisted by a previous run, if any.
if os.path.exists("./tmp/param_string_list.json"):
    with open("./tmp/param_string_list.json", "r", encoding="utf-8") as f:
        param_string_list = json.load(f)
else:
    param_string_list = []

# All parameters requested by the current configuration, keyed by their string form.
current_params = {}
for key in conference_dict.keys():
    for conference in conference_dict[key]:
        for year in years:
            current_params[f"{key}_{conference}_{year}"] = (key, conference, year)

# Previously seen parameters keep their SAVED order (not the dict order), so inserting a
# conference in the middle of `conference_dict` cannot reshuffle them on a later run.
old_param_list = []
seen_param_strings = set()
for param_string in param_string_list:
    if param_string in current_params and param_string not in seen_param_strings:
        old_param_list.append(current_params[param_string])
        seen_param_strings.add(param_string)

# Parameters that were not in the saved list are appended after the old ones.
new_param_list = [
    param for param_string, param in current_params.items()
    if param_string not in seen_param_strings
]

param_list = old_param_list + new_param_list
with open("./tmp/param_string_list.json", "w", encoding="utf-8") as f:
    new_param_string_list = [f"{i[0]}_{i[1]}_{i[2]}" for i in param_list]
    json.dump(new_param_string_list, f)

In [15]:
# Load the search-stage checkpoint (list of already processed parameter strings),
# or start with an empty list when no checkpoint file exists yet.
if os.path.exists("./tmp/get_search_result_checkpoint.json"):
    # Use a context manager so the file handle is closed right after reading.
    with open("./tmp/get_search_result_checkpoint.json", "r", encoding="utf-8") as f:
        get_search_result_checkpoint = json.load(f)
else:
    get_search_result_checkpoint = []


In [16]:
# Run the search stage for every parameter that is not yet in the checkpoint.
try:
    for param in tqdm(param_list):
        param_string = "_".join(param)
        if param_string in get_search_result_checkpoint:
            print(f"skipping {param_string}")
            continue
        search_conference(*param, driver)
        get_search_result_checkpoint.append(param_string)
except Exception as e:
    print(e)
    raise
finally:
    # Always persist progress; close the file explicitly so the data is flushed,
    # and make sure the browser is shut down even if writing the checkpoint fails.
    try:
        with open("./tmp/get_search_result_checkpoint.json", "w", encoding="utf-8") as f:
            json.dump(get_search_result_checkpoint, f)
    finally:
        driver.quit()

100%|██████████| 78/78 [00:00<00:00, 39708.18it/s]

skipping A_ACL_2019
skipping A_ACL_2020
skipping A_ACL_2021
skipping A_ACL_2022
skipping A_ACL_2023
skipping A_ACL_2024
skipping A_ICML_2019
skipping A_ICML_2020
skipping A_ICML_2021
skipping A_ICML_2022
skipping A_ICML_2023
skipping A_ICML_2024
skipping A_SIGKDD_2019
skipping A_SIGKDD_2020
skipping A_SIGKDD_2021
skipping A_SIGKDD_2022
skipping A_SIGKDD_2023
skipping A_SIGKDD_2024
skipping A_AAAI_2019
skipping A_AAAI_2020
skipping A_AAAI_2021
skipping A_AAAI_2022
skipping A_AAAI_2023
skipping A_AAAI_2024
skipping A_NeurIPS_2019
skipping A_NeurIPS_2020
skipping A_NeurIPS_2021
skipping A_NeurIPS_2022
skipping A_NeurIPS_2023
skipping A_NeurIPS_2024
skipping A_IJCAI_2019
skipping A_IJCAI_2020
skipping A_IJCAI_2021
skipping A_IJCAI_2022
skipping A_IJCAI_2023
skipping A_IJCAI_2024
skipping A_CVPR_2019
skipping A_CVPR_2020
skipping A_CVPR_2021
skipping A_CVPR_2022
skipping A_CVPR_2023
skipping A_CVPR_2024
skipping A_SIGMOD_2019
skipping A_SIGMOD_2020
skipping A_SIGMOD_2021
skipping A_SIGMOD_2




In [17]:
# Chat-completions client pointed at the DeepSeek beta endpoint. No api_key argument is
# passed here.
# NOTE(review): presumably the SDK reads the key from the OPENAI_API_KEY environment
# variable -- confirm that it holds a DeepSeek key.
client = OpenAI(base_url="https://api.deepseek.com/beta")
# System prompt (runtime text, sent to the model as-is). It asks the model to read the
# given content and pick the URL most likely to list the accepted papers of a top
# conference, to prefer dblp.org-related URLs for AAAI, to pick the original site for
# CVPR, and to answer in the form `网址为: <url>` with no reasons or explanation appended.
# NOTE(review): "那个" in the first sentence looks like a typo for "哪个"; left unchanged
# because editing the prompt changes what the model receives.
prompt_template = '''请你阅读以下内容，并判断其中最可能是顶级会议接受论文的网址是那个，并给出回答。
AAAI尽量选择跟https://dblp.org/有关的网址。
CVPR选择原网址。
输出格式为: \"网址为: <url>\"，后面不添加任何附加的理由和解释。
'''

def get_most_possible_url(input):
    """Ask the chat model which URL in ``input`` most likely lists the accepted papers.

    Sends ``prompt_template`` as the system message and ``input`` as the user
    message to the module-level ``client`` (model ``deepseek-chat``,
    temperature 0, non-streaming) and returns the answer with the
    ``网址为:`` label removed.

    Args:
        input: Text to analyse; the caller passes the saved search results,
            one ``title<TAB>url`` line per hit.

    Returns:
        The model's answer as a string with the label and surrounding
        whitespace removed. An empty string is returned when the response
        carries no text content.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": prompt_template},
            {"role": "user", "content": input}
        ],
        temperature=0,
        stream=False
    )
    # The content may be missing; treat that as an empty answer instead of crashing.
    content = response.choices[0].message.content or ""
    # Remove the label whether it was written with an ASCII or a full-width colon and
    # with or without a following space, then drop stray whitespace and newlines.
    return re.sub(r"网址为\s*[:：]\s*", "", content).strip()

In [18]:
# Load the URL-selection checkpoint (list of already processed parameter strings),
# or start with an empty list when no checkpoint file exists yet.
if os.path.exists("./tmp/get_most_possible_url_checkpoint.json"):
    # Use a context manager so the file handle is closed right after reading.
    with open("./tmp/get_most_possible_url_checkpoint.json", "r", encoding="utf-8") as f:
        get_most_possible_url_checkpoint = json.load(f)
else:
    get_most_possible_url_checkpoint = []

In [19]:
# Load the URLs selected by earlier runs (one entry per processed parameter, in
# processing order), or start with an empty list when no output file exists yet.
if os.path.exists("./tmp/get_most_possible_url_output.json"):
    # Use a context manager so the file handle is closed right after reading.
    with open("./tmp/get_most_possible_url_output.json", "r", encoding="utf-8") as f:
        get_most_possible_url_output = json.load(f)
else:
    get_most_possible_url_output = []

In [20]:
# Run the URL-selection stage for every parameter that is not yet in the checkpoint.
try:
    for param in tqdm(param_list):
        param_string = "_".join(param)
        if param_string in get_most_possible_url_checkpoint:
            print(f"skipping {param_string}")
            continue
        # Read the saved search results; the variable is deliberately not called `input`
        # so the builtin of that name stays usable in the rest of the notebook.
        with open(f"./tmp/get_search_result/{param_string}.txt", "r", encoding="utf-8") as f:
            search_result_text = f.read()
        url = get_most_possible_url(search_result_text)
        # Output and checkpoint are appended together so both lists stay the same length.
        get_most_possible_url_output.append(url)
        get_most_possible_url_checkpoint.append(param_string)
except Exception as e:
    print(e)
    raise
finally:
    # Always persist progress; context managers make sure both files are flushed and closed.
    with open("./tmp/get_most_possible_url_checkpoint.json", "w", encoding="utf-8") as f:
        json.dump(get_most_possible_url_checkpoint, f)
    with open("./tmp/get_most_possible_url_output.json", "w", encoding="utf-8") as f:
        json.dump(get_most_possible_url_output, f)


100%|██████████| 78/78 [01:55<00:00,  1.48s/it]


In [21]:
# Create the folder that receives the downloaded HTML pages (no error if it already exists).
os.makedirs("./tmp/html",exist_ok=True)

In [22]:
def get_html(url, driver, save_path, wait_seconds=10):
    """Load ``url`` with ``driver``, save the page source to ``save_path`` and return it.

    Args:
        url: Address to open.
        driver: Browser driver object providing ``get`` and ``page_source``.
        save_path: File that receives the page source (written as UTF-8,
            overwriting any existing file).
        wait_seconds: Pause between issuing the request and reading the page
            source. Defaults to 10, the value that used to be hard-coded.

    Returns:
        The page source as a string.
    """
    driver.get(url)
    # Fixed pause before the page source is read.
    time.sleep(wait_seconds)
    html = driver.page_source
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(html)
    return html

In [23]:
# Configure and start a fresh Edge driver for the HTML download stage.
options = Options()
for browser_flag in (
    "--headless",               # headless mode: do not open a browser window
    "--disable-gpu",            # disable GPU rendering
    "--no-sandbox",             # disable the sandbox
    "--disable-dev-shm-usage",  # avoid running out of shared memory
):
    options.add_argument(browser_flag)

service = Service(r"D:\Applications\edgedriver\msedgedriver.exe")
driver = webdriver.Edge(options=options, service=service)

In [24]:
# Reload the selected URLs from disk; a context manager closes the file handle after reading.
with open("./tmp/get_most_possible_url_output.json") as f:
    url_list = json.load(f)

In [25]:
# Load the HTML-download checkpoint (list of already processed parameter strings),
# or start with an empty list when no checkpoint file exists yet.

if os.path.exists("./tmp/get_html_checkpoint.json"):
    # Use a context manager so the file handle is closed right after reading.
    with open("./tmp/get_html_checkpoint.json", "r", encoding="utf-8") as f:
        get_html_checkpoint = json.load(f)
else:
    get_html_checkpoint = []

In [26]:
# Download the selected page for every parameter that is not yet in the checkpoint.
try:
    # Materialise the pairs so tqdm knows the total and can show a real progress bar.
    # zip() stops at the shorter list, so parameters without a selected URL are not visited.
    param_url_pairs = list(zip(param_list, url_list))
    for param, url in tqdm(param_url_pairs):
        param_string = "_".join(param)
        if param_string in get_html_checkpoint:
            print(f"skipping {param_string}")
            continue
        save_path =f"./tmp/html/{param_string}.html"
        get_html(url, driver, save_path)
        get_html_checkpoint.append(param_string)
except Exception as e:
    print(e)
    raise
finally:
    # Always persist progress; close the file explicitly so the data is flushed,
    # and make sure the browser is shut down even if writing the checkpoint fails.
    try:
        with open("./tmp/get_html_checkpoint.json", "w", encoding="utf-8") as f:
            json.dump(get_html_checkpoint, f)
    finally:
        driver.quit()

33it [11:05, 20.17s/it]


Message: unknown error: net::ERR_CONNECTION_RESET
  (Session info: MicrosoftEdge=132.0.2957.127)
Stacktrace:
	GetHandleVerifier [0x00007FF6E1320AF5+13637]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6E15ABC04+2078868]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6E15066E6+1401718]
	(No symbol) [0x00007FF6E10933F7]
	(No symbol) [0x00007FF6E1087AB7]
	(No symbol) [0x00007FF6E10890CD]
	(No symbol) [0x00007FF6E1087CBF]
	(No symbol) [0x00007FF6E1087923]
	(No symbol) [0x00007FF6E1087678]
	(No symbol) [0x00007FF6E1085897]
	(No symbol) [0x00007FF6E1085CEC]
	(No symbol) [0x00007FF6E1097FFA]
	(No symbol) [0x00007FF6E11190CE]
	(No symbol) [0x00007FF6E10FDDCA]
	(No symbol) [0x00007FF6E10D3DA7]
	(No symbol) [0x00007FF6E11188FD]
	(No symbol) [0x00007FF6E10FDA33]
	(No symbol) [0x00007FF6E10D32F4]
	(No symbol) [0x00007FF6E10D2626]
	(No symbol) [0x00007FF6E10D2EE1]
	(No symbol) [0x00007FF6E115E194]
	(No symbol) [0x00007FF6E120607F]
	(No symbol) [0x00007FF6E1161B8

WebDriverException: Message: unknown error: net::ERR_CONNECTION_RESET
  (Session info: MicrosoftEdge=132.0.2957.127)
Stacktrace:
	GetHandleVerifier [0x00007FF6E1320AF5+13637]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6E15ABC04+2078868]
	Microsoft::Applications::Events::EventProperty::empty [0x00007FF6E15066E6+1401718]
	(No symbol) [0x00007FF6E10933F7]
	(No symbol) [0x00007FF6E1087AB7]
	(No symbol) [0x00007FF6E10890CD]
	(No symbol) [0x00007FF6E1087CBF]
	(No symbol) [0x00007FF6E1087923]
	(No symbol) [0x00007FF6E1087678]
	(No symbol) [0x00007FF6E1085897]
	(No symbol) [0x00007FF6E1085CEC]
	(No symbol) [0x00007FF6E1097FFA]
	(No symbol) [0x00007FF6E11190CE]
	(No symbol) [0x00007FF6E10FDDCA]
	(No symbol) [0x00007FF6E10D3DA7]
	(No symbol) [0x00007FF6E11188FD]
	(No symbol) [0x00007FF6E10FDA33]
	(No symbol) [0x00007FF6E10D32F4]
	(No symbol) [0x00007FF6E10D2626]
	(No symbol) [0x00007FF6E10D2EE1]
	(No symbol) [0x00007FF6E115E194]
	(No symbol) [0x00007FF6E120607F]
	(No symbol) [0x00007FF6E1161B83]
	Microsoft::Applications::Events::EventProperty::to_string [0x00007FF6E16681F9+269801]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6E12B71B1+519377]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6E12B24A4+499652]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6E12B25E9+499977]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x00007FF6E12A7566+454790]
	BaseThreadInitThunk [0x00007FFBC3FE7374+20]
	RtlUserThreadStart [0x00007FFBC491CC91+33]


In [27]:
# HTML trimming: create the folder for cleaned HTML (no error if it already exists).
os.makedirs("./tmp/html_clean", exist_ok=True)

In [14]:
def parser_dblp(target_path):
    """Extract the text of every ``<span class="title">`` in a saved page.

    The output file is derived from the input file name, which must look like
    ``{level}_{conference}_{year}.<ext>``; the titles are written one per line
    to ``./{level}/{conference}/{year}.txt``.

    Args:
        target_path: Path of the saved HTML file.

    Raises:
        ValueError: If the document has no ``<body>``, or if the file name does
            not split into exactly three ``_``-separated parts.
    """
    # Context manager so the input file handle is closed right after reading.
    with open(target_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "lxml")
    body_tag = soup.find("body")
    if not body_tag:
        raise ValueError("No body tag found")

    titles = [span.get_text(strip=True) for span in body_tag.find_all("span", class_="title")]
    result = "".join(title + "\n" for title in titles)

    # basename() also understands the platform's native path separator.
    level, conference, year = os.path.basename(target_path).split(".")[0].split("_")
    save_path = f"./{level}/{conference}/{year}.txt"
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(result)

In [None]:
# target_paths = [f"./tmp/html/A_AAAI_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_AAAI_2019.html
./tmp/html/A_AAAI_2020.html
./tmp/html/A_AAAI_2021.html
./tmp/html/A_AAAI_2022.html
./tmp/html/A_AAAI_2023.html
./tmp/html/A_AAAI_2024.html


In [19]:
# target_paths = [f"./tmp/html/A_ACL_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_ACL_2021.html
./tmp/html/A_ACL_2022.html
./tmp/html/A_ACL_2019.html
./tmp/html/A_ACL_2020.html
./tmp/html/A_ACL_2024.html
./tmp/html/A_ACL_2023.html


In [20]:
# target_paths = [f"./tmp/html/A_CVPR_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_CVPR_2019.html
./tmp/html/A_CVPR_2020.html
./tmp/html/A_CVPR_2021.html
./tmp/html/A_CVPR_2022.html
./tmp/html/A_CVPR_2023.html
./tmp/html/A_CVPR_2024.html


In [21]:
# target_paths = [f"./tmp/html/A_ICCV_{i}.html" for i in range(2019, 2025, 2)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=3) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_ICCV_2019.html
./tmp/html/A_ICCV_2021.html
./tmp/html/A_ICCV_2023.html


In [22]:
# target_paths = [f"./tmp/html/A_ICDE_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_ICDE_2020.html
./tmp/html/A_ICDE_2019.html
./tmp/html/A_ICDE_2021.html
./tmp/html/A_ICDE_2022.html
./tmp/html/A_ICDE_2023.html
./tmp/html/A_ICDE_2024.html


In [23]:
# target_paths = [f"./tmp/html/A_ICML_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_ICML_2019.html
./tmp/html/A_ICML_2020.html
./tmp/html/A_ICML_2021.html
./tmp/html/A_ICML_2022.html
./tmp/html/A_ICML_2023.html
./tmp/html/A_ICML_2024.html


In [24]:
# target_paths = [f"./tmp/html/A_IJCAI_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_IJCAI_2021.html
./tmp/html/A_IJCAI_2020.html
./tmp/html/A_IJCAI_2022.html
./tmp/html/A_IJCAI_2019.html
./tmp/html/A_IJCAI_2023.html
./tmp/html/A_IJCAI_2024.html


In [25]:
# target_paths = [f"./tmp/html/A_NeurIPS_{i}.html" for i in range(2019, 2024)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_NeurIPS_2019.html
./tmp/html/A_NeurIPS_2020.html
./tmp/html/A_NeurIPS_2021.html
./tmp/html/A_NeurIPS_2022.html
./tmp/html/A_NeurIPS_2023.html


In [48]:
def parser_papercopilot(target_path):
    """Extract paper titles from the ``paperlist`` table of a saved page.

    For every ``<tr>`` in the table's ``<tbody>``, the text of the ``<a>`` in
    the second ``<td>`` is taken as a title. The output file is derived from
    the input file name, which must look like ``{level}_{conference}_{year}.<ext>``;
    titles are written one per line to ``./{level}/{conference}/{year}.txt``.

    NOTE(review): the table id is matched literally as ``3D"paperlist"`` and
    ``"=\\n"`` sequences are removed from the output; both look like leftovers
    of quoted-printable encoding in a saved .mhtml page -- confirm.

    Args:
        target_path: Path of the saved page.

    Raises:
        ValueError: If the document has no ``<body>``, the table is not found,
            the table has no ``<tbody>``, or the file name does not split into
            exactly three ``_``-separated parts.
    """
    # Context manager so the input file handle is closed right after reading.
    with open(target_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "lxml")
    body_tag = soup.find("body")
    if not body_tag:
        raise ValueError("No body tag found")

    # Locate the table that holds the paper list.
    paperlist_table = body_tag.find("table", id='3D"paperlist"')
    if not paperlist_table:
        raise ValueError("No table with id 'paperlist' found")

    tbody = paperlist_table.find("tbody")
    if not tbody:
        raise ValueError("No tbody found in the table")

    titles = []
    for tr in tbody.find_all("tr"):
        td_list = tr.find_all("td")
        # The title link sits in the second cell, so at least two cells are required.
        if len(td_list) > 1:
            a_tag = td_list[1].find("a")
            if a_tag:
                titles.append(a_tag.get_text(strip=True))
    result = "".join(title + "\n" for title in titles)

    # basename() also understands the platform's native path separator.
    level, conference, year = os.path.basename(target_path).split(".")[0].split("_")
    save_path = f"./{level}/{conference}/{year}.txt"
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(result.replace("=\n",""))

In [50]:
# parser_papercopilot("./tmp/html/A_NeurIPS_2024.mhtml")

In [51]:
# target_paths = [f"./tmp/html/A_SIGKDD_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_SIGKDD_2019.html
./tmp/html/A_SIGKDD_2020.html
./tmp/html/A_SIGKDD_2021.html
./tmp/html/A_SIGKDD_2022.html
./tmp/html/A_SIGKDD_2023.html
./tmp/html/A_SIGKDD_2024.html


In [52]:
# target_paths = [f"./tmp/html/A_SIGMOD_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_dblp, target_paths)

./tmp/html/A_SIGMOD_2019.html
./tmp/html/A_SIGMOD_2022.html
./tmp/html/A_SIGMOD_2020.html
./tmp/html/A_SIGMOD_2023.html
./tmp/html/A_SIGMOD_2021.html


In [97]:
def parser_eaacl(target_path):
    """Extract bold text from paragraphs and list items of a saved page.

    Collected, in this order:
      1. for every ``<p>``: the first ``<strong>`` (if any), then every ``<b>``;
      2. for every ``<li>``: the first ``<strong>`` (if any).

    The output file is derived from the input file name, which must look like
    ``{level}_{conference}_{year}.<ext>``; entries are written one per line to
    ``./{level}/{conference}/{year}.txt`` after removing ``"=\\n"`` sequences.

    Args:
        target_path: Path of the saved HTML file.

    Raises:
        ValueError: If the document has no ``<body>``, or if the file name does
            not split into exactly three ``_``-separated parts.
    """
    # Context manager so the input file handle is closed right after reading.
    with open(target_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "lxml")
    body_tag = soup.find("body")
    if not body_tag:
        raise ValueError("No body tag found")

    entries = []

    for p_tag in body_tag.find_all("p"):
        # Only the first <strong> of a paragraph is used, but every <b>.
        strong_tag = p_tag.find("strong")
        if strong_tag:
            entries.append(strong_tag.get_text().strip())
        entries.extend(b_tag.get_text().strip() for b_tag in p_tag.find_all("b"))

    for li_tag in body_tag.find_all("li"):
        strong_tag = li_tag.find("strong")
        if strong_tag:
            entries.append(strong_tag.get_text().strip())

    result = "".join(entry + "\n" for entry in entries)

    # basename() also understands the platform's native path separator.
    level, conference, year = os.path.basename(target_path).split(".")[0].split("_")
    save_path = f"./{level}/{conference}/{year}.txt"
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(result.replace("=\n",""))

In [98]:
# target_paths = [f"./tmp/html/B_EAACL_{i}.html" for i in range(2019, 2025)]
    
# # 使用 ThreadPoolExecutor 进行多线程处理
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(parser_eaacl, target_paths)

In [102]:
def parser_emnlp(target_path):
    """Extract candidate paper titles from a saved page using four layouts.

    Collected, in this order:
      1. the first ``<span>`` of every ``<li>``;
      2. the first ``<span>`` of every ``<article>``;
      3. the first ``<strong>`` of every ``<p>``;
      4. the first ``<a>`` of every ``<strong>``.

    NOTE(review): one element can match more than one rule (for example a
    linked ``<strong>`` inside a ``<p>``), so the output may contain
    duplicates -- confirm whether that is intended.

    The output file is derived from the input file name, which must look like
    ``{level}_{conference}_{year}.<ext>``; entries are written one per line to
    ``./{level}/{conference}/{year}.txt`` after removing ``"=\\n"`` sequences.

    Args:
        target_path: Path of the saved HTML file.

    Raises:
        ValueError: If the document has no ``<body>``, or if the file name does
            not split into exactly three ``_``-separated parts.
    """
    # Context manager so the input file handle is closed right after reading.
    with open(target_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, "lxml")
    body_tag = soup.find("body")
    if not body_tag:
        raise ValueError("No body tag found")

    entries = []

    # (container tag, child tag) pairs, applied in the order listed in the docstring.
    for container_name, child_name in (
        ("li", "span"),
        ("article", "span"),
        ("p", "strong"),
        ("strong", "a"),
    ):
        for container in body_tag.find_all(container_name):
            child = container.find(child_name)
            if child:
                entries.append(child.get_text(strip=True))

    result = "".join(entry + "\n" for entry in entries)

    # basename() also understands the platform's native path separator.
    level, conference, year = os.path.basename(target_path).split(".")[0].split("_")
    save_path = f"./{level}/{conference}/{year}.txt"
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(result.replace("=\n",""))

In [103]:
target_paths = [f"./tmp/html/B_EMNLP_{i}.html" for i in range(2019, 2025)]

# Parse the pages in parallel with a thread pool. Each task is submitted individually and
# its outcome is checked afterwards, so a failing page is reported instead of being lost
# silently, while the remaining pages are still processed.
with ThreadPoolExecutor(max_workers=4) as executor:
    future_to_path = {executor.submit(parser_emnlp, path): path for path in target_paths}
    for future, path in future_to_path.items():
        error = future.exception()  # waits until this task has finished
        if error is not None:
            print(f"failed to parse {path}: {error!r}")