### Download and clean arXiv LaTeX sources


In [9]:
import requests
import urllib.request
from zipfile import ZipFile
import os
from pathlib import Path
import tarfile
from verbalist.datasets.openreview.arxiv_cleaner import run_arxiv_cleaner
import shutil
from tqdm import tqdm

In [6]:
import pandas as pd
import json

# Load the paper table and drop rows whose "arxiv_link" is the "-" marker
# (presumably papers without an arXiv version -- confirm against the CSV).
papers_df = pd.read_csv("./verbalist/datasets/openreview/openreview.csv")
papers_df = papers_df[papers_df["arxiv_link"] != "-"]

# Index the remaining rows by their whitespace-stripped paper URL so the
# reviews from the JSON dump can be attached by key.
dataset = {row["paper_url"].strip(): row for row in papers_df.to_dict("records")}

with open(
    "./verbalist/datasets/openreview/openreview.json", encoding="utf-8"
) as f:
    json_dataset = json.load(f)

for item in json_dataset:
    # Normalise the key the same way as the CSV side; otherwise a URL with
    # stray whitespace would never match and its reviews would be dropped.
    paper_url = item["paper_url"].strip()
    if paper_url in dataset:
        dataset[paper_url]["reviews"] = item["reviews"]

# Final shape: a list of row dicts. Rows with no matching JSON entry have no
# "reviews" key.
dataset = list(dataset.values())

In [None]:
dataset[0]

In [10]:
def download_arxiv_paper(url):
    """Download an arXiv paper's source archive, unpack it and clean it.

    The abstract URL (``.../abs/<id>``) is rewritten to the source URL
    (``.../e-print/<id>``). The archive is stored as
    ``./verbalist/datasets/openreview/papers/<id>.zip``, extracted into
    ``./verbalist/datasets/openreview/papers/<id>`` and then deleted. The
    extracted folder is passed to ``run_arxiv_cleaner``; afterwards the raw
    folder is removed and the ``<id>_arXiv`` folder (where this code expects
    the cleaner's output) is renamed to ``<id>``.

    Args:
        url: arXiv abstract URL containing ``/abs/``.

    Returns:
        None. The result is the cleaned folder on disk.

    Raises:
        IndexError: if the URL contains no ``e-print/`` segment after the
            rewrite (i.e. it was not an ``/abs/`` URL).
        tarfile.ReadError: if the downloaded file is not a tar archive.
        OSError / urllib.error.URLError: on download or filesystem failures.
    """
    # Rewrite only the "/abs/" path segment; a bare replace("abs", ...) would
    # also mangle any other "abs" substring elsewhere in the URL.
    latex_source = url.replace("/abs/", "/e-print/", 1)
    zip_name = latex_source.split("e-print/")[1]

    folder_path = f"./verbalist/datasets/openreview/papers/{zip_name}"
    # The file is named ".zip" but is opened with tarfile below.
    zip_save_path = f"{folder_path}.zip"

    Path(folder_path).mkdir(exist_ok=True, parents=True)

    # Reuse an archive left over from a previous (interrupted) run.
    if not os.path.isfile(zip_save_path):
        urllib.request.urlretrieve(latex_source, zip_save_path)

    with tarfile.open(zip_save_path) as archive:
        # The archive comes from the network: when the interpreter supports
        # extraction filters, use the "data" filter so members cannot escape
        # folder_path (absolute paths, "..", links pointing outside).
        if hasattr(tarfile, "data_filter"):
            archive.extractall(folder_path, filter="data")
        else:
            archive.extractall(folder_path)
    os.remove(zip_save_path)

    parameters = {
        "input_folder": folder_path,
        "resize_images": False,
        "im_size": 500,
        "compress_pdf": False,
        "pdf_im_resolution": 500,
        "images_allowlist": {},
        "keep_bib": False,
        "commands_to_delete": [],
        "commands_only_to_delete": [],
        "environments_to_delete": [],
        "use_external_tikz": None,
        "svg_inkscape": None,
        "config": None,
        "verbose": False,
    }
    run_arxiv_cleaner(parameters)

    # Replace the raw extraction with the cleaned copy, keeping the original
    # folder name.
    shutil.rmtree(folder_path)
    clean_folder_path = f"{folder_path}_arXiv"
    os.rename(clean_folder_path, folder_path)


# Walk every dataset entry with a tqdm progress bar. The download call is
# commented out, so as written the loop only reads each entry's arXiv link.
# NOTE(review): download_arxiv_paper has no return statement, so `arxiv_url`
# would be None if the line below were re-enabled.
for item in tqdm(dataset):
    paper_url = item["arxiv_link"]
    # arxiv_url = download_arxiv_paper(url=paper_url)

100%|██████████| 65/65 [06:03<00:00,  5.59s/it]


### Combine LaTeX sources into one document


In [5]:
import re
import os
import glob, os


# Three or more consecutive newline/tab characters, in any mix.
_NEWLINE_TAB_RUN = re.compile(r"[\n\t]{3,}")


def remove_new_lines(string):
    """Collapse every run of 3+ newline/tab characters into one newline.

    Runs of one or two such characters are left untouched.
    """
    collapsed = _NEWLINE_TAB_RUN.sub("\n", string)
    return collapsed


def find_main_tex(main_folder):
    """Return the path of the top-level ``.tex`` file in *main_folder*.

    Only ``.tex`` files directly inside *main_folder* are inspected (no
    recursion). The first one, in sorted filename order, whose text contains
    ``\\documentclass`` is returned.

    Args:
        main_folder: directory holding the paper's LaTeX sources.

    Returns:
        The matching file path, built as ``f"{main_folder}/<name>.tex"``, or
        an empty string when no file matches.
    """
    # Sort so the choice is reproducible when several files qualify;
    # glob itself makes no ordering promise.
    for tex_path in sorted(glob.glob(f"{main_folder}/*.tex")):
        # errors="replace": sources are not guaranteed to be valid UTF-8 and
        # only an ASCII marker is searched for, so undecodable bytes are
        # irrelevant here and must not abort the scan.
        with open(tex_path, encoding="utf-8", errors="replace") as tex_file:
            tex_content = tex_file.read()

        if "\\documentclass" in tex_content:
            return tex_path

    return ""


def _read_tex_file(path):
    """Read one LaTeX file and collapse long newline/tab runs in it."""
    # errors="replace": sources are not guaranteed to be valid UTF-8; a stray
    # byte should not abort processing of the whole paper.
    with open(path, encoding="utf-8", errors="replace") as tex_file:
        return remove_new_lines(tex_file.read())


def _resolve_input_path(paper_folder, section_name):
    """Map an ``\\input`` argument to a file path inside *paper_folder*."""
    section_path = f"{paper_folder}/{section_name.strip()}"
    if section_path.endswith(".tex"):
        return section_path
    with_extension = section_path + ".tex"
    # Prefer "<name>.tex"; fall back to the literal name only when that is the
    # file that actually exists (e.g. \input{figure.tikz}).
    if not os.path.isfile(with_extension) and os.path.isfile(section_path):
        return section_path
    return with_extension


def get_paper_text(paper_folder):
    """Return the paper's main LaTeX file with ``\\input{...}`` inlined.

    The main file is located with ``find_main_tex``. Every ``\\input{name}``
    found in it is replaced by the content of the referenced file, looked up
    relative to *paper_folder*. Only inputs that appear in the main file are
    expanded; inputs inside included files are not followed, and
    ``\\include`` is not handled. Runs of three or more newline/tab
    characters are collapsed in every file that is read.

    Args:
        paper_folder: directory holding the paper's LaTeX sources.

    Returns:
        The combined LaTeX source as a single string.

    Raises:
        FileNotFoundError: if no main file is found, or an ``\\input`` target
            does not exist.
    """
    main_tex_path = find_main_tex(paper_folder)
    if not main_tex_path:
        raise FileNotFoundError(
            f"no .tex file containing \\documentclass found in {paper_folder}"
        )
    main_tex = _read_tex_file(main_tex_path)

    # Capture the argument non-greedily per brace pair, so several \input
    # commands on one line (or a trailing "}") are parsed correctly.
    # dict.fromkeys de-duplicates while keeping first-seen order.
    section_names = list(
        dict.fromkeys(re.findall(r"\\input\{([^{}\n]+)\}", main_tex))
    )

    # Read everything first so a missing file fails before any substitution.
    section_contents = {
        section_name: _read_tex_file(
            _resolve_input_path(paper_folder, section_name)
        )
        for section_name in section_names
    }

    for section_name, section_content in section_contents.items():
        # Build the literal command by concatenation so braces or parentheses
        # inside the name are reproduced exactly.
        main_tex = main_tex.replace(
            "\\input{" + section_name + "}", section_content
        )
    return main_tex


base_path = "verbalist/datasets/openreview/papers/"

# Each paper lives in its own sub-folder, e.g.
#   verbalist/datasets/openreview/papers/1911.10082
#   verbalist/datasets/openreview/papers/2003.05961
# Keep directories only: the downloader stores its temporary "<id>.zip" files
# in this same folder and they stay behind when extraction fails.
# Sorted so the output order is reproducible.
papers_paths = sorted(
    base_path + item
    for item in os.listdir(base_path)
    if os.path.isdir(base_path + item)
)

# Print each paper's combined LaTeX between separator rules.
for papers_path in papers_paths:
    print(papers_path)
    print("=" * 100)
    latex = get_paper_text(papers_path)
    print(latex)
    print("=" * 100)
    print("=" * 100)
    print("=" * 100)

verbalist/datasets/openreview/papers/2203.09553

\documentclass[11pt]{article}

\usepackage[]{EMNLP2022}

\usepackage{times}
\usepackage{latexsym}

\usepackage[T1]{fontenc}

\usepackage[utf8]{inputenc}

\usepackage[ruled,linesnumbered,vlined]{algorithm2e}
\SetAlFnt{\small}
\SetAlCapFnt{\small}
\SetAlCapNameFnt{\small}
\newcommand{\var}{\texttt}
\let\oldnl\nl%
\newcommand{\nonl}{\renewcommand{\nl}{\let\nl\oldnl}}%

\usepackage{amsfonts,amssymb}
\usepackage{bbm}
\usepackage{multirow}
\usepackage{amsmath}
\usepackage{booktabs} %
\usepackage{tablefootnote}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{makecell}
\usepackage{bbding}
\usepackage{color}
\usepackage{arydshln} %
\newcommand\topalign[1]{%
  \setbox0\hbox{#1}%
  \raisebox{\dimexpr-\ht0+\dp0\relax}{\usebox0}}
\newcommand{\fedr}{\textsc{FedR}}
\newcommand{\fede}{\textsc{FedE}}

\newcommand\blfootnote[1]{%
  \begingroup
  \renewcommand\thefootnote{}\footnote{#1}%
  \addtocounter{footnote}{-1}%
  \endg