### Load OpenReview reviews and download arXiv LaTeX sources


In [1]:
import requests
import urllib.request
from zipfile import ZipFile
import os
from pathlib import Path
import tarfile
from verbalist.datasets.openreview.arxiv_cleaner import run_arxiv_cleaner
import shutil
from tqdm import tqdm

In [2]:
import pandas as pd
import json

# Load the paper index and drop the rows whose arXiv link is "-"
# (presumably the CSV's placeholder for "no arXiv version" — TODO confirm).
dataset = pd.read_csv("./verbalist/datasets/openreview/openreview.csv")
dataset = dataset[dataset["arxiv_link"] != "-"]

# Key the records by their whitespace-trimmed OpenReview URL for the join below.
dataset = {item["paper_url"].strip(): item for item in dataset.to_dict("records")}

# JSON is UTF-8 by specification; do not depend on the platform's default encoding.
with open("./verbalist/datasets/openreview/openreview.json", encoding="utf-8") as f:
    json_dataset = json.load(f)

# Attach the reviews from the JSON dump to the matching CSV record.
for item in json_dataset:
    # Trim the JSON-side URL as well: the CSV-side keys were stripped, so an
    # untrimmed URL here would silently fail to match.
    paper_url = item["paper_url"].strip()
    if paper_url in dataset:
        dataset[paper_url]["reviews"] = item["reviews"]

# Papers without a JSON entry still get a "reviews" key, so later cells that
# iterate over the reviews do not fail with a KeyError.
for item in dataset.values():
    item.setdefault("reviews", [])

dataset = list(dataset.values())


def download_arxiv_paper(url):
    """Download, unpack and clean the LaTeX source of one arXiv paper.

    The e-print archive is fetched next to the target folder, extracted into
    it, and deleted. ``run_arxiv_cleaner`` is then run on the folder; the
    code expects it to leave its result in ``<folder>_arXiv``, which replaces
    the raw folder.

    Args:
        url: arXiv abstract URL, e.g. ``https://arxiv.org/abs/2008.12172``.

    Returns:
        Path of the folder that now holds the cleaned sources.

    Raises:
        IndexError: if ``url`` has no ``/abs/`` segment to rewrite.
        tarfile.ReadError: if the downloaded file is not a tar archive.
        FileNotFoundError: if the cleaner did not produce ``<folder>_arXiv``.
    """
    # Rewrite only the "/abs/" path segment; a bare replace("abs", ...) would
    # also rewrite any other "abs" substring of the URL.
    latex_source = url.replace("/abs/", "/e-print/", 1)
    zip_name = latex_source.split("e-print/")[1]

    folder_path = f"./verbalist/datasets/openreview/papers/{zip_name}"
    # Despite the ".zip" suffix the file is opened with tarfile below.
    zip_save_path = f"{folder_path}.zip"
    clean_folder_path = f"{folder_path}_arXiv"

    Path(folder_path).mkdir(exist_ok=True, parents=True)

    if not os.path.isfile(zip_save_path):
        urllib.request.urlretrieve(latex_source, zip_save_path)

    try:
        # NOTE(review): e-prints that are not tar archives raise
        # tarfile.ReadError here — confirm whether they need separate handling.
        with tarfile.open(zip_save_path) as zip_obj:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: refuse members that
                # would be written outside folder_path.
                zip_obj.extractall(folder_path, filter="data")
            else:
                # Older Python without extraction filters.
                zip_obj.extractall(folder_path)
    finally:
        # Remove the archive even when extraction fails, otherwise the broken
        # file would be reused (and fail again) on every later run.
        os.remove(zip_save_path)

    parameters = {
        "input_folder": folder_path,
        "resize_images": False,
        "im_size": 500,
        "compress_pdf": False,
        "pdf_im_resolution": 500,
        "images_allowlist": {},
        "keep_bib": False,
        "commands_to_delete": [],
        "commands_only_to_delete": [],
        "environments_to_delete": [],
        "use_external_tikz": None,
        "svg_inkscape": None,
        "config": None,
        "verbose": False,
    }
    run_arxiv_cleaner(parameters)

    # Only discard the raw sources once the cleaned copy is known to exist.
    if not os.path.isdir(clean_folder_path):
        raise FileNotFoundError(
            f"arxiv cleaner did not produce {clean_folder_path!r}"
        )
    shutil.rmtree(folder_path)
    os.rename(clean_folder_path, folder_path)
    return folder_path


# The download call below is commented out, so this loop currently only reads
# each record's arXiv link (presumably the sources were already fetched into
# ./verbalist/datasets/openreview/papers/ by an earlier run — TODO confirm).
for item in tqdm(dataset):
    paper_url = item["arxiv_link"]
    # arxiv_url = download_arxiv_paper(url=paper_url)

100%|██████████| 65/65 [00:00<00:00, 924168.68it/s]


### Combine each paper's LaTeX files into one document


In [3]:
import re
import os
import glob, os


def remove_new_lines(string):
    """Collapse each run of three or more newline/tab characters into one newline.

    Runs of one or two such characters are left untouched.
    """
    whitespace_run = re.compile(r"[\n\t]{3,}")
    return whitespace_run.sub("\n", string)


def find_main_tex(main_folder):
    r"""Return the path of the main ``.tex`` file directly inside ``main_folder``.

    The main file is the first ``*.tex`` file (in sorted order, so the choice
    is deterministic) whose text contains ``\documentclass``. Sub-folders are
    not searched.

    Args:
        main_folder: folder holding the extracted LaTeX sources.

    Returns:
        Path of the main file, or ``""`` when no file qualifies.
    """
    for tex_path in sorted(glob.glob(f"{main_folder}/*.tex")):
        # LaTeX sources are not always valid UTF-8; undecodable bytes are
        # replaced so that one odd file cannot abort the search.
        with open(tex_path, encoding="utf-8", errors="replace") as tex_file:
            tex_content = tex_file.read()

        if "\\documentclass" in tex_content:
            return tex_path

    return ""


# One \input{...} command; the argument may not contain braces, so two
# commands on the same line are matched separately.
_INPUT_COMMAND = re.compile(r"\\input\{([^{}]*)\}")


def _read_tex_file(path):
    """Read a LaTeX file as text, replacing bytes that are not valid UTF-8."""
    with open(path, encoding="utf-8", errors="replace") as tex_file:
        return tex_file.read()


def _resolve_input_path(paper_folder, section_name):
    r"""Return the file an ``\input`` argument refers to, or None if it is missing."""
    if section_name.endswith(".tex"):
        candidates = [section_name]
    else:
        # Try the name with ".tex" appended first, then the name as written
        # (covers arguments that carry another extension).
        candidates = [f"{section_name}.tex", section_name]

    for candidate in candidates:
        candidate_path = f"{paper_folder}/{candidate}"
        if os.path.isfile(candidate_path):
            return candidate_path
    return None


def _inline_inputs(tex, paper_folder, active_paths):
    r"""Replace each resolvable ``\input{...}`` in ``tex`` by the file's content."""

    def replace(match):
        section_path = _resolve_input_path(paper_folder, match.group(1).strip())
        # Leave the command untouched when the file is not part of the paper
        # folder, or when inlining it again would recurse forever.
        if section_path is None or section_path in active_paths:
            return match.group(0)
        section_content = remove_new_lines(_read_tex_file(section_path))
        return _inline_inputs(
            section_content, paper_folder, active_paths + (section_path,)
        )

    # A callable replacement is inserted literally, so the backslashes in the
    # LaTeX content are not interpreted as regex escapes.
    return _INPUT_COMMAND.sub(replace, tex)


def get_paper_text(paper_folder):
    r"""Return a paper's main ``.tex`` file with its ``\input{...}`` files inlined.

    The main file is located with ``find_main_tex``. Runs of blank lines are
    collapsed with ``remove_new_lines`` in the main file and in every inlined
    file. ``\input`` arguments are resolved relative to ``paper_folder``;
    commands whose file cannot be found are left in the text unchanged.

    Args:
        paper_folder: folder holding the extracted LaTeX sources.

    Returns:
        The combined LaTeX source as a single string.

    Raises:
        FileNotFoundError: if ``paper_folder`` has no main ``.tex`` file.
    """
    main_tex_path = find_main_tex(paper_folder)
    if not main_tex_path:
        raise FileNotFoundError(
            f"no .tex file containing \\documentclass found in {paper_folder!r}"
        )

    main_tex = remove_new_lines(_read_tex_file(main_tex_path))
    return _inline_inputs(main_tex, paper_folder, (main_tex_path,))


base_path = "verbalist/datasets/openreview/papers/"

# Not used below; kept so the cell still exposes the downloaded folders.
papers_paths = [base_path + item for item in os.listdir(base_path)]

# Attach the combined LaTeX source to every record.
for i, dataset_item in enumerate(dataset):
    arxiv_link = dataset_item["arxiv_link"].strip().rstrip("/")
    if "/abs/" in arxiv_link:
        # Same rule as download_arxiv_paper: the folder is named after
        # everything that follows the "abs/" segment, which may itself
        # contain a "/" (e.g. "hep-th/9901001").
        folder_name = arxiv_link.split("/abs/", 1)[1]
    else:
        folder_name = arxiv_link.split("/")[-1]
    papers_path = base_path + folder_name
    latex = get_paper_text(papers_path)
    dataset_item["latex"] = latex

In [5]:
# Inspect the first merged record (paper metadata, reviews and combined LaTeX).
dataset[0]

{'paper_url': 'https://openreview.net/forum?id=VvRbhkiAwR',
 'paper_id': 'VvRbhkiAwR',
 'arxiv_link': 'https://arxiv.org/abs/2008.12172',
 'reviews': [{'id': '1cp_MEsz_cI',
   'original': None,
   'number': 3,
   'cdate': 1594023893979,
   'ddate': None,
   'tcdate': 1594023893979,
   'tmdate': 1594023893979,
   'tddate': None,
   'forum': 'VvRbhkiAwR',
   'replyto': 'VvRbhkiAwR',
   'invitation': 'aclweb.org/ACL/2020/Workshop/NLP-COVID/Paper25/-/Official_Review',
   'content': {'title': 'Review of "Cross-language sentiment analysis of European Twitter messages"  -- interesting trends analysis but some more approach comparisons and tables for the data would be good.',
    'review': 'The authors present an interesting, important and relevant trend analysis of sentiment across languages in several locales during the Covid-19 pandemic, using geo-tagged European Twitter data and pre-trained cross-lingual embeddings within a neural model.\n\nThe main contributions of the paper are: 1) the g

In [20]:
def _review_field(content, key):
    """Return ``content[key]`` as stripped text; missing or None becomes ""."""
    value = content.get(key)
    return "" if value is None else str(value).strip()


# Build one record per (paper, review) pair.
prompts_dataset = []
for dataset_item in dataset:
    # A paper without reviews contributes no records instead of raising.
    for item in dataset_item.get("reviews") or []:
        content = item["content"]
        title = _review_field(content, "title")
        review = _review_field(content, "review")
        rating = _review_field(content, "rating")
        confidence = _review_field(content, "confidence")

        # Label the optional fields only when they are present.
        if rating != "":
            rating = "Rating: " + rating
        if confidence != "":
            confidence = "Confidence: " + confidence

        full_review = f"{title}\n{review}\n{rating}\n{confidence}"
        prompts_dataset.append({
            "full_review":  full_review,
            "latex": dataset_item['latex'],
            'paper_url': dataset_item['paper_url'],
            'arxiv_url': dataset_item['arxiv_link']
        })

In [23]:
# Number of (paper, review) records built in the previous cell.
# NOTE(review): the saved output below is a table, not an integer — it looks
# like it came from a DataFrame view of these records; re-run to refresh.
len(prompts_dataset)
# pd.DataFrame(prompts_dataset)

Unnamed: 0,full_review,latex,paper_url,arxiv_url
0,"Review of ""Cross-language sentiment analysis o...","\n\documentclass[11pt,a4paper]{article}\n\usep...",https://openreview.net/forum?id=VvRbhkiAwR,https://arxiv.org/abs/2008.12172
1,"Review on ""Cross-language sentiment analysis o...","\n\documentclass[11pt,a4paper]{article}\n\usep...",https://openreview.net/forum?id=VvRbhkiAwR,https://arxiv.org/abs/2008.12172
2,Review\nThis is a mostly well-written overview...,"\n\documentclass[11pt,a4paper]{article}\n\usep...",https://openreview.net/forum?id=VvRbhkiAwR,https://arxiv.org/abs/2008.12172
3,Excellent description of a critical COVID-19 d...,"\n\documentclass[11pt,a4paper]{article}\n\Pass...",https://openreview.net/forum?id=0gLzHrE_t3z,https://arxiv.org/abs/2004.10706
4,Overview of a highly important Covid-19 datase...,"\n\documentclass[11pt,a4paper]{article}\n\Pass...",https://openreview.net/forum?id=0gLzHrE_t3z,https://arxiv.org/abs/2004.10706
...,...,...,...,...
145,2nd Place Scheme on Action Recognition Track o...,\n\documentclass[runningheads]{llncs}\n\usepac...,https://openreview.net/forum?id=R6YWiPVOQBo,https://arxiv.org/abs/2008.03996
146,"Interesting method, unclear explanation\n#### ...",\n\documentclass[runningheads]{llncs}\n\usepac...,https://openreview.net/forum?id=R6YWiPVOQBo,https://arxiv.org/abs/2008.03996
147,"The proposed method is somehow novel, but it l...",\n\documentclass[manuscript]{acmart}\n\n\usepa...,https://openreview.net/forum?id=atWaELmguNj7,https://arxiv.org/abs/2208.12133
148,The evaluation and sub-materials show good res...,\n\documentclass[manuscript]{acmart}\n\n\usepa...,https://openreview.net/forum?id=atWaELmguNj7,https://arxiv.org/abs/2208.12133


In [24]:
from datasets import Dataset

# Rebinds `dataset` from the list of record dicts to a `datasets.Dataset`;
# cells above that treat `dataset` as a list need a fresh run after this one.
dataset = Dataset.from_list(dataset)

In [25]:
# dataset.push_to_hub("dim/openreview_raw_65")
# dataset.push_to_hub("dim/openreview_prompts_65")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 69.23ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:02<00:00,  2.32s/it]
Downloading metadata: 100%|██████████| 21.0/21.0 [00:00<00:00, 254kB/s]
