In [1]:
import tarfile
import json
import os
import re

from tqdm import tqdm
from typing import List

In [2]:
# Source tar archive; it is opened with tarfile and read member by member.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
filename = "/data/horse/ws/s9650707-llm_secrets/datasets/unarxive/unarXive_230324_open_subset.tar.xz"

# Directory that receives one generated Markdown file per JSON record.
md_target_dir = "/data/horse/ws/s9650707-llm_secrets/datasets/unarxive/md3"

# Upper bound on the number of archive members to read; None means no limit.
MAX_FILES = None
# Member count used for the progress-bar total; when set to None the count is
# obtained from the archive via get_num_files_in_tar instead.
FILES_IN_TAR = 5599  # only used for progress bar when extracting

In [3]:
"""def get_jsonl_members(filename: str, max_files: int=None):
    tar_file = tarfile.open(filename)

    jsonl_members = []
    count = 0
    for tar_member in tar_file:
        jsonl_members.append(tar_file.extractfile(tar_member).read())
        count += 1
        if max_files is not None and count >= max_files:
            break

    return jsonl_members
"""

def get_jsonl_members(filename: str, max_files: int = None):
    with tarfile.open(filename) as tar_file:
        count = 0
        for tar_member in tar_file:
            yield tar_file.extractfile(tar_member).read()
            count += 1
            if max_files is not None and count >= max_files:
                break

def get_num_files_in_tar(filename: str) -> int:
    """Return how many members the tar archive at ``filename`` contains.

    All members reported by ``TarFile.getmembers()`` are counted, whatever
    their type. The whole archive has to be scanned to build that list.
    """
    with tarfile.open(filename) as archive:
        members = archive.getmembers()
    return len(members)

In [4]:
def extract_from_json(json_string: str):
    """Decode a single JSON document and return the resulting Python object.

    The input is handed to ``json.loads`` unchanged, so anything that function
    accepts works here and its decoding errors propagate to the caller.
    """
    return json.loads(json_string)

In [5]:
def strip_clean(text: str|None) -> str:
    if text is not None:
        return re.sub(r'\s+', ' ', text.strip())
    else:
        return None
    
def get_abstract(json_obj) -> str:
    """Return the record's abstract text, normalised by ``strip_clean``."""
    abstract = json_obj['abstract']
    return strip_clean(abstract['text'])
    
def get_title(json_obj) -> str:
    """Return the record's ``metadata.title``, normalised by ``strip_clean``."""
    metadata = json_obj['metadata']
    return strip_clean(metadata['title'])

def get_authors(json_obj) -> str:
    """Return the record's ``metadata.authors``, normalised by ``strip_clean``."""
    metadata = json_obj['metadata']
    return strip_clean(metadata['authors'])

def get_journal_ref(json_obj) -> str:
    """Return the record's ``metadata.journal-ref``, normalised by ``strip_clean``.

    A ``None`` value in the record is returned as ``None``.
    """
    metadata = json_obj['metadata']
    return strip_clean(metadata['journal-ref'])
    
def get_doi(json_obj) -> str:
    """Return the record's ``metadata.doi``, normalised by ``strip_clean``.

    A ``None`` value in the record is returned as ``None``.
    """
    metadata = json_obj['metadata']
    return strip_clean(metadata['doi'])

def _replace_span_markers(section, json_obj) -> str:
    """Return the text of ``section`` with its reference/citation markers resolved."""
    text = section['text']

    for span in section['ref_spans']:
        # The referenced objects (formulas, figures, tables) live in
        # json_obj['ref_entries'], keyed by the span's ref_id.
        reference = json_obj['ref_entries'][span['ref_id']]
        ref_type = reference['type']
        if ref_type == 'formula':
            replacement = f"${reference['latex']}$"
        elif ref_type in ('figure', 'table'):
            # Figures and tables are dropped from the text entirely.
            replacement = ''
        else:
            raise ValueError(f"PROBLEM: Unhandled reference type: {reference}")
        text = text.replace(span['text'], replacement)

    for span in section['cite_spans']:
        # Remove the whole in-text marker (span['text']), exactly as done for
        # ref_spans above. Replacing only span['ref_id'] would leave the rest
        # of the marker behind when the id is embedded in a longer marker
        # (presumably "{{cite:<id>}}" in unarXive - confirm against the data).
        text = text.replace(span['text'], '')

    return text


def get_content(json_obj, replace_objects=False, min_length: int = 150) -> List[tuple[str | None, str]]:
    """Collect the body paragraphs of a record as ``(heading, text)`` pairs.

    Args:
        json_obj: Parsed record. ``json_obj['body_text']`` is iterated; each
            entry must provide ``text``, ``sec_number`` and ``section``, plus
            ``ref_spans`` / ``cite_spans`` when ``replace_objects`` is true.
        replace_objects: When true, formula markers are replaced by
            ``$<latex>$``, figure/table markers are removed and citation
            markers are removed.
        min_length: Paragraphs whose text (after marker replacement, before
            whitespace normalisation) is ``min_length`` characters or shorter
            are dropped. This is meant to filter out nearly empty paragraphs
            such as wrongly included titles, author information or lone
            formulas.

    Returns:
        A list of ``(heading, text)`` tuples in document order, both cleaned
        with ``strip_clean``. ``heading`` is ``"<sec_number>: <section>"`` for
        the first kept paragraph under a heading and ``None`` for kept
        paragraphs that continue under the same heading.

    Raises:
        ValueError: If a reference span points to an entry whose ``type`` is
            not ``formula``, ``figure`` or ``table``.
    """
    sections = []
    last_emitted_heading = None

    for section in json_obj['body_text']:
        if replace_objects:
            text = _replace_span_markers(section, json_obj)
        else:
            text = section['text']

        if len(text) <= min_length:
            continue

        heading = f"{section['sec_number']}: {section['section']}"
        # Compare against the last heading that was actually emitted, so a
        # heading is not lost when the first paragraph of its section was
        # filtered out by the length check above.
        if heading == last_emitted_heading:
            heading = None
        else:
            last_emitted_heading = heading

        sections.append((strip_clean(heading), strip_clean(text)))

    return sections


In [6]:
def create_md(json_obj, target_dir: str, filename: str) -> str:
    """Render one parsed record as a Markdown file.

    The file contains the title as a level-1 heading, followed by the
    ``Authors``, ``Published in``, ``DOI``, ``Abstract`` and ``Content``
    level-2 sections. Body paragraphs come from
    ``get_content(json_obj, replace_objects=True)``; a level-3 heading is
    written whenever a paragraph carries a section name.

    Metadata values that are ``None`` are written as the literal text
    ``None``, because they are interpolated directly into the output.

    Args:
        json_obj: Parsed record to render.
        target_dir: Output directory; it is created if it does not exist.
        filename: Name of the Markdown file inside ``target_dir``. An existing
            file with that name is overwritten.

    Returns:
        The path of the written file.
    """
    authors = get_authors(json_obj)
    title = get_title(json_obj)
    abstract = get_abstract(json_obj)
    journal = get_journal_ref(json_obj)
    doi = get_doi(json_obj)
    content = get_content(json_obj, replace_objects=True)

    os.makedirs(target_dir, exist_ok=True)
    md_path = os.path.join(target_dir, filename)

    # Explicit UTF-8: the text may contain non-ASCII characters and the
    # platform's default encoding is not guaranteed to be able to encode them.
    with open(md_path, 'w', encoding='utf-8') as f:
        f.write(f"# {title}\n")
        f.write(f"## Authors\n{authors}\n")
        f.write(f"## Published in\n{journal}\n")
        f.write(f"## DOI\n{doi}\n")
        f.write(f"## Abstract\n{abstract}\n")
        f.write("## Content\n")
        for sec_name, sec_content in content:
            # sec_name is None for paragraphs continuing the previous section.
            if sec_name is not None:
                f.write(f"### {sec_name}\n")
            f.write(f"{sec_content}\n")

    return md_path
        

In [7]:
# Count the archive members only when no precomputed count was configured.
if FILES_IN_TAR is None:
    FILES_IN_TAR = get_num_files_in_tar(filename)

# Progress-bar total: the member count, capped by MAX_FILES when a limit is set.
if MAX_FILES is None:
    total_jsonl_files = FILES_IN_TAR
else:
    total_jsonl_files = min(MAX_FILES, FILES_IN_TAR)

print(f"Total number of jsonl files to be processed: {total_jsonl_files}")


Total number of jsonl files to be processed: 5599


In [8]:
# Running index over all records of all archive members; it numbers the
# output files (doc_00000000.md, doc_00000001.md, ...).
count = 0
for json_string in tqdm(get_jsonl_members(filename, MAX_FILES), total=total_jsonl_files):
    # Each archive member is handled as JSON Lines: one JSON document per line.
    lines = json_string.split(b'\n')
    for line in lines:
        # Empty lines (e.g. after a trailing newline) carry no record.
        if line:
            json_obj = extract_from_json(line)
            create_md(json_obj, target_dir=md_target_dir, filename=f"doc_{count:08d}.md")

            count += 1

100%|██████████| 5599/5599 [28:01<00:00,  3.33it/s]  
