In [105]:
import requests
from bs4 import BeautifulSoup
import os
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
import os

def pdf_to_html(pdf_path, output_html_path):
    # Extract text and layout information from PDF
    laparams = LAParams()
    html_content = "<html><body>"

    for page_layout in extract_pages(pdf_path, laparams=laparams):
        html_content += "<div class='page'>"
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                text = element.get_text()
                html_content += f"<p>{text}</p>"
        html_content += "</div>"

    html_content += "</body></html>"

    # Save the HTML content to a file
    with open(output_html_path + '.html', 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"HTML file saved to {output_html_path}")
    return output_html_path

def search_biorxiv(query, max_results=10, perpage=75, page=0, paper_links=[]):
    search_url = f"https://www.biorxiv.org/search/%2522{query}%2522%20numresults%3A{perpage}%20sort%3Apublication-date%20direction%3Adescending?page={page}"
    print(search_url)
    response = requests.get(search_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    papers = soup.find_all('li', class_='search-result')
    total = int(soup.find('h1', {"id": 'page-title'}).get_text().split()[0])
    for paper in papers:
        link = paper.find('a', class_='highwire-cite-linked-title')
        
        if link:
            paper_links.append(f"https://www.biorxiv.org{link.get('href')}")
    if (perpage * page > total):
        print('done finding papers')
        print('found ' + str(len(paper_links)))
        return paper_links[0:total]
    else:
        print('getting the next page')
        print(page)
        return search_biorxiv(query, max_results, perpage, page + 1, paper_links)

def download_pdf(paper_url, output_dir='papers'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    response = requests.get(paper_url + '.full.pdf', stream=True)
    #https://www.biorxiv.org/content/10.1101/2021.06.23.449389v1.full.pdf
    #https://www.biorxiv.org/content/10.1101/2024.07.31.606100v1

    paper_id = paper_url.split('/')[-1]
    file_path = os.path.join(output_dir, f"{paper_id}.pdf")


    with open(file_path, 'wb') as fd:
        for chunk in response.iter_content(2048):
            fd.write(chunk)

    print(f"Downloaded: {file_path}")
    return file_path

import os
import json

def create_jsonl_from_directory(directory_path, output_jsonl_path, max_tokens=2048):
    # List all files in the directory
    file_list = os.listdir(directory_path)

    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for filename in file_list:
            file_path = os.path.join(directory_path, filename)
            
            # Check if the path is a file (and not a directory)
            if os.path.isfile(file_path):
                # Read the contents of the file
                with open(file_path, 'r', encoding='utf-8') as f:
                    file_contents = f.read()

                # Split the file contents into chunks of max_tokens words
                words = file_contents.split()
                for i in range(0, len(words), max_tokens):
                    chunk = words[i:i + max_tokens]
                    chunk_text = " ".join(chunk)

                    # Create a JSON object with the chunk contents
                    json_object = {"text": chunk_text}

                    # Write the JSON object as a JSONL line
                    jsonl_file.write(json.dumps(json_object) + '\n')

def main():
    query = "graph genome"
    max_results = 100
    
    print("Searching for papers...")
    paper_urls = search_biorxiv(query, max_results=max_results)
    print(paper_urls)
    # these ones get stuck parsing
    remove = [
        "https://www.biorxiv.org/content/10.1101/2024.02.14.580266v1",
        "https://www.biorxiv.org/content/10.1101/2023.10.04.560829v1"
    ]
    print("Downloading papers...")
    for r in remove:
        if r in paper_urls:
            paper_urls.remove(r)
    for url in paper_urls:
        pdfpath = download_pdf(url)
        htmlpath = pdf_to_html(pdfpath, 'html/' + pdfpath)

if __name__ == "__main__":
    main()

# Example usage:
directory_path = "html/papers"
output_jsonl_path = "output.jsonl"
create_jsonl_from_directory(directory_path, output_jsonl_path, 2048)

Searching for papers...
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=0
getting the next page
0
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=1
getting the next page
1
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=2
getting the next page
2
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=3
getting the next page
3
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=4
getting the next page
4
https://www.biorxiv.org/search/%2522graph genome%2522%20numresults%3A75%20sort%3Apublication-date%20direction%3Adescending?page=5
getting the next page
5
https://www.biorxiv.org/search/%2522graph genome%252

TypeError: 'PDFObjRef' object is not iterable

In [43]:
!cat html/papers/*.html > papers.txt

In [45]:
import json

def create_jsonl_from_directory(directory_path, output_jsonl_path):
    # List all files in the directory
    file_list = os.listdir(directory_path)

    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for filename in file_list:
            file_path = os.path.join(directory_path, filename)
            
            # Check if the path is a file (and not a directory)
            if os.path.isfile(file_path):
                # Read the contents of the file
                with open(file_path, 'r', encoding='utf-8') as f:
                    file_contents = f.read()

                # Create a JSON object with the file contents
                json_object = {"text": file_contents}

                # Write the JSON object as a JSONL line
                jsonl_file.write(json.dumps(json_object) + '\n')

# Example usage:
directory_path = "html/papers"
output_jsonl_path = "output.jsonl"
create_jsonl_from_directory(directory_path, output_jsonl_path)

In [140]:
import os
import json

def create_jsonl_from_directory(directory_path, output_jsonl_path, max_tokens=2048):
    # List all files in the directory
    file_list = os.listdir(directory_path)

    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for filename in file_list:
            file_path = os.path.join(directory_path, filename)
            
            # Check if the path is a file (and not a directory)
            if os.path.isfile(file_path):
                # Read the contents of the file
                with open(file_path, 'r', encoding='utf-8') as f:
                    file_contents = f.read()

                # Split the file contents into chunks of max_tokens words
                words = file_contents.split()
                for i in range(0, len(words), max_tokens):
                    chunk = words[i:i + max_tokens]
                    chunk_text = " ".join(chunk)

                    # Create a JSON object with the chunk contents
                    json_object = {"text": chunk_text}

                    # Write the JSON object as a JSONL line
                    jsonl_file.write(json.dumps(json_object) + '\n')

# Example usage:
directory_path = "html/papers"
output_jsonl_path = "output.jsonl"
create_jsonl_from_directory(directory_path, output_jsonl_path, 2048)


In [141]:
!head output.jsonl


{"text": "<html><body><div class='page'><p>bioRxiv preprint </p><p>doi: </p><p>https://doi.org/10.1101/2023.05.12.540616 </p><p>; </p><p>this version posted June 30, 2023. </p><p>The copyright holder for this preprint </p><p>(which was not certified by peer review) is the author/funder, who has granted bioRxiv a license to display the preprint in perpetuity. It is made </p><p>available under a </p><p>CC-BY 4.0 International license . </p><p>Compression algorithm for colored de Bruijn graphs </p><p>Amatur Rahman1 </p><p>Yoann Dufresne4,5 </p><p>Paul Medvedev1,2,3 </p><p>1 Department of Computer Science and Engineering, The Pennsylvania State University 2 Department of Biochemistry and Molecular Biology, The Pennsylvania State University 3 Huck Institutes of the Life Sciences, The Pennsylvania State University 4 Institut Pasteur, Universit\u00b4e Paris Cit\u00b4e, G5 Sequence Bioinformatics, Paris, France 5 Institut Pasteur, Universit\u00b4e Paris Cit\u00b4e, Bioinformatics and Biostatis

In [144]:
import os
import json

def create_jsonl_from_directory(directory_path, output_jsonl_path, max_tokens=2048):
    # List all files in the directory
    file_list = os.listdir(directory_path)

    with open(output_jsonl_path, 'w', encoding='utf-8') as jsonl_file:
        for filename in file_list:
            file_path = os.path.join(directory_path, filename)
            
            # Check if the path is a file (and not a directory)
            if os.path.isfile(file_path):
                # Read the contents of the file
                with open(file_path, 'r', encoding='utf-8') as f:
                    file_contents = f.read()

                # Split the file contents into chunks of max_tokens characters
                for i in range(0, len(file_contents), max_tokens):
                    chunk = file_contents[i:i + max_tokens]

                    # Create a JSON object with the chunk contents
                    json_object = {"text": chunk}

                    # Write the JSON object as a JSONL line
                    jsonl_file.write(json.dumps(json_object) + '\n')

# Example usage:
directory_path = "html/papers"
output_jsonl_path = "output.jsonl"
create_jsonl_from_directory(directory_path, output_jsonl_path)


In [145]:
!wc -l output.jsonl

   19217 output.jsonl


In [146]:
!head -n 19000 output.jsonl > data/train.jsonl

In [147]:
!tail -n +19000 output.jsonl > data/valid.jsonl

In [148]:
!tail -n +50 data/valid.jsonl > data/test.jsonl

In [149]:
!wc -l data/train.jsonl

   19000 data/train.jsonl


In [150]:
!wc -l data/valid.jsonl

     218 data/valid.jsonl


In [151]:
!wc -l data/test.jsonl

     169 data/test.jsonl


In [152]:
!head data/train.jsonl

{"text": "<html><body><div class='page'><p>bioRxiv preprint \n</p><p>doi: \n</p><p>https://doi.org/10.1101/2023.05.12.540616\n</p><p>; \n</p><p>this version posted June 30, 2023. \n</p><p>The copyright holder for this preprint\n</p><p>(which was not certified by peer review) is the author/funder, who has granted bioRxiv a license to display the preprint in perpetuity. It is made \n</p><p>available under a\n</p><p>CC-BY 4.0 International license\n.\n</p><p>Compression algorithm for colored de Bruijn graphs\n</p><p>Amatur Rahman1\n</p><p>Yoann Dufresne4,5\n</p><p>Paul Medvedev1,2,3\n</p><p>1 Department of Computer Science and Engineering, The Pennsylvania State University\n2 Department of Biochemistry and Molecular Biology, The Pennsylvania State University\n3 Huck Institutes of the Life Sciences, The Pennsylvania State University\n4 Institut Pasteur, Universit\u00b4e Paris Cit\u00b4e, G5 Sequence Bioinformatics, Paris, France\n5 Institut Pasteur, Universit\u00b4e Paris Cit\u00b4e, Bioin