In [1]:
import asyncio
import os
import re
import json
from copy import copy, deepcopy
from pathlib import Path
from pprint import pprint, PrettyPrinter
from time import time, sleep
from typing import List, Dict
from uuid import uuid4
from collections import defaultdict

import evaluate
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup, Comment
from doctran import Doctran, ExtractProperty
from dotenv import load_dotenv, find_dotenv
from evaluate import load
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import (
    MarkdownTextSplitter,
    MarkdownHeaderTextSplitter,
    LineType,
    RecursiveCharacterTextSplitter,
)
# Load model directly
from transformers import AutoProcessor, AutoModelForTokenClassification
from loguru import logger
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from main import (
    extract_plan_and_content_wikipedia,
    compare_documents_plans,
    compare_documents_sections,
    extract_plan_and_content_patent,
)

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

llm_default = ChatOpenAI(model_name="gpt-3.5-turbo", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-3.5-turbo-16k", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-3.5-turbo") -> int:
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except ValueError:
        encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def convert_to_markdown(article_dict):
    md_text = ""

    for heading, content in article_dict.items():
        # heading is of form: 'h3 Example'
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(heading[1])
        heading = heading[3:]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{content}\n\n"

    return md_text


def truncated_pprint(obj, N=5):
    """Pretty print an object, truncating lists and strings to N items/characters
    for easier viewing of plan_json objects"""
    def truncate(item, N):
        if isinstance(item, list) and N is not None:
            return item[:N] + (['...'] if len(item) > N else [])
        if isinstance(item, str) and N is not None:
            N = 125
            return item[:N] + ('...' if len(item) > N else '')
        return item

    def trunc_recursive(item, N):
        if isinstance(item, list):
            return [trunc_recursive(i, N) for i in truncate(item, N)]
        elif isinstance(item, dict):
            return {k: trunc_recursive(v, N) for k, v in item.items()}
        else:
            return truncate(item, N)

    truncated_obj = trunc_recursive(obj, N)
    pprint(truncated_obj, sort_dicts=False)

# Test
data = {
    'long_list': list(range(100)),
    'long_string': 'a' * 100,
    'nested': {
        'nested_list': list(range(50)),
        'nested_string': 'b' * 50
    }
}

truncated_pprint(data, 5)


processor = AutoProcessor.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")
model = AutoModelForTokenClassification.from_pretrained("nielsr/layoutlmv3-finetuned-funsd")

{'long_list': [0, 1, 2, 3, 4, '...'],
 'long_string': 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'nested': {'nested_list': [0, 1, 2, 3, 4, '...'],
            'nested_string': 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'}}


Downloading (…)rocessor_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'LayoutLMv3TokenizerFast'.


Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [2]:
arxiv_papers = [
    "https://arxiv.org/pdf/2307.04438.pdf",
    "https://arxiv.org/pdf/2306.14697.pdf",
    "https://arxiv.org/pdf/2302.09051.pdf",
    "https://arxiv.org/pdf/2305.10091.pdf",
    "https://arxiv.org/pdf/2305.17474.pdf",
    "https://arxiv.org/pdf/2306.16960.pdf",
    "https://arxiv.org/pdf/2305.20069.pdf",
    "https://arxiv.org/pdf/2306.08451.pdf",
    "https://arxiv.org/pdf/2306.17003.pdf",
    "https://arxiv.org/pdf/2307.07573.pdf",
]

In [8]:
import requests
import re
import os

def sanitize_filename(filename):
    """Convert string to a valid filename."""
    s = str(filename).strip().replace(' ', '_')
    # Remove any character that is not a word character
    # (alphanumeric + underscore), not a hyphen, or not a period.
    return re.sub(r'(?u)[^-\w.]', '', s)

def get_arxiv_metadata(arxiv_id):
    """Fetch metadata for the given arXiv ID."""
    url = f'https://export.arxiv.org/api/query?id_list={arxiv_id}'
    response = requests.get(url)
    response.raise_for_status()
    data = response.text

    # Use regex to extract the title. There are better ways (like parsing XML),
    # but this is simple and should work for our purpose.
    match = re.search(r'<title>([^<]+)</title>', data)
    title = match.group(1) if match else None
    return {'title': title}

def download_arxiv_pdf(arxiv_url):
    """Given a arxiv_url, download the PDF to the data/arxiv directory."""
    arxiv_id = arxiv_url.split('/')[-1].replace('.pdf', '')
    metadata = get_arxiv_metadata(arxiv_id)

    if metadata.get('title'):
        filename = sanitize_filename(metadata['title']) + '.pdf'
    else:
        filename = f"{arxiv_id}.pdf"

    arxiv_dir = Path('data/arxiv')
    os.makedirs(arxiv_dir, exist_ok=True)

    response = requests.get(arxiv_url)
    output_file = arxiv_dir / filename
    with open(output_file, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded to {output_file}")

# Example usage:
arxiv_url = 'https://arxiv.org/pdf/2306.14697.pdf'
# download_arxiv_pdf(arxiv_url)


In [9]:
for paper in arxiv_papers:
    download_arxiv_pdf(paper)

Downloaded to data\arxiv\Reconfigurable_Intelligent_Surface_Assisted_Railway_Communications_A__survey.pdf
Downloaded to data\arxiv\A_Survey_of_Software-Defined_Smart_Grid_Networks_Security_Threats_and__Defense_Techniques.pdf
Downloaded to data\arxiv\Complex_QA_and_language_models_hybrid_architectures_Survey.pdf
Downloaded to data\arxiv\Multi-Agent_Reinforcement_Learning_Methods_Applications_Visionary__Prospects_and_Challenges.pdf
Downloaded to data\arxiv\Macroeconomic_Effects_of_Inflation_Targeting_A_Survey_of_the_Empirical__Literature.pdf
Downloaded to data\arxiv\Sketching_a_Model_on_Fisheries_Enforcement_and_Compliance_--_A_Survey.pdf
Downloaded to data\arxiv\A_survey_on_the_complexity_of_learning_quantum_states.pdf
Downloaded to data\arxiv\A_Survey_on_Blood_Pressure_Measurement_Technologies_Addressing__Potential_Sources_of_Bias.pdf
Downloaded to data\arxiv\A_survey_on_algebraic_dilatations.pdf
Downloaded to data\arxiv\Literature_Survey_on_the_Container_Stowage_Planning_Problem.pdf


In [10]:
print()


