In [2]:
import asyncio
import os
import re
import json
from copy import copy, deepcopy
from pathlib import Path
from pprint import pprint, PrettyPrinter
from time import time, sleep
from typing import List, Dict
from uuid import uuid4
from collections import defaultdict

import evaluate
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup, Comment
from doctran import Doctran, ExtractProperty
from dotenv import load_dotenv, find_dotenv
from evaluate import load
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import (
    OpenAIEmbeddings,
    HuggingFaceEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import (
    MarkdownTextSplitter,
    MarkdownHeaderTextSplitter,
    LineType,
    RecursiveCharacterTextSplitter,
)
from loguru import logger
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from main_ import (
    extract_plan_and_content_wikipedia,
    compare_documents_content,
    compare_documents_sections,
    extract_plan_and_content_patent,
)

_ = load_dotenv(find_dotenv())
openai.api_key = os.getenv('OPENAI_API_KEY')

llm_default = ChatOpenAI(model_name="gpt-4o-mini", streaming=True)
llm_16k = ChatOpenAI(model_name="gpt-4o-mini", streaming=True)

def num_tokens_from_string(string: str, encoding_name: str = "gpt-4o-mini") -> int:
    """Returns the number of tokens in a text string."""
    try:
        encoding = tiktoken.get_encoding(encoding_name)
    except ValueError:
        encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def convert_to_markdown(article_dict):
    md_text = ""

    for heading, content in article_dict.items():
        # heading is of form: 'h3 Example'
        # Define the markdown equivalent for the heading level
        heading_level = "#" * int(heading[1])
        heading = heading[3:]
        # Append the heading and the content to the markdown text
        md_text += f"{heading_level} {heading}\n\n{content}\n\n"

    return md_text


def truncated_pprint(obj, N=5):
    """Pretty print an object, truncating lists and strings to N items/characters
    for easier viewing of plan_json objects"""
    def truncate(item, N):
        if isinstance(item, list) and N is not None:
            return item[:N] + (['...'] if len(item) > N else [])
        if isinstance(item, str) and N is not None:
            N = 125
            return item[:N] + ('...' if len(item) > N else '')
        return item

    def trunc_recursive(item, N):
        if isinstance(item, list):
            return [trunc_recursive(i, N) for i in truncate(item, N)]
        elif isinstance(item, dict):
            return {k: trunc_recursive(v, N) for k, v in item.items()}
        else:
            return truncate(item, N)

    truncated_obj = trunc_recursive(obj, N)
    pprint(truncated_obj, sort_dicts=False)

# Test
data = {
    'long_list': list(range(100)),
    'long_string': 'a' * 100,
    'nested': {
        'nested_list': list(range(50)),
        'nested_string': 'b' * 50
    }
}

truncated_pprint(data, 5)


  warn_deprecated(


{'long_list': [0, 1, 2, 3, 4, '...'],
 'long_string': 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
 'nested': {'nested_list': [0, 1, 2, 3, 4, '...'],
            'nested_string': 'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'}}


In [3]:
plan_json = await extract_plan_and_content_patent('data/patents/CONDOM FOR ORAL-GENITAL USE.txt')

[32m2025-01-02 11:04:12.242[0m | [1mINFO    [0m | [36mmain_[0m:[36mextract_plan_and_content[0m:[36m1092[0m - [1m
	Extracting plan and content for: data/patents/CONDOM FOR ORAL-GENITAL USE.txt[0m
[32m2025-01-02 11:04:12.242[0m | [1mINFO    [0m | [36mmain_[0m:[36mload_patent_file[0m:[36m186[0m - [1mLoading patent file: data/patents/CONDOM FOR ORAL-GENITAL USE.txt[0m
[32m2025-01-02 11:04:12.243[0m | [1mINFO    [0m | [36mmain_[0m:[36mload_patent_file[0m:[36m213[0m - [1mExtracted 4 sections from patent file named: ['title', 'descr', 'claim_1', 'pdfep'][0m
[32m2025-01-02 11:04:12.244[0m | [1mINFO    [0m | [36mmain_[0m:[36mdivide_sections_if_too_large[0m:[36m336[0m - [1mDividing sections if too large in plan and section content.[0m
[32m2025-01-02 11:04:12.426[0m | [1mINFO    [0m | [36mmain_[0m:[36mdivide_sections_if_too_large[0m:[36m353[0m - [1mContent character length: 20525 tokens: 4684 for 'descr'[0m
[32m2025-01-02 11:04:13.200[

In [4]:
truncated_pprint(plan_json)

{'id': '901018fe-0a22-40cb-9e61-fdb7068d3375',
 'title': 'CONDOM FOR ORAL-GENITAL USE',
 'abstract': 'EP\t0600858\tB1\t2000-05-17\ten\tDESCR\t1\t<heading '
             'id="h0001"><u><b>Background of the Invention</b></u></heading><p '
             'id="p0001" num=...',
 'title_embedding_1': [-0.01704478546567264,
                       0.014600377691821645,
                       0.00965871145344032,
                       -0.006282786973302903,
                       0.0017804532770638595,
                       '...'],
 'title_embedding_2': [-0.0156312994658947,
                       0.009681111201643944,
                       -0.025443455204367638,
                       0.007561311591416597,
                       -0.04576288163661957,
                       '...'],
 'abstract_embedding_1': [-0.018546617592705147,
                          0.004004383195714558,
                          0.0038718101190326724,
                          -0.013624421075753727,
                    

In [5]:
Path(f'{plan_json["title"]}.json').absolute()

PosixPath('/home/xavier/document_embedding_analysis/CONDOM FOR ORAL-GENITAL USE.json')

In [6]:
with open(f'{plan_json["title"]}.json', 'w') as file:
    json.dump(plan_json, file, indent=4)

In [7]:
plan_json['title']

'CONDOM FOR ORAL-GENITAL USE'

In [8]:
with open('condom.json', 'r') as file:
    loaded = json.load(file)

loaded.keys()

FileNotFoundError: [Errno 2] No such file or directory: 'condom.json'

In [9]:
with open('data/2022week30_EP0600000_extract.txt', 'r') as f:
    text = f.read()

with open('data/2022week30_EP0600000_extract.txt', 'r') as f:
    lines = f.readlines()

In [10]:
len(lines)

200

* \ten\t shows it's English
* \tfr\t shows it's French etc.
* Contains TITLE, DESCR, and CLAIM
* Sometimes CLAIM 1 and CLAIM 2 are written in different languages. Do we still need to translate them? No. Even though it says 'CLAIM 2' it's actually the same information, just take the english bit.

Results
* Only ever 1 x CLAIM
* Split on TITLE and everything is fine.

In [11]:
claim_en = lines[20]
claim_de = lines[21]
claim_fr = lines[22]
claim_de

'EP\t0668908\tB1\t2000-05-03\tde\tCLAIM\t2\t<claim id="c-de-01-0001" num="0001"><claim-text>Verfahren zur differenzierten Diagnose einer NematodenInfektion mit einem speziellen Nematoden-Typ, das im Fall von gemischten Nematoden-Infektionen anwendbar ist, wobei bei dem Verfahren eine DNA-Sequenz, die das β-Tubulin-Gen oder ein Teil dieses Gens dieser Nematode ist, als Oligonukleotid-Sonde und/oder Primer verwendet wird, und der Teil mindestens 10, vorzugsweise mindestens 18 Nukleotide, umfaßt.</claim-text></claim><claim id="c-de-01-0002" num="0002"><claim-text>Verfahren nach Anspruch 1, bei dem die spezielle Nematode Haemonchus contortus und das β-Tubulin-Gen das β-Tubulin-Gen von Haemonchus contortus ist.</claim-text></claim><claim id="c-de-01-0003" num="0003"><claim-text>Verfahren nach Anspruch 1 oder 2, bei dem die Sonde und/oder der Primer einen Teil der 3\'-Hälfte des β-Tubulin-Gens umfaßt.</claim-text></claim><claim id="c-de-01-0004" num="0004"><claim-text>Verfahren nach einem de

In [12]:
claim_en

'EP\t0668908\tB1\t2000-05-03\ten\tCLAIM\t1\t<claim id="c-en-01-0001" num="0001"><claim-text>A method for differentiated diagnosis of nematode infection with a specific type of nematode useful in the case of mixed nematode infections said method comprising use of a DNA sequence, said DNA sequence being the β-tubulin gene or a part of said gene of said nematode as oligonucleotide probe and/or primer, said part comprising at least 10, preferably at least 18 nucleotides.</claim-text></claim><claim id="c-en-01-0002" num="0002"><claim-text>A method according to claim 1 wherein the specific nematode is <u>Haemonchus</u> <u>contortus</u> and the β-tubulin gene is the β-tubulin gene of <u>Haemonchus</u> <u>contortus</u>.</claim-text></claim><claim id="c-en-01-0003" num="0003"><claim-text>A method according to claim 1 or 2 wherein the probe and/or primer comprises a part of the 3\' half of the β-tubulin gene.</claim-text></claim><claim id="c-en-01-0004" num="0004"><claim-text>A method according 

In [13]:
lines_en = [line for line in lines if '\ten\t' in line]
lines_en[0]

'EP\t0600858\tB1\t2000-05-17\ten\tTITLE\t1\tCONDOM FOR ORAL-GENITAL USE\n'

In [14]:
lines_en[4]

'EP\t0600865\tB1\t2000-09-20\ten\tTITLE\t1\tXYLANASE PRODUCTION\n'

In [15]:
a = defaultdict(str)
a[1] += 'hello'
a

defaultdict(str, {1: 'hello'})

In [16]:
def split_patents_into_individual_files(patents_file):
    """Read in a file containing many patents. Split each patent into its own file, keeping
    only the english parts, and write to disk."""
    # Read in file
    with open(patents_file, 'r') as f:
        lines = f.readlines()
    # Get all eng sections
    lines_en = [line for line in lines if '\ten\t' in line]
    # Split each on TITLE and write to its own file with TITLE as filename
    os.makedirs('data/patents', exist_ok=True)
    title = 'no title found'
    # Create dict of patents
    patents: defaultdict = defaultdict(str)
    for i, x in enumerate(lines_en):
        if '\tTITLE\t' in x:
            title = x.split('\t')[-1].strip()
        patents[title] += x

    # Write each patent to its own file
    for title, content in patents.items():
        filename_friendly_title = "".join(i for i in title if i not in "\/:*?<>|")
        with open(f'data/patents/{filename_friendly_title}.txt', 'w') as f:
            f.write(content)
            logger.info(f'Wrote file: {filename_friendly_title}.txt')

def load_patent_file(patent_file) -> Dict[str, str]:
    """Read in a patent file and return a dict with keys as section titles and values the content.

    Parameters
    ----------
    patent_file : str
        Path to the patent file.

    Returns
    -------
    patent_dict : dict
        Dict with keys as section titles and values the content. Keys are ['title',
        'descr', 'claim_1', 'claim_2', ..., 'claim_n', 'pdfep']. Not all patents
        will have all keys. All will have 'title' at a minimum.
    """
    # Read file
    with open(patent_file, 'r') as f:
        lines: list = f.readlines()

    # Get all english sections
    lines_en: list = [line for line in lines if '\ten\t' in line]

    # Convert into dict with keys as section titles and values the content
    patent_dict = {}
    total_claims = 1
    for x in lines_en:
        if '\tTITLE\t' in x:
            patent_dict['title'] = x
        elif '\tDESCR\t' in x:
            patent_dict['descr'] = x
        elif '\tCLAIM\t' in x:
            # Some patents have multiple claims, so we need to number them
            patent_dict[f'claim_{total_claims}'] = x
            total_claims += 1
        elif '\tPDFEP' in x:
            patent_dict['pdfep'] = x
        else:
            raise ValueError(f"Expected sections in [TITLE, DESCR, CLAIM, PDFEP]. Received: {x}")

    return patent_dict


In [17]:
a = load_patent_file('data/patents/XYLANASE PRODUCTION.txt')
a.keys()

dict_keys(['title', 'descr', 'claim_1', 'pdfep'])

In [18]:
patents: defaultdict = defaultdict(list)
total = 0
for i, x in enumerate(lines_en):
    if '\tTITLE\t' in x:
        print()
        total += 1
        print(f"Total Patents: {total}")
    print(rf"{i}/{len(lines_en)} - {repr(x[:125])}")
    patents[f"patent_{total}"].append(x)


Total Patents: 1
0/93 - 'EP\t0600858\tB1\t2000-05-17\ten\tTITLE\t1\tCONDOM FOR ORAL-GENITAL USE\n'
1/93 - 'EP\t0600858\tB1\t2000-05-17\ten\tDESCR\t1\t<heading id="h0001"><u><b>Background of the Invention</b></u></heading><p id="p0001" num='
2/93 - 'EP\t0600858\tB1\t2000-05-17\ten\tCLAIM\t1\t<claim id="c-en-01-0001" num="0001"><claim-text>A condom (31) suitable for oral-genital u'
3/93 - 'EP\t0600858\tB1\t2000-05-17\ten\tPDFEP\t1\thttps://data.epo.org/publication-server/pdf-document?cc=EP&pn=0600858&ki=B1&pd=2000-05-17'

Total Patents: 2
4/93 - 'EP\t0600865\tB1\t2000-09-20\ten\tTITLE\t1\tXYLANASE PRODUCTION\n'
5/93 - 'EP\t0600865\tB1\t2000-09-20\ten\tDESCR\t1\t<p id="p0001" num="0001">This invention lies in the field of recombinant DNA technology. '
6/93 - 'EP\t0600865\tB1\t2000-09-20\ten\tCLAIM\t1\t<claim id="c-en-01-0001" num="0001"><claim-text>A recombinant DNA material comprising a n'
7/93 - 'EP\t0600865\tB1\t2000-09-20\ten\tPDFEP\t1\thttps://data.epo.org/publication-server/pdf-doc

In [19]:
for x in patents['patent_1']:
    print(num_tokens_from_string(x))

30
4684
904
56


In [20]:
type(patents['patent_1'])

list

In [21]:
condom = patents['patent_1']
condom_dict = {}
total_claims = 1
for x in condom:
    if '\tTITLE\t' in x:
        condom_dict['title'] = x
    elif '\tDESCR\t' in x:
        condom_dict['descr'] = x
    elif '\tCLAIM\t' in x:
        condom_dict[f'claim_{total_claims}'] = x
        total_claims += 1
    elif '\tPDFEP' in x:
        condom_dict['pdfep'] = x
    else:
        raise ValueError(f"Expected sections in [TITLE, DESCR, CLAIM, PDFEP]. Received: {x}")

In [22]:
for k, v in condom_dict.items():
    print(k, num_tokens_from_string(v))

title 30
descr 4684
claim_1 904
pdfep 56


In [32]:
from typing import Dict


async def _extract_title(string: str) -> str:
    """Extract a title from `string` that is max 7 words long."""
    doctran = Doctran(
        openai_api_key=os.getenv("OPENAI_API_KEY"), openai_model="gpt-4o-mini"
    )
    document = doctran.parse(content=string)
    properties = ExtractProperty(
        name="title",
        description="The title of the document (max 7 words).",
        type="string",
        required=True,
    )
    document = document.extract(properties=[properties]).execute() # await document.extract(properties=[properties]).execute()
    return document.transformed_content

async def divide_sections_if_too_large(
    article_dict: Dict[str, str],
    max_section_length: int = 512,
    doc_type: str = "patent",
) -> Dict[str, str]:
    """This function takes an existing dictionary containing the plan and sections
    content (from above functions), checks if any section is too large (i.e., more
    than 512 tokens), divides such sections into smaller sections, generates a new
    title, and returns the updated dictionary
    """
    if doc_type not in ["patent", "wikipedia", "arxiv"]:
        raise ValueError(
            f"doc_type must be one of 'patent', 'wikipedia', or 'arxiv'. Got {doc_type}."
        )
    logger.info("Dividing sections if too large in plan and section content.")
    final_dict: Dict = {}
    start_dict = copy(article_dict)

    def is_reference_section(heading: str):
        """Returns True if heading is a reference section."""
        heading = heading.lower()
        result = (
            "reference" in heading
            or "further reading" in heading
            or "see also" in heading
        )
        return result

    for heading, content in start_dict.items():
        num_tokens = num_tokens_from_string(content)
        # Each section must contain something, otherwise the embedding models fail
        if num_tokens == 0:
            final_dict[heading] = " "
        # If the section is small enough, add it to the final dict
        elif num_tokens <= max_section_length:
            final_dict[heading] = content
        # If section is too big, split into smaller sections, extract title, and add to final dict
        else:
            # Split
            char_splitter = RecursiveCharacterTextSplitter(
                chunk_size=max_section_length,
                chunk_overlap=0,
                # ' ' separator means sometimes sentences will be cut in two to ensure
                # the chunk size is not exceeded
                separators=["\n\n", "\n", " "],
                length_function=num_tokens_from_string,
            )
            splits: List[str] = char_splitter.split_text(content)
            # Keep heading the same but add numbers to sections e.g. 'h2 Reference' -> 'h2 Reference 1'
            if doc_type == "wikipedia" and is_reference_section(heading):
                for i, split in enumerate(splits, start=1):
                    new_heading = f"{heading} {i}"
                    final_dict[new_heading] = split
                    logger.info(
                        f"Added '{new_heading}' split original heading '{heading}'"
                    )
            # Create new titles for each split
            for split in splits:
                # Headings are of the form h1, h2, h3 etc. we split it into more of the same level
                if doc_type == "wikipedia":
                    heading_level = int(heading[1])
                    title = await _extract_title(split)
                    new_heading = f"h{heading_level} {title}"
                # Heading levels aren't important for other doc_types
                else:
                    new_heading = await _extract_title(split)
                final_dict[new_heading] = split
                logger.info(f"Added '{new_heading}' split original heading '{heading}'")

    n_keys_start = len(start_dict.keys())
    n_keys_final = len(final_dict.keys())
    logger.info(
        f"\n\tFinished dividing sections if too large in plan and section content."
        f"\n\tStarted with {n_keys_start} sections and got {n_keys_final} final sections."
        f"\n\tThat's a {n_keys_final / n_keys_start:.2f}x increase in sections"
    )
    return final_dict

In [33]:
divided = await divide_sections_if_too_large(condom_dict)

[32m2025-01-02 11:09:16.767[0m | [1mINFO    [0m | [36m__main__[0m:[36mdivide_sections_if_too_large[0m:[36m33[0m - [1mDividing sections if too large in plan and section content.[0m
[32m2025-01-02 11:09:17.888[0m | [1mINFO    [0m | [36m__main__[0m:[36mdivide_sections_if_too_large[0m:[36m86[0m - [1mAdded 'Background of the Invention' split original heading 'descr'[0m
[32m2025-01-02 11:09:18.623[0m | [1mINFO    [0m | [36m__main__[0m:[36mdivide_sections_if_too_large[0m:[36m86[0m - [1mAdded 'Condoms and Cunnilingus Safety' split original heading 'descr'[0m
[32m2025-01-02 11:09:19.421[0m | [1mINFO    [0m | [36m__main__[0m:[36mdivide_sections_if_too_large[0m:[36m86[0m - [1mAdded 'Condom Design for Oral Sex' split original heading 'descr'[0m
[32m2025-01-02 11:09:20.172[0m | [1mINFO    [0m | [36m__main__[0m:[36mdivide_sections_if_too_large[0m:[36m86[0m - [1mAdded 'Condom for Oral-Genital Use' split original heading 'descr'[0m
[32m2025-

In [34]:
condom_dict.keys()

dict_keys(['title', 'descr', 'claim_1', 'pdfep'])

In [35]:
divided.keys()

dict_keys(['title', 'Background of the Invention', 'Condoms and Cunnilingus Safety', 'Condom Design for Oral Sex', 'Condom for Oral-Genital Use', 'Description of the Preferred Embodiments', 'Condom Design and Features', 'Condom Design for Cunnilingus', 'Condom Design for Oral Use', 'Condom Enhancements for Pleasure', 'EP 0600858 B1 Condom Claims', 'Condom Design Claims', 'pdfep'])

In [36]:
list(divided.values())[1]

'EP\t0600858\tB1\t2000-05-17\ten\tDESCR\t1\t<heading id="h0001"><u><b>Background of the Invention</b></u></heading><p id="p0001" num="0001">In recent years, the appearance of the HIV virus and the impending Acquired Immune Deficiency Syndrome epidemic have created an atmosphere of fear, caution and prudence in which sexually active adults must consider protecting themselves from infection by HIV virus as well as other venereal diseases. It is generally accepted as fact that the condom provides the best protection from venereal disease and HIV virus, aside from complete sexual abstinence. Unfortunately, such a positive barrier device is, at present, the only sure or certain way to prevent transmission of all STD\'s, many of which cause great personal suffering and, in the cases of AIDS and syphilis, death.</p><p id="p0002" num="0002">Condoms are generally designed to be applied to the male member, although recent adaptations have been introduced which permit their use by women in the fo

In [39]:
divided['EP 0600858 B1 Condom Claims']

'EP\t0600858\tB1\t2000-05-17\ten\tCLAIM\t1\t<claim id="c-en-01-0001" num="0001"><claim-text>A condom (31) suitable for oral-genital use, comprising an elongated tubular body (32) closed at one end (33) and open at the other end (34), said tubular body being of a flexible material impermeable to liquids; characterised in that the condom (31) is provided with first and second annular flanges (36, 37) positioned near the open end (34) of said tubular body (32) and spaced apart from each other; said annular flanges (36, 37) and tubular body being adapted to engage the lips of a user for oral-genital use and prevent skin-to-skin contact between the oral and genital surfaces.</claim-text></claim><claim id="c-en-01-0002" num="0002"><claim-text>A condom (31) according to claim 1, wherein said first and second flanges (36, 37) define therebetween an annular channel (38) adapted to receive the lips of the condom user to retain said condom (31) within the mouth.</claim-text></claim><claim id="c-e

In [40]:
re.findall('</heading>', ''.join(condom))

['</heading>', '</heading>', '</heading>']

In [41]:
condom_str = ''.join(condom)
for x in condom_str.split('</heading>'):
    print(x[-125:])

CONDOM FOR ORAL-GENITAL USE
EP	0600858	B1	2000-05-17	en	DESCR	1	<heading id="h0001"><u><b>Background of the Invention</b></u>
colour to enhance enjoyment during use.<!-- EPO <DP n="5"> --></p><heading id="h0002"><u>Brief Description of the Drawing</u>
tion for cunnilingus.</li></ul><!-- EPO <DP n="7"> --></p><heading id="h0003"><u>Description of the Preferred Embodiments</u>
P	0600858	B1	2000-05-17	en	PDFEP	1	https://data.epo.org/publication-server/pdf-document?cc=EP&pn=0600858&ki=B1&pd=2000-05-17



In [42]:
type(condom)

list

In [43]:
for k, v in divided.items():
    print(k, num_tokens_from_string(v))

title 30
Background of the Invention 511
Condoms and Cunnilingus Safety 512
Condom Design for Oral Sex 512
Condom for Oral-Genital Use 512
Description of the Preferred Embodiments 512
Condom Design and Features 512
Condom Design for Cunnilingus 513
Condom Design for Oral Use 511
Condom Enhancements for Pleasure 77
EP 0600858 B1 Condom Claims 512
Condom Design Claims 392
pdfep 56
