In [1]:
import os
from typing import List

def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """
    Chunk a large text into smaller pieces of a specified size with overlap.

    Args:
        text (str): The input text to be chunked.
        chunk_size (int): The size of each chunk.
        overlap (int): The number of overlapping tokens between chunks.

    Returns:
        List[str]: A list of text chunks.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

def read_file(file_path: str) -> str:
    """
    Read the content of a file.

    Args:
        file_path (str): Path to the file.

    Returns:
        str: The content of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_chunks(chunks: List[str], output_dir: str, base_filename: str) -> None:
    """
    Save chunks to text files.

    Args:
        chunks (List[str]): The list of text chunks.
        output_dir (str): Directory to save the chunk files.
        base_filename (str): Base name for the chunk files.
    """
    os.makedirs(output_dir, exist_ok=True)
    for idx, chunk in enumerate(chunks):
        chunk_filename = f"{base_filename}_chunk_{idx+1}.txt"
        chunk_path = os.path.join(output_dir, chunk_filename)
        with open(chunk_path, 'w', encoding='utf-8') as chunk_file:
            chunk_file.write(chunk)



In [2]:
# Example usage
file_path = 'your_large_document.txt'
output_dir = 'chunked_files'
chunk_size = 512
overlap = 50

# Read the large file
text = read_file(file_path)

# Chunk the text
chunks = chunk_text(text, chunk_size, overlap)



FileNotFoundError: [Errno 2] No such file or directory: 'your_large_document.txt'

In [3]:
import xml.etree.ElementTree as ET

def parse_and_clean_xml(file_path: str) -> str:
    """
    Parse an XML file, extract text content, and clean it by removing excess newlines.

    Args:
        file_path (str): Path to the XML file.

    Returns:
        str: Cleaned text content extracted from the XML file.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extract and clean text
    text_content = []
    for elem in root.iter():
        if elem.text:
            # Strip leading/trailing whitespace and replace multiple newlines with a single one
            cleaned_text = elem.text.strip().replace("\n", " ").replace("\r", " ")
            text_content.append(cleaned_text)

    # Join all cleaned text into a single string, ensuring consistent spacing
    return " ".join(text_content)

In [4]:
a = parse_and_clean_xml(r'C:\Users\Work\pico-scholar\pmc_data\PMC1440874\pgen.0020058.nxml')
a

"PLoS Genet PLoS Genet pgen plge plosgen PLoS Genetics 1553-7390 1553-7404 Public Library of Science San Francisco, USA 16628246 1440874 10.1371/journal.pgen.0020058 05-PLGE-RA-0263R3 plge-02-04-14 Research Article Cell Biology Development Hematology Genetics/Gene Function Genetics/Disease Models Genetics/Epigenetics Eukaryotes Animals Vertebrates Mammals Mus (Mouse) Loss of Atrx Affects Trophoblast Development and the Pattern of X-Inactivation in Extraembryonic Tissues Trophoblast Defect in Mice Lacking Atrx Garrick David 1 Sharpe Jackie A 1 Arkell Ruth 2 Dobbie Lorraine 3 Smith Andrew J. H 3 Wood William G 1 Higgs Douglas R 1 Gibbons Richard J 1 * 1 2 3 Reik Wolf Editor The Babraham Institute, United Kingdom * To whom correspondence should be addressed. E-mail: richard.gibbons@molecular-medicine.oxford.ac.uk 4 2006 21 4 2006 2 4 e58 2 9 2005 3 3 2006 © 2006 Garrick et al. 2006 This is an open-access article distributed under the terms of the Creative Commons Attribution License, whic

In [5]:
import os

def get_first_file_by_type(directory: str, file_extension: str) -> str:
    """
    Get the first file with a specific extension in a directory.

    Args:
        directory (str): The directory to search in.
        file_extension (str): The file extension to filter by (e.g., '.xml').

    Returns:
        str: The path to the first matching file, or None if no match is found.
    """
    for file in os.listdir(directory):
        if file.endswith(file_extension):
            return os.path.join(directory, file)
    return None

# Example usage
directory_path = '../pmc_data/PMC1440874'
file_type = '.nxml'
first_xml_file = get_first_file_by_type(directory_path, file_type)

In [6]:
first_xml_file

'../pmc_data/PMC1440874\\pgen.0020058.nxml'

In [7]:
def get_chunks_from_file(directory):
    file_path = get_first_file_by_type(directory,'.nxml')
    parsed_file = parse_and_clean_xml(file_path)
    chunks = chunk_text(parsed_file)
    return chunks

In [8]:
get_chunks_from_file(directory_path)

['PLoS Genet PLoS Genet pgen plge plosgen PLoS Genetics 1553-7390 1553-7404 Public Library of Science San Francisco, USA 16628246 1440874 10.1371/journal.pgen.0020058 05-PLGE-RA-0263R3 plge-02-04-14 Research Article Cell Biology Development Hematology Genetics/Gene Function Genetics/Disease Models Genetics/Epigenetics Eukaryotes Animals Vertebrates Mammals Mus (Mouse) Loss of Atrx Affects Trophoblast Development and the Pattern of X-Inactivation in Extraembryonic Tissues Trophoblast Defect in Mice Lacking Atrx Garrick David 1 Sharpe Jackie A 1 Arkell Ruth 2 Dobbie Lorraine 3 Smith Andrew J. H 3 Wood William G 1 Higgs Douglas R 1 Gibbons Richard J 1 * 1 2 3 Reik Wolf Editor The Babraham Institute, United Kingdom * To whom correspondence should be addressed. E-mail: richard.gibbons@molecular-medicine.oxford.ac.uk 4 2006 21 4 2006 2 4 e58 2 9 2005 3 3 2006 © 2006 Garrick et al. 2006 This is an open-access article distributed under the terms of the Creative Commons Attribution License, whi

In [9]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embedding_model = HuggingFaceEmbedding(model_name="allenai/scibert_scivocab_uncased")

  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name allenai/scibert_scivocab_uncased. Creating a new one with mean pooling.


In [10]:
from llama_index.core import Document

In [11]:
test = Document(text = parse_and_clean_xml(r'C:\Users\Work\pico-scholar\pmc_data\PMC1440874\pgen.0020058.nxml'))

In [12]:
import pandas as pd
def get_document_from_file(directory):
    file_path = get_first_file_by_type(directory,'.nxml')
    parsed_file = parse_and_clean_xml(file_path)
    pmcid = os.path.basename(directory)
    df = pd.read_csv('..\pubmed_sample\pubmed24n0541_updated.csv')
    pmid = df[df['PMCID'] == pmcid]['PMID'].values[0]
    doc = Document(text = str(parsed_file), metadata= {'PMCID': str(pmcid),'PMID':str(pmid)})
    return doc

In [13]:
base_path = '..\\pmc_data\\'
paths = os.listdir(base_path)
paths = [base_path + path for path in paths]
docs = [get_document_from_file(path) for path in paths]

In [14]:
docs[0]

Document(id_='2be91544-62fc-4ba8-8e67-3bc4035c1332', embedding=None, metadata={'PMCID': 'PMC1440874', 'PMID': '16628246'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="PLoS Genet PLoS Genet pgen plge plosgen PLoS Genetics 1553-7390 1553-7404 Public Library of Science San Francisco, USA 16628246 1440874 10.1371/journal.pgen.0020058 05-PLGE-RA-0263R3 plge-02-04-14 Research Article Cell Biology Development Hematology Genetics/Gene Function Genetics/Disease Models Genetics/Epigenetics Eukaryotes Animals Vertebrates Mammals Mus (Mouse) Loss of Atrx Affects Trophoblast Development and the Pattern of X-Inactivation in Extraembryonic Tissues Trophoblast Defect in Mice Lacking Atrx Garrick David 1 Sharpe Jackie A 1 Arkell Ruth 2 Dobbie Lorraine 3 Smith Andrew J. H 3 Wood William G 1 Higgs Douglas R 1 Gibbons Richard J 1 * 1 2 3 Reik Wolf Editor The Babraham Institute, United Kingdom * To whom correspondence should be addressed. E-mail: richard.gibbons@

In [15]:
os.path.basename('../pmc_data/PMC1440874')
import pandas as pd
df = pd.read_csv('..\pubmed_sample\pubmed24n0541_updated.csv')

In [16]:
df[df['PMCID'] == 'PMC1440874']['PMID'].values[0]

16628246

In [17]:
os.path.basename('../pmc_data/PMC1440874')

'PMC1440874'

In [18]:
docs

[Document(id_='2be91544-62fc-4ba8-8e67-3bc4035c1332', embedding=None, metadata={'PMCID': 'PMC1440874', 'PMID': '16628246'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="PLoS Genet PLoS Genet pgen plge plosgen PLoS Genetics 1553-7390 1553-7404 Public Library of Science San Francisco, USA 16628246 1440874 10.1371/journal.pgen.0020058 05-PLGE-RA-0263R3 plge-02-04-14 Research Article Cell Biology Development Hematology Genetics/Gene Function Genetics/Disease Models Genetics/Epigenetics Eukaryotes Animals Vertebrates Mammals Mus (Mouse) Loss of Atrx Affects Trophoblast Development and the Pattern of X-Inactivation in Extraembryonic Tissues Trophoblast Defect in Mice Lacking Atrx Garrick David 1 Sharpe Jackie A 1 Arkell Ruth 2 Dobbie Lorraine 3 Smith Andrew J. H 3 Wood William G 1 Higgs Douglas R 1 Gibbons Richard J 1 * 1 2 3 Reik Wolf Editor The Babraham Institute, United Kingdom * To whom correspondence should be addressed. E-mail: richard.gibbons

In [27]:
import os
import csv
from sqlalchemy import URL
from llama_index.core import StorageContext, VectorStoreIndex, Document
from llama_index.core.storage.index_store import SimpleIndexStore
from llama_index.vector_stores.tidbvector import TiDBVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import textwrap

tidb_connection_url = URL(
    "mysql+pymysql",
    username=os.environ['TIDB_USERNAME'],
    password=os.environ['TIDB_PASSWORD'],
    host=os.environ['TIDB_HOST'],
    port=4000,
    database="test",
    query={"ssl_verify_cert": True, "ssl_verify_identity": True},
)


# Define the TiDB Vector Store
USER_NAME = os.environ['DEVELOPER_NAME']
VECTOR_TABLE_NAME = "full_docs_v4"
tidbvec = TiDBVectorStore(
    connection_string=tidb_connection_url,
    table_name= '_'.join([USER_NAME, VECTOR_TABLE_NAME]),
    distance_strategy="cosine",
    vector_dimension=768, # SciBERT outputs 768-dimensional vectors
    drop_existing_table=False,
)

In [28]:
storage_context = StorageContext.from_defaults(vector_store=tidbvec)
# TiDB automatically persists the embeddings when you use it as your vector store.
for chunk in range(3,len(docs),3):
    index = VectorStoreIndex.from_documents(
        docs[chunk-3:chunk], 
        storage_context=storage_context, 
        embed_model=embedding_model,
        show_progress=True
    )

Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 48.38it/s]
Generating embeddings: 100%|██████████| 28/28 [00:34<00:00,  1.22s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 38.00it/s]
Generating embeddings: 100%|██████████| 49/49 [01:00<00:00,  1.23s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 45.74it/s]
Generating embeddings: 100%|██████████| 30/30 [00:36<00:00,  1.23s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 69.75it/s]
Generating embeddings: 100%|██████████| 25/25 [00:30<00:00,  1.23s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 88.22it/s]
Generating embeddings: 100%|██████████| 20/20 [00:24<00:00,  1.22s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 57.68it/s]
Generating embeddings: 100%|██████████| 28/28 [00:34<00:00,  1.22s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 749.79it/s]
Generating embeddings: 100%|██████████| 4/4 [00:04<00:00,  1.18s/it]
Parsing nodes: 100%|██████████| 3/3 [00:00<00:00, 90.89it/s]
Generating embed

KeyboardInterrupt: 