Import packages

In [1]:
import requests
from bs4 import BeautifulSoup
import os
#from langchain_chroma import Chroma
from langchain.vectorstores.chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib
import json
from langchain_core.documents import Document
from tqdm import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

Webscraping from Wikipedia and storing info in a folder

In [2]:
import requests
from bs4 import BeautifulSoup
import os

def fetch_wikipedia_html(title):
    # Fetch the HTML content of a Wikipedia page using the Wikipedia API
    URL = f"https://en.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "parse",
        "page": title,
        "format": "json",
        "prop": "text",
        "redirects": True
    }
    
    response = requests.get(URL, params=PARAMS)
    response.raise_for_status()  # Raise an exception for HTTP errors
    
    data = response.json()
    return data['parse']['text']['*']

def save_html_to_file(html_content, filename):
    with open(filename, "w", encoding="utf-8") as file:
        file.write(html_content)

def fetch_and_save_article(title, base_dir):
    # Fetch and save the main article
    print(f"Fetching article: {title}")
    html_content = fetch_wikipedia_html(title)
    
    # Use BeautifulSoup to prettify the HTML and find hyperlinks
    soup = BeautifulSoup(html_content, "html.parser")
    pretty_html = soup.prettify()
    
    main_filename = os.path.join(base_dir, title.replace(" ", "_") + ".html")
    save_html_to_file(pretty_html, main_filename)
    print(f"Saved {title} as {main_filename}")
    
    # Fetch and save each hyperlink
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/wiki/') and not ':' in href:
            linked_title = href.split('/wiki/')[1]
            linked_title = linked_title.replace('_', ' ')
            # Skip identifier links
            if any(identifier in linked_title.lower() for identifier in ["doi", "pmid", "isbn", "bibcode", "s2cid", "hdl", "arxiv", "pmc"]):
                continue
            try:
                linked_html = fetch_wikipedia_html(linked_title)
                linked_soup = BeautifulSoup(linked_html, "html.parser")
                linked_pretty_html = linked_soup.prettify()
                
                linked_filename = os.path.join(base_dir, linked_title.replace(" ", "_") + ".html")
                save_html_to_file(linked_pretty_html, linked_filename)
                print(f"Saved linked article {linked_title} as {linked_filename}")
            except Exception as e:
                print(f"Failed to fetch/save linked article {linked_title}: {e}")

# Directory to save the articles
base_dir = "wikiArticles"
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Main article to fetch and save
main_article = "Temporal envelope and fine structure"
fetch_and_save_article(main_article, base_dir)

Fetching article: Temporal envelope and fine structure
Saved Temporal envelope and fine structure as wikiArticles/Temporal_envelope_and_fine_structure.html
Saved linked article Amplitude as wikiArticles/Amplitude.html
Saved linked article Audio frequency as wikiArticles/Audio_frequency.html
Saved linked article Loudness as wikiArticles/Loudness.html
Saved linked article Pitch (music) as wikiArticles/Pitch_(music).html
Saved linked article Timbre as wikiArticles/Timbre.html
Saved linked article Spatial hearing as wikiArticles/Spatial_hearing.html
Saved linked article Auditory system as wikiArticles/Auditory_system.html
Saved linked article Loudness as wikiArticles/Loudness.html
Saved linked article Pitch (music) as wikiArticles/Pitch_(music).html
Saved linked article Timbre as wikiArticles/Timbre.html
Saved linked article Auditory scene analysis as wikiArticles/Auditory_scene_analysis.html
Saved linked article Sound localization as wikiArticles/Sound_localization.html
Saved linked artic

Storing to Vector Database 

In [2]:
# Define a custom loader function that returns a Document object
def custom_bshtml_loader(file_path: str) -> Document:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'lxml')
    text = soup.get_text()
    return Document(page_content=text, metadata={"source": file_path})

try:
    # Initialize the embeddings model
    embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    print("Embeddings model initialized:", embeddings_model)

    # Initialize the vector store with the embedding function
    print("Initializing Chroma vector store...")
    docs_vectorstore = Chroma(
        collection_name="docs_store",
        persist_directory="docs-db",
        embedding_function=embeddings_model
    )
    print("Vector store initialized:", docs_vectorstore)

    # Load the HTML documents using the custom loader function
    file_paths = [os.path.join("wikiArticles", fname) for fname in os.listdir("wikiArticles") if fname.endswith('.html')]
    docs = [custom_bshtml_loader(fp) for fp in tqdm(file_paths, desc="Loading HTML files")]

    # Debugging: Print the number of documents loaded
    print(f"Loaded {len(docs)} documents")

    # Split the documents into smaller chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    splits = text_splitter.split_documents(docs)

    # Debugging: Print the number of splits created
    print(f"Created {len(splits)} splits")

    # Add documents to the vector store
    docs_vectorstore.add_texts([doc.page_content for doc in splits], metadatas=[doc.metadata for doc in splits])
    print("Documents added to the vector store.")

except ImportError as e:
    print(f"An import error occurred: {e}")
    if 'sentence_transformers' in str(e):
        print("Please install sentence-transformers with `pip install sentence-transformers`.")
    elif 'torch' in str(e):
        print("Please install torch with `pip install torch`.")
    else:
        print("Please check the error message and install the necessary packages.")
except AttributeError as e:
    print(f"An attribute error occurred: {e}")
    print("This might be due to a version incompatibility or bug in the library.")
except Exception as e:
    print(f"An error occurred: {e}")


  from tqdm.autonotebook import tqdm, trange


Embeddings model initialized: client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False
Initializing Chroma vector store...
Vector store initialized: <langchain_community.vectorstores.chroma.Chroma object at 0x13fdb85c0>


Loading HTML files: 100%|██████████| 53/53 [00:04<00:00, 10.62it/s]


Loaded 53 documents
Created 4127 splits
Documents added to the vector store.


Computing Embeddings

* This takes around 40 minutes ouch...

In [3]:
# Function to compute stable hash for each document and append index for uniqueness
def stable_hash(doc: Document, index: int) -> str:
    return hashlib.sha1(json.dumps(doc.metadata, sort_keys=True).encode()).hexdigest() + f"_{index}"

# Compute stable hashes with a progress bar
split_ids = [stable_hash(doc, i) for i, doc in tqdm(enumerate(splits), desc="Computing stable hashes")]

# Prepare texts and metadata for embedding
texts = [doc.page_content for doc in splits]
metadatas = [doc.metadata for doc in splits]

# Compute embeddings with a progress bar
embeddings = []
for i in tqdm(range(0, len(texts), 100), desc="Computing embeddings"):
    batch = texts[i:i + 100]
    embeddings.extend(embeddings_model.embed_documents(batch))

# Add documents to the vector store
docs_vectorstore.add_texts(texts, metadatas=metadatas, ids=split_ids, embeddings=embeddings)


Computing stable hashes: 4127it [00:00, 125239.79it/s]
Computing embeddings: 100%|██████████| 42/42 [08:07<00:00, 11.60s/it]


['2d966a76e311640932391c7b921ff65d40c52dcf_0',
 'a8bd07bcbd35bceeadc6edf02de3f6c182c766dc_1',
 '69b8a582b7ed4b689d1b4cbea72d956cca07a768_2',
 '197cd1737e7fdb69281706fdf1dad7b32d4d0fa8_3',
 'e6b4623d9f0aa1c7d66a9a4b476bcb3e429bb0f1_4',
 '47db583b33ddd7f5ad2b70bd36526cd0b9e5d156_5',
 '985dcf4149f5332729acc506dc5c29b0d096bb65_6',
 '8f13ceb1fb701a69dc931066d37e129b70d823dd_7',
 '7dec63deeb1905aadf5009f69351ec9a29c2e84b_8',
 '128f58a065de5a522d4f3766ad791685b6e9db54_9',
 '0db7663df1ad168efb017f16cadf4578b33e4c25_10',
 'cb2f1c6368a4557959dc86fbd63706128f7e0598_11',
 '38b8352e41d1452af86ad35011846dec3f03e10c_12',
 '29f38fc07b07fa6cbebd38716ee9f35b583ea649_13',
 'e0257356ca783494b178b4ecde59052b93fefd70_14',
 '29a67cec9d0c1b7853ada89af255725e5ca8a3d6_15',
 '09ff99a250fcabe7ed0f53a0abb68276e959fdea_16',
 '1fa361e9960f1a0568215f4af51d47923a0a6811_17',
 '98dcde22004aa2abbe8969869df45fbed995d6dc_18',
 'bc4be3e8620e61793f6999dd84f57665abb5b9d9_19',
 'e874a0645ed723c196261ce090b18fd3085b2fe9_20',
 '

In [16]:
docs_vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x13fdb85c0>

LLM 

In [21]:
from langchain_community.llms.ollama import Ollama

In [22]:
retriever = docs_vectorstore.as_retriever(search_kwargs={"k": 20})

In [23]:
# create a RAG prompt that includes the question and the source documents
from langchain_core.prompts import ChatPromptTemplate

template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: {question}
=========
{source_documents}
=========
FINAL ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

In [28]:
llm = Ollama(model="llama3.1")

In [29]:
llm_chain = prompt | llm

In [30]:
from typing import List

from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


def format_docs(docs: List[Document]) -> str:
    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {doc.metadata['source']}" for doc in docs
    )


rag_chain_from_docs = (
    RunnablePassthrough.assign(
        source_documents=(lambda x: format_docs(x["source_documents"]))
    )
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough(),
    }
).assign(answer=rag_chain_from_docs)


Ask a question

In [31]:
question = "What is temporal fine structure?"
response = rag_chain.invoke(question)
answer = response["answer"]
answer


"It appears you've provided a snippet of text from Wikipedia articles related to temporal theory in hearing. The final answer isn't explicitly stated in the provided text. However, based on the context, I can offer an interpretation.\n\nTemporal theory in hearing suggests that the consistent timing patterns of nerve firings code for a consistent pitch percept. This theory was first proposed by August Seebeck."

In [32]:
question = "Explain temporal fine structure to a child."
response = rag_chain.invoke(question)
answer = response["answer"]
answer


"It appears that you've provided a series of text blocks related to the temporal theory in hearing. However, there is no specific question asked at the end.\n\nIf I were to extract a final answer from this information, it would be:\n\nThe temporal fine structure (TFS) plays a crucial role in the representation of frequency components of complex sounds, and its deterioration as it passes through successive stages of the auditory pathway remains a problem."

Visualization 

In [33]:
import pandas as pd

response = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df = pd.DataFrame(
    {
        "id": response["ids"],
        "source": [metadata.get("source") for metadata in response["metadatas"]],
        "page": [metadata.get("page", -1) for metadata in response["metadatas"]],
        "document": response["documents"],
        "embedding": response["embeddings"],
    }
)
df["contains_answer"] = df["document"].apply(lambda x: "Eichler" in x)
df["contains_answer"].to_numpy().nonzero()

(array([], dtype=int64),)

In [34]:
# add the question and answer with their embeddings to the dataframe
question_row = pd.DataFrame(
    {
        "id": ["question"],
        "question": [question],
        "embedding": [embeddings_model.embed_query(question)],
    }
)
answer_row = pd.DataFrame(
    {
        "id": ["answer"],
        "answer": [answer],
        "embedding": [embeddings_model.embed_query(answer)],
    }
)
df = pd.concat([question_row, answer_row, df])
df


Unnamed: 0,id,question,embedding,answer,source,page,document,contains_answer
0,question,Explain temporal fine structure to a child.,"[0.001478569582104683, 0.014106594026088715, 0...",,,,,
0,answer,,"[0.007883812300860882, -0.07473494857549667, 0...",It appears that you've provided a series of te...,,,,
0,00177fab0977228844fe42c47319bcf1da0e7ab6_3860,,"[0.029091322794556618, -0.009398755617439747, ...",,wikiArticles/Hearing_aid.html,-1.0,Iceland\n \n\n\n [\n \n\n\n edit\n ...,False
1,001b682a-900c-4a71-b21f-c5113d308e7a,,"[-0.047521285712718964, -0.14120860397815704, ...",,wikiArticles/Auditory_system.html,-1.0,pons\n \n (superior cerebellar peduncle:\n ...,False
2,00295ab4-e71a-43ec-8da8-6a7707405cde,,"[0.011854511685669422, -0.07800266146659851, -...",,wikiArticles/Cochlear_nucleus.html,-1.0,Octopus cells\n \n are found in a small re...,False
...,...,...,...,...,...,...,...,...
8249,ffe71e34-2161-41ed-a09a-01f821d6bc09,,"[0.039672791957855225, -0.06971942633390427, 0...",,wikiArticles/Temporal_envelope_and_fine_struct...,-1.0,978-3-319-63449-4\n \n\n .\n \n...,False
8250,fff30a1bccb8dcb48d5a1747ea90a2df205059a1_3966,,"[0.023147236555814743, 0.02283116802573204, -0...",,wikiArticles/Cochlea.html,-1.0,History\n \n\n\n [\n \n\n\n edit\n ...,False
8251,fff4538d326ec920a06ebbd61ba540cb3b508734_2906,,"[0.01867934688925743, 0.043569713830947876, -0...",,wikiArticles/Amplitude_modulation.html,-1.0,978-0-87259-096-0\n \n\n .\n ...,False
8252,fff8673d3aa5c56ca7fc637e3cb1f26089a35e97_1816,,"[0.0651249885559082, -0.05731048807501793, -0....",,wikiArticles/Temporal_envelope_and_fine_struct...,-1.0,[178]\n \n\n discrimination of the fundamen...,False


In [35]:
# calculate the distance (L2 norm) between the question and the document embeddings
import numpy as np

question_embedding = embeddings_model.embed_query(question)
df["dist"] = df.apply(
    lambda row: np.linalg.norm(np.array(row["embedding"]) - question_embedding),
    axis=1,
)


In [36]:
df

Unnamed: 0,id,question,embedding,answer,source,page,document,contains_answer,dist
0,question,Explain temporal fine structure to a child.,"[0.001478569582104683, 0.014106594026088715, 0...",,,,,,0.000000
0,answer,,"[0.007883812300860882, -0.07473494857549667, 0...",It appears that you've provided a series of te...,,,,,0.932286
0,00177fab0977228844fe42c47319bcf1da0e7ab6_3860,,"[0.029091322794556618, -0.009398755617439747, ...",,wikiArticles/Hearing_aid.html,-1.0,Iceland\n \n\n\n [\n \n\n\n edit\n ...,False,1.494712
1,001b682a-900c-4a71-b21f-c5113d308e7a,,"[-0.047521285712718964, -0.14120860397815704, ...",,wikiArticles/Auditory_system.html,-1.0,pons\n \n (superior cerebellar peduncle:\n ...,False,1.170760
2,00295ab4-e71a-43ec-8da8-6a7707405cde,,"[0.011854511685669422, -0.07800266146659851, -...",,wikiArticles/Cochlear_nucleus.html,-1.0,Octopus cells\n \n are found in a small re...,False,1.228846
...,...,...,...,...,...,...,...,...,...
8249,ffe71e34-2161-41ed-a09a-01f821d6bc09,,"[0.039672791957855225, -0.06971942633390427, 0...",,wikiArticles/Temporal_envelope_and_fine_struct...,-1.0,978-3-319-63449-4\n \n\n .\n \n...,False,1.315219
8250,fff30a1bccb8dcb48d5a1747ea90a2df205059a1_3966,,"[0.023147236555814743, 0.02283116802573204, -0...",,wikiArticles/Cochlea.html,-1.0,History\n \n\n\n [\n \n\n\n edit\n ...,False,1.336406
8251,fff4538d326ec920a06ebbd61ba540cb3b508734_2906,,"[0.01867934688925743, 0.043569713830947876, -0...",,wikiArticles/Amplitude_modulation.html,-1.0,978-0-87259-096-0\n \n\n .\n ...,False,1.449333
8252,fff8673d3aa5c56ca7fc637e3cb1f26089a35e97_1816,,"[0.0651249885559082, -0.05731048807501793, -0....",,wikiArticles/Temporal_envelope_and_fine_struct...,-1.0,[178]\n \n\n discrimination of the fundamen...,False,1.298662
