### LLM experiments

In [None]:
# !pip install torch
# !pip install transformers

In [None]:
# !pip install langchain langchain_community langchain_core sentence-transformers faiss-cpu grandalf tiktoken optimum auto-gptq tiktoken langchainhub

In [37]:
from langchain_community.llms import Ollama

llm = Ollama(model='llama3')

In [38]:
output = llm('tell me a joke')

In [None]:
print(output)

Here's one:

Why couldn't the bicycle stand up by itself?

(wait for it...)

Because it was two-tired!

Hope that made you laugh!


In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama

prompt = PromptTemplate(
    template="""
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are a helpful AI assistant for travel tips and recommendations<|eot_id|>

        <|start_header_id|>user<|end_header_id|>
        Question: {question}
        Context: {context} <|eot_id|>

        <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "context"],
)

llm = ChatOllama(model='llama3', temperature=0)

In [None]:
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent, load_tools

llm = ChatOllama(model='llama3', temperature=0)
tools = load_tools(
    ["arxiv"],
)
prompt = hub.pull("hwchase17/react")

agent = create_react_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [None]:
agent_executor.invoke(
    {
        "input": "What's the paper 1605.08386 about?",
    }
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mLet's get started!

Thought: The question is asking about a specific paper, so I should use the arxiv tool to search for it.

Action: arxiv
Action Input: 1605.08386[0m[36;1m[1;3mPublished: 2016-05-26
Title: Heat-bath random walks with Markov bases
Authors: Caprice Stanley, Tobias Windisch
Summary: Graphs on lattice points are studied whose edges come from a finite set of
allowed moves of arbitrary length. We show that the diameter of these graphs on
fibers of a fixed integer matrix can be bounded from above by a constant. We
then study the mixing behaviour of heat-bath random walks on these graphs. We
also state explicit conditions on the set of moves so that the heat-bath random
walk, a generalization of the Glauber dynamics, is an expander in fixed
dimension.[0m[32;1m[1;3mThought: Okay, I've got the observation from using the arxiv tool! It looks like the paper "Heat-bath random walks with Markov bases" by Caprice Sta

{'input': "What's the paper 1605.08386 about?",
 'output': 'The paper "Heat-bath random walks with Markov bases" by Caprice Stanley and Tobias Windisch, published on May 26th, 2016, is about studying graphs formed from allowed moves on lattice points and exploring heat-bath random walks on them.'}

In [None]:
import os

os.environ["GOOGLE_CSE_ID"] = ""
os.environ["GOOGLE_API_KEY"] = ""

In [None]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool

search = GoogleSearchAPIWrapper()

tool = Tool(
    name="google_search",
    description="Search Google for recent results.",
    func=search.run,
)

In [None]:
# %pip install BeautifulSoup4
# %pip install tiktoken
# %pip install pypdf
# %pip install langchain_nomic
# %pip install chromadb

In [None]:
# !pip install -U sentence-transformers==2.2.2 >> devnull

### RAG

In [None]:
from datasets import load_dataset
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

from tqdm import tqdm
import numpy as np

In [None]:
emb_model_name = "sentence-transformers/all-mpnet-base-v2"
emb_model = HuggingFaceEmbeddings(
    model_name=emb_model_name,
    encode_kwargs={'normalize_embeddings': False}
)

In [None]:
SIZE=1_000

data = load_dataset('bigbio/medal', split=f"train[:{SIZE}]")

base = [
    Document(page_content=doc["text"], metadata={"label": doc["label"][0]}) for doc in tqdm(data) #.select(np.arange(1000000))
]

# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
chunk_size = 1000
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(emb_model_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

docs_processed = []
for doc in tqdm(base):
    docs_processed += text_splitter.split_documents([doc])

persist_directory = "DB"
vectordb = Chroma(persist_directory=persist_directory, embedding_function=emb_model)
for i, doc in enumerate(tqdm(docs_processed)):
    vectordb.add_documents([doc], ids=[f'{i}'])

vectordb.persist()
retriever = vectordb.as_retriever()

100%|██████████| 1000/1000 [00:00<00:00, 30155.11it/s]
100%|██████████| 1000/1000 [00:07<00:00, 132.97it/s]
100%|██████████| 1491/1491 [02:28<00:00, 10.02it/s]
  warn_deprecated(


In [None]:
# docs_processed

[Document(metadata={'label': 'transverse aortic constriction', 'start_index': 0}, page_content='velvet antlers vas are commonly used in traditional chinese medicine and invigorant and contain many PET components for health promotion the velvet antler peptide svap is one of active components in vas based on structural study the svap interacts with tgfÎ² receptors and disrupts the tgfÎ² pathway we hypothesized that svap prevents cardiac fibrosis from pressure overload by blocking tgfÎ² signaling SDRs underwent TAC tac or a sham operation T3 one month rats received either svap mgkgday or vehicle for an additional one month tac surgery induced significant cardiac dysfunction FB activation and fibrosis these effects were improved by treatment with svap in the heart tissue tac remarkably increased the expression of tgfÎ² and connective tissue growth factor ctgf ROS species C2 and the phosphorylation C2 of smad and ERK kinases erk svap inhibited the increases in reactive oxygen species C2 ctg

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
doc_splits = text_splitter.split_documents(docs_list)

vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding= emb_model,
)

retriever = vectorstore.as_retriever()

In [None]:
llm = ChatOllama(model='llama3', temperature=0)

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama

prompt = PromptTemplate(
    template="""
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are a helpful AI assistant for travel tips and recommendations<|eot_id|>

        <|start_header_id|>user<|end_header_id|>
        Question: {question}
        Context: {context} <|eot_id|>

        <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question", "context"],
)

In [None]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
chain.invoke('agent memory')

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama

prompt_summarize = PromptTemplate(
    template="""
        <|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are a helpful AI assistant which should summurize and highlight main information of the given Text.<|eot_id|>

        <|start_header_id|>user<|end_header_id|>
        Text: {question} <|eot_id|>

        <|start_header_id|>assistant<|end_header_id|>
    """,
    input_variables=["question"],
)

In [None]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

chain = (
    {"question": RunnablePassthrough()}
    | prompt_summarize
    | llm
    | StrOutputParser()
)

In [None]:
text = """The procedure (using counting rods) for solving simultaneous linear equations now called Gaussian elimination appears in the ancient Chinese mathematical text Chapter Eight: Rectangular Arrays of The Nine Chapters on the Mathematical Art. Its use is illustrated in eighteen problems, with two to five equations.[4]

Systems of linear equations arose in Europe with the introduction in 1637 by René Descartes of coordinates in geometry. In fact, in this new geometry, now called Cartesian geometry, lines and planes are represented by linear equations, and computing their intersections amounts to solving systems of linear equations.

The first systematic methods for solving linear systems used determinants and were first considered by Leibniz in 1693. In 1750, Gabriel Cramer used them for giving explicit solutions of linear systems, now called Cramer's rule. Later, Gauss further described the method of elimination, which was initially listed as an advancement in geodesy.[5]

In 1844 Hermann Grassmann published his "Theory of Extension" which included foundational new topics of what is today called linear algebra. In 1848, James Joseph Sylvester introduced the term matrix, which is Latin for womb.

Linear algebra grew with ideas noted in the complex plane. For instance, two numbers w and z in
𝐶
{\displaystyle \mathbb {C} } have a difference w – z, and the line segments wz and 0(w − z) are of the same length and direction. The segments are equipollent. The four-dimensional system
𝐻
{\displaystyle \mathbb {H} } of quaternions was discovered by W.R. Hamilton in 1843.[6] The term vector was introduced as v = xi + yj + zk representing a point in space. The quaternion difference p – q also produces a segment equipollent to pq. Other hypercomplex number systems also used the idea of a linear space with a basis.

Arthur Cayley introduced matrix multiplication and the inverse matrix in 1856, making possible the general linear group. The mechanism of group representation became available for describing complex and hypercomplex numbers. Crucially, Cayley used a single letter to denote a matrix, thus treating a matrix as an aggregate object. He also realized the connection between matrices and determinants, and wrote "There would be many things to say about this theory of matrices which should, it seems to me, precede the theory of determinants".[5]

Benjamin Peirce published his Linear Associative Algebra (1872), and his son Charles Sanders Peirce extended the work later.[7]

The telegraph required an explanatory system, and the 1873 publication of A Treatise on Electricity and Magnetism instituted a field theory of forces and required differential geometry for expression. Linear algebra is flat differential geometry and serves in tangent spaces to manifolds. Electromagnetic symmetries of spacetime are expressed by the Lorentz transformations, and much of the history of linear algebra is the history of Lorentz transformations.

The first modern and more precise definition of a vector space was introduced by Peano in 1888;[5] by 1900, a theory of linear transformations of finite-dimensional vector spaces had emerged. Linear algebra took its modern form in the first half of the twentieth century, when many ideas and methods of previous centuries were generalized as abstract algebra. The development of computers led to increased research in efficient algorithms for Gaussian elimination and matrix decompositions, and linear algebra became an essential tool for modelling and simulations.[5]"""

res = chain.invoke(text)

In [None]:
print(res)

Here's a summary of the text:

**Early History**

* Ancient Chinese mathematical text "The Nine Chapters on the Mathematical Art" (circa 1000 BCE) contains procedures for solving simultaneous linear equations using counting rods.
* In Europe, systems of linear equations emerged with René Descartes' introduction of coordinates in geometry (1637).
* Leibniz and Gabriel Cramer developed methods for solving linear systems using determinants (1693-1750).

**Development of Linear Algebra**

* Hermann Grassmann published "Theory of Extension" (1844), introducing foundational concepts of linear algebra.
* James Joseph Sylvester coined the term "matrix" (1848).
* Arthur Cayley introduced matrix multiplication and inverse matrices (1856), making possible the general linear group.
* Benjamin Peirce published "Linear Associative Algebra" (1872), and his son Charles Sanders Peirce extended the work later.

**Modern Developments**

* The telegraph industry required a system for explaining electromag

### Retrivier

In [None]:
from datasets import load_dataset
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer

from tqdm import tqdm
import numpy as np

In [None]:
emb_model_name = "sentence-transformers/all-mpnet-base-v2"
emb_model = HuggingFaceEmbeddings(
    model_name=emb_model_name,
    encode_kwargs={'normalize_embeddings': False}
)

In [None]:
#using local persisted db
persist_directory = "DB"
vectordb = Chroma(persist_directory=persist_directory, embedding_function=emb_model)

In [None]:
#not using local persisted db
SIZE=1_000

data = load_dataset('bigbio/medal', split=f"train[:{SIZE}]")

base = [
    Document(page_content=doc["text"], metadata={"label": doc["label"][0]}) for doc in tqdm(data) #.select(np.arange(1000000))
]

# We use a hierarchical list of separators specifically tailored for splitting Markdown documents
# This list is taken from LangChain's MarkdownTextSplitter class
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]
chunk_size = 1000
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(emb_model_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

docs_processed = []
for doc in tqdm(base):
    docs_processed += text_splitter.split_documents([doc])

persist_directory = "DB"
vectordb = Chroma(persist_directory=persist_directory, embedding_function=emb_model)
for i, doc in enumerate(tqdm(docs_processed)):
    vectordb.add_documents([doc], ids=[f'{i}'])

vectordb.persist()
retriever = vectordb.as_retriever()

### Semantic Splitter

In [2]:
!pip install --quiet langchain_experimental langchain_openai

In [None]:
import numpy as np

In [3]:
with open("asr_result.txt") as f:
    state_of_the_union = f.read()

In [38]:
from langchain_experimental.text_splitter import SemanticChunker

emb_model = HuggingFaceEmbeddings()

text_splitter = SemanticChunker(emb_model)



In [58]:
text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='percentile')
docs = text_splitter.create_documents([state_of_the_union])

lens = [len(doc.page_content) for doc in docs]
len(lens), np.mean(lens), np.max(lens), np.min(lens), np.array(np.array(lens) > 250).sum() / len(lens)

(72, 1148.111111111111, 7548, 3, 0.6944444444444444)

In [40]:
text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='standard_deviation')
docs = text_splitter.create_documents([state_of_the_union])

lens = [len(doc.page_content) for doc in docs]
len(lens), np.mean(lens), np.max(lens), np.min(lens), np.array(np.array(lens) > 250).sum() / len(lens)

(17, 4865.823529411765, 13374, 645, 1.0)

In [41]:
text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='interquartile')
docs = text_splitter.create_documents([state_of_the_union])

lens = [len(doc.page_content) for doc in docs]
len(lens), np.mean(lens), np.max(lens), np.min(lens), np.array(np.array(lens) > 250).sum() / len(lens)

(103, 802.2621359223301, 7548, 3, 0.5825242718446602)

In [42]:
text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='gradient')
docs = text_splitter.create_documents([state_of_the_union])

lens = [len(doc.page_content) for doc in docs]
len(lens), np.mean(lens), np.max(lens), np.min(lens), np.array(np.array(lens) > 250).sum() / len(lens)

(72, 1148.111111111111, 7685, 3, 0.625)

In [59]:
# MAX_LEN = 4

# def process_docs(docs, text_splitter):
#     new_docs=[]
#     for i, doc in enumerate(docs):
#         uppend = None
#         if len(doc) > MAX_LEN:
#             splitted = [chunk.page_content for chunk in text_splitter.create_documents([doc])]
#             uppend = process_docs(splitted, text_splitter)
#         else:
#             uppend = [doc]
#         print(doc)
#         print(uppend)
#         for up in uppend:
#             if len(new_docs[-1]) + len(up) < MAX_LEN:
#                 new_docs[-1] = new_docs[-1] + up
#             else:
#                 new_docs.append(up)
        

In [60]:
text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='standard_deviation')
docs = text_splitter.create_documents([state_of_the_union])

lens = [len(doc.page_content) for doc in docs]
len(lens), np.mean(lens), np.max(lens), np.min(lens)

(17, 4865.823529411765, 13374, 645)

### Model

In [6]:
# !pip install langchain langchain_community langchain_core tiktoken langchainhub chromadb transformers langchain-huggingface >> devnull
# !pip install sentence-transformers==2.2.2 >> devnull
# !pip install langchain_huggingface
# !pip install --quiet langchain_experimental langchain_openai

In [61]:
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.docstore.document import Document

from transformers import AutoTokenizer
from transformers import pipeline

import torch
from tqdm import tqdm

In [95]:
# %pip install fpdf

In [155]:
from fpdf import FPDF

def text2pdf(text, output_name='output.pdf'):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.write(5, text)
    pdf.output(output_name)

In [137]:
PROMPTS = {'full_summary': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You get one part of lecture's transcript. Explain each mentioned topic in this part.
                All parts go sequentially each other, so don't repeat what you said previously.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>
                
                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        ),
        'summary': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You were given lecture's transcript.
                Explain each mentioned topic.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>
                
                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        ),
        'explain': PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                Answer given question using information from the context<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Question: {question}
                Context: {context} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )}

In [151]:
class AudioLLM():
    def __init__(self, vectordb, prompts):
        self._device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self._pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=self._device)
        self._llm = ChatOllama(model='llama3', temperature=0)

        self._prompt_summarize = prompts['summary']
        self._prompt_explain = prompts['explain']
        self._promts_full_summary = prompts['full_summary']

        self._emb_model_name = "sentence-transformers/all-mpnet-base-v2"
        self._vectordb = vectordb
        self._retriever = self._vectordb.as_retriever()
        self._chunk_size = 250

        self._text_splitter = SemanticChunker(emb_model, breakpoint_threshold_type='standard_deviation')


    def _add_to_db(self, summary):
        # We use a hierarchical list of separators specifically tailored for splitting Markdown documents
        # This list is taken from LangChain's MarkdownTextSplitter class
        MARKDOWN_SEPARATORS = ["\n#{1,6} ", "```\n", "\n\\*\\*\\*+\n", "\n---+\n", "\n___+\n", "\n\n", "\n", " ",""]
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                AutoTokenizer.from_pretrained(self._emb_model_name),
                chunk_size=self._chunk_size,
                chunk_overlap=self._chunk_size // 5,
                add_start_index=True,
                strip_whitespace=True,
                separators=MARKDOWN_SEPARATORS,
            )
        doc = Document(page_content=summary, metadata={"label": "lecture"})
        docs_processed = text_splitter.split_documents([doc])
        
        # persist_directory = "DB"
        for i, doc in enumerate(tqdm(docs_processed, desc="Adding the transcripted audio")):
            self._vectordb.add_documents([doc], ids=[f'{i}'])
        self._vectordb.persist()
        self._retriever = self._vectordb.as_retriever()


    def _ASR(self, audio):
        outputs = self._pipeline(audio, max_new_tokens=256)
        self._add_to_db(outputs["text"])
        return outputs["text"]
    

    def _semantic_split(self, text):
        return [chunk.page_content for chunk in self._text_splitter.create_documents(text)]


    def full_summary(self, input, is_text=True):
        splitted, text = None, input
        if not is_text:
            text = self._ASR(input)
        splitted = self._semantic_split(text)

        summary = ''
        for chunk in tqdm(splitted):
            chain_summary = (
                {"question": RunnablePassthrough()}
                | self._promts_full_summary
                | self._llm
                | StrOutputParser()
            )
            chunk_summary = chain_summary.invoke(chunk)
            summary += chunk_summary + '\n'
        return summary


    def summarize(self, input, is_text=True):
        text = input
        if not is_text:
            text = self._ASR(input)
        
        chain_summary = (
            {"question": RunnablePassthrough()}
            | self._prompt_summarize
            | self._llm
            | StrOutputParser()
        )

        summary = chain_summary.invoke(text)
        return summary


    def explain(self, text):
        def format_docs(docs):
            return "\n\n".join([d.page_content for d in docs])

        chain_explain = (
            {"context": self._retriever | format_docs, "question": RunnablePassthrough()}
            | self._prompt_explain
            | self._llm
            | StrOutputParser()
        )

        explained = chain_explain.invoke(text)
        return explained


In [141]:
emb_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2", encode_kwargs={'normalize_embeddings': False})
vectordb = Chroma(persist_directory="DB", embedding_function=emb_model)

In [152]:
audio_llm = AudioLLM(vectordb, PROMPTS)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [145]:
with open("asr_result.txt", "r") as file:
    asr_res = file.readlines()

In [146]:
summary = audio_llm.summarize(asr_res)

print(len(summary))
print(summary)

1486
The transcript appears to be from a lecture or presentation on the topic of healthcare, biotechnology, and DNA. The speaker discusses various points related to these topics, including:

1. The cost of healthcare in the United States: The speaker notes that the US is one of the richest countries but spends more on healthcare than any other nation without showing significant improvements in population health.
2. Quality of care: The speaker compares the US healthcare system to those of other countries, such as Costa Rica and Cuba, which have similar life expectancies despite lower per capita spending.
3. Drug costs: The speaker notes that drug costs are rising rapidly, but physicians earn money for treatment rather than prevention.
4. Hospitalization: The speaker suggests that hospitals succeed when patients fill their beds, which may not be the most effective way to deliver healthcare.

The speaker also touches on biotechnology and DNA, discussing topics such as:

1. Retroviruses: 

In [147]:
explanation = audio_llm.explain('why the drug costs are rising according to lecture?')
print(len(explanation))
print(explanation)

347
According to the lecture, drug companies have high administration fees because it costs $2,500 for each injection, and they make a lot of money every time a doctor prescribes the medication. The main reason is that the company charges a significant amount for administering the shot, which is not related to the actual cost of the medicine itself.


In [153]:
full_summary = audio_llm.full_summary(asr_res)

print(len(full_summary))
print(full_summary)

100%|██████████| 17/17 [11:32<00:00, 40.74s/it]

28875
This is a transcript of a lecture introduction by a professor, introducing Dr. Sherry Renn as the co-director of the course, and then introducing the speaker for tonight's session, Professor Gilchew.

Here are some key topics mentioned in this part:

1. Dr. Sherry Renn: She is introduced as the co-director of the course, a professor of surgery, an expert in oncologic surgery, and chief of surgery at the VA hospital affiliated with Stanford Medical School.
2. Her achievements: She has received numerous teaching awards, served as director of the medical senate, and is nationally recognized for her work in surgery as a governor of the American College of Surgeons.
3. Professor Gilchew: He is introduced as the speaker for tonight's session, having an interesting history in medicine. He started out as a physics student at Princeton University, then got his PhD in physics at MIT, and later became a doctor at Harvard Medical School.
4. His career: He trained in internal medicine at Mass




In [156]:
text2pdf(full_summary, 'output3.pdf')

In [157]:
with open('full_summary.txt', 'w') as file:
    file.write(full_summary)

### Prompt Experiments

In [63]:
prompt = PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                Explain in details given term using information from context<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Term: {question}
                Context: {context} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )

audio_llm.set_prompt(prompt, summary=False)

explanation = audio_llm.explain('why the drug costs are rising according to lecture?')
print(len(explanation))
print(explanation)

2608
The lecture discusses the rising cost of drugs and how it affects healthcare in the United States. The speaker mentions that a drug that costs $2.50 per shot can have a charge of $13,000 at Stanford Hospital. This highlights the significant disparity between the actual cost of the drug and the price paid by patients.

To understand why drug costs are rising, let's break down some key points from the lecture:

1. **Patent protection**: Pharmaceutical companies invest heavily in research and development to create new drugs. To recoup these investments, they rely on patent protection, which gives them a monopoly on the market for a certain period. This allows them to set high prices.
2. **Marketing and advertising**: Pharmaceutical companies spend significant amounts on marketing and advertising to promote their products. These costs are factored into the final price of the drug.
3. **Lobbying and influence**: The pharmaceutical industry has significant lobbying power, which can infl

In [67]:
prompt = PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                Take notes of given transcript.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        )
audio_llm.set_prompt(prompt)

summary = audio_llm.summarize()

print(len(summary))
print(summary)

1549
I've taken notes of the given transcript. Here's a summary:

The speaker discusses the US healthcare system, stating that it is one of the richest countries but spends twice as much per capita on healthcare compared to other countries. They show a graph comparing the US to other countries in terms of spending and life expectancy, noting that the US is tied with Costa Rica and Cuba in terms of life expectancy.

The speaker suggests that the high cost of healthcare in the US can be attributed to factors such as rising drug costs and the fact that physicians earn money for treatment rather than prevention. They also mention that hospitals succeed when patients fill their beds, which may contribute to the problem.

The speaker expresses mixed feelings about expanding hospitals, suggesting that the ideal system would have hospitals shrinking instead of growing. They also discuss the importance of prevention in healthcare and the need to match utilization to need.

The speaker then shif

In [68]:
prompt = PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                You were given lecture's transcript.
                Explain each mentioned topic.<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Transcript: {question} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question"],
        )
audio_llm.set_prompt(prompt)

summary = audio_llm.summarize()

print(len(summary))
print(summary)

1435
The transcript appears to be from a lecture or presentation on the topic of healthcare, biotechnology, and DNA. The speaker discusses various points related to these topics, including:

1. The cost of healthcare in the United States: The speaker notes that the US is one of the richest countries but spends more on healthcare than any other nation without showing significant improvements in population health.
2. Quality of care: The speaker compares the US healthcare system to those of other countries, such as Costa Rica and Cuba, which have similar life expectancies despite lower per capita spending.
3. Drug costs: The speaker notes that drug costs are rising rapidly, but physicians earn money for treatment rather than prevention.
4. Hospitalization: The speaker suggests that hospitals succeed when patients fill their beds, which may not be the most effective way to deliver healthcare.

The speaker also touches on biotechnology and DNA, discussing how retroviruses like HIV can inte

In [21]:
prompt = PromptTemplate(
            template="""
                <|begin_of_text|><|start_header_id|>system<|end_header_id|>
                Explain in details given term using information from context<|eot_id|>

                <|start_header_id|>user<|end_header_id|>
                Term: {question}
                Context: {context} <|eot_id|>

                <|start_header_id|>assistant<|end_header_id|>
            """,
            input_variables=["question", "context"],
        )
audio_llm.set_prompt(prompt, summary=False)

### Bot

In [None]:
# %pip install pyTelegramBotAPI

In [None]:
import os
import uuid
import telebot

bot = telebot.TeleBot('''')


@bot.message_handler(content_types=['voice'])
def voice_processing(message):
    filename = str(uuid.uuid4())
    dir_audio = './audio/'
    if not os.path.exists(dir_audio):
        os.makedirs(dir_audio)
    audio_ogg = dir_audio + filename + ".ogg"
    audio_mp3 = dir_audio + filename + ".mp3"

    file_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(file_info.file_path)
    with open(audio_ogg, 'wb') as new_file:
        new_file.write(downloaded_file)

    os.system("ffmpeg -i "+audio_ogg+"  "+audio_mp3)  # https://pypi.org/project/ftransc/

    summary = audio_llm.summarize(audio_mp3)

    bot.send_message(message.chat.id, summary)




@bot.message_handler()
def get_message(message):
    bot.send_message(message.chat.id, 'Processing request')
    explanation = audio_llm.explain(message.text)
    bot.delete_message(message.chat.id, message.message_id + 1)
    bot.send_message(message.chat.id, explanation)


bot.infinity_polling()

2024-07-04 00:15:20,353 (__init__.py:1101 MainThread) ERROR - TeleBot: "Infinity polling: polling exited"
2024-07-04 00:15:20,354 (__init__.py:1103 MainThread) ERROR - TeleBot: "Break infinity polling"


In [None]:
import os
import uuid
import telebot
from telebot import types

bot = telebot.TeleBot('')


@bot.message_handler(commands=['start'])
def start(message):
    markup = types.ReplyKeyboardMarkup()
    btn_full_summary = types.KeyboardButton('full summary')
    btn_




@bot.message_handler(content_types=['voice'])
def voice_processing(message):
    filename = str(uuid.uuid4())
    dir_audio = './audio/'
    if not os.path.exists(dir_audio):
        os.makedirs(dir_audio)
    audio_ogg = dir_audio + filename + ".ogg"
    audio_mp3 = dir_audio + filename + ".mp3"

    file_info = bot.get_file(message.voice.file_id)
    downloaded_file = bot.download_file(file_info.file_path)
    with open(audio_ogg, 'wb') as new_file:
        new_file.write(downloaded_file)

    os.system("ffmpeg -i "+audio_ogg+"  "+audio_mp3)  # https://pypi.org/project/ftransc/

    summary = audio_llm.summarize(audio_mp3)

    bot.send_message(message.chat.id, summary)




@bot.message_handler()
def get_message(message):
    bot.send_message(message.chat.id, 'Processing request')
    explanation = audio_llm.explain(message.text)
    bot.delete_message(message.chat.id, message.message_id + 1)
    bot.send_message(message.chat.id, explanation)


bot.infinity_polling()

In [1]:
text = """This is the transcript of a lecture, and it appears to be introducing the speakers and setting the stage for the evening's discussion. Here are some key topics mentioned:
1. Dr. Sherry Renn: The co-director of the course, a professor of surgery, expert in oncologic surgery, and chief of surgery at the VA hospital affiliated with Stanford Medical School.
2. Education: Dr. Renn is recognized for her work in education, having won various teaching awards and serving as director of the medical senate.
3. American College of Surgeons: Dr. Renn is a governor of this organization, which suggests she has a high level of expertise and recognition in the field of surgery.
4. Associate Dean in Academic Affairs: This indicates that Dr. Renn holds a leadership position within the academic institution.
5. Jill Helms: The speaker for next week's session, who will be discussing stem cell biology and regenerative medicine.
6. Molecular basis of life: The topic to be discussed tonight, which has undergone significant changes over the past 40-50 years with fundamental work taking place at Stanford University.
7. Arthur Cornberg: A Nobel Prize winner whose seminal investigations in DNA and transcription began when the school was founded in 1959.
8. Roger Cornberg: Arthur's son, who also won a Nobel Prize to continue his father's work on DNA and transcription.
9. Physics: The background of Professor Gilchew, who started out as an extraordinary student at Princeton University, then got his PhD in physics from MIT, before switching to medicine.
10. Harvard Medical School: Where Professor Gilchew completed his MD degree after initially studying physics.
11. Mass General Hospital: A distinguished hospital where Professor Gilchew trained in internal medicine.
12. Oncology: The area of medicine that Professor Gilchew chose to specialize in after completing his fellowship at Stanford.
13. DNA repair: The specific area of research that Professor Gilchew is working on, focusing on damage related to ionizing radiation and ultraviolet light.
These topics set the stage for a discussion about the molecular basis of life, with a focus on DNA repair and its implications for medicine."""
len(text)

2185

In [3]:
text = """"Here is a brief outline of the main ideas:
I. Introduction
* Dr. Sherry Renn: co-director of the course, professor of surgery, and chief of surgery at Stanford Medical School
* American College of Surgeons: Dr. Renn serves as governor, indicating her expertise in surgery
II. Academic Affairs
* Dr. Renn appointed associate dean in academic affairs at Stanford Medical School
III. Upcoming Lecture
* Jill Helms to discuss stem cell biology and regenerative medicine next week
IV. Molecular Basis of Life
* Topic of tonight's lecture: molecular basis of life, with significant changes over the past 40-50 years * Groundbreaking work done at Stanford University
V. DNA and Transcription
* Arthur Cornberg: Nobel Prize winner for his work on DNA transcription
* Son Roger Cornberg also won a Nobel Prize in the same field
* Fundamental work by the Cornbergs contributed to our understanding of molecular biology
VI. Professor Gilchew's Background
* Started as an extraordinary student in physics at Princeton University
* Earned PhD in physics from MIT, then switched to medicine and completed MD degree at Harvard Medical School
* Trained in internal medicine at Massachusetts General Hospital and specialized in oncology at Stanford
VII. DNA Repair
* Professor Gilchew's area of expertise: studying mechanisms of DNA damage caused by ionizing radiation and ultraviolet light
* Implications for disorders related to pediatric oncology"""
len(text)

1433