In [3]:
import re
import PyPDF2
import gensim
import nltk
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from langchain.chains import LLMChain
from langchain.prompts import ChatPromptTemplate
from langchain.llms import OpenAI

In [4]:
def preprocess(text, stop_words):
    """
    Turn raw text into a filtered list of tokens.

    The text is tokenized with gensim's ``simple_preprocess`` (called with
    ``deacc=True``). A token survives only when it is absent from
    ``stop_words`` and is longer than three characters.

    Args:
        text: The text to tokenize.
        stop_words: Container of tokens to discard (membership-tested with ``in``).

    Returns:
        list: The surviving tokens, in the order the tokenizer produced them.
    """
    candidate_tokens = simple_preprocess(text, deacc=True)
    return [
        tok
        for tok in candidate_tokens
        if tok not in stop_words and len(tok) > 3
    ]

In [5]:
def get_topic_lists_from_pdf(file, num_topics, words_per_topic, random_state=None):
    """
    Extracts topics and their associated words from a PDF document using the Latent Dirichlet Allocation (LDA) algorithm.

    Each page of the PDF is treated as one document of the LDA corpus. Page text
    is tokenized with ``preprocess`` using the NLTK English and Spanish stopword
    lists.

    Args:
        file: The PDF to read; passed unchanged to ``PyPDF2.PdfReader``.
        num_topics (int): Number of topics to fit.
        words_per_topic (int): Number of top-weighted words to return per topic.
        random_state: Optional seed forwarded to ``LdaModel`` so that repeated
            runs give the same topics. ``None`` (the default) leaves the model
            unseeded, as before.

    Returns:
        list: A list of num_topics sublists, each containing relevant words for a topic.

    Raises:
        ValueError: If no tokens survive preprocessing (for example, a PDF with
            no extractable text), so there is nothing to fit a model on.
    """
    # PdfReader / .pages / .extract_text() replace the PdfFileReader / getPage /
    # extractText names that PyPDF2 3.0 removed.
    reader = PyPDF2.PdfReader(file)
    documents = [page.extract_text() for page in reader.pages]

    # quiet=True keeps the downloader's progress messages out of the cell output.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words(['english', 'spanish']))
    processed_documents = [preprocess(doc, stop_words) for doc in documents]

    dictionary = corpora.Dictionary(processed_documents)
    if len(dictionary) == 0:
        # LdaModel would raise ValueError here as well; fail with a clearer message.
        raise ValueError("No usable tokens could be extracted from the PDF; cannot fit an LDA model.")
    corpus = [dictionary.doc2bow(doc) for doc in processed_documents]

    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_state,
    )

    # num_topics=-1 asks for every topic (print_topics defaults to at most 20),
    # and formatted=False yields (word, weight) pairs directly, so the
    # '0.012*"word" + ...' display string never has to be parsed.
    topics = lda_model.show_topics(num_topics=-1, num_words=words_per_topic, formatted=False)
    return [[word for word, _weight in word_weights] for _topic_id, word_weights in topics]

In [6]:
def topics_from_pdf(llm, file, num_topics, words_per_topic):
    """
    Asks a language model to describe the LDA topics found in a PDF document.

    The topic word lists come from ``get_topic_lists_from_pdf``. They are
    rendered one list per line and inserted, together with ``num_topics``,
    into a fixed prompt that is run through an ``LLMChain``.

    Args:
        llm: Language model object handed to ``LLMChain``.
        file: The PDF, forwarded to ``get_topic_lists_from_pdf``.
        num_topics: Number of topics to extract; also quoted in the prompt.
        words_per_topic: Number of words requested per topic.

    Returns:
        str: A response generated by the language model based on the provided topic words.
    """
    topic_word_lists = get_topic_lists_from_pdf(file, num_topics, words_per_topic)
    # One Python-list repr per line; every line (including the last) ends in a newline.
    string_lda = "".join(str(words) + "\n" for words in topic_word_lists)

    template_string = '''Describe the topic of each of the {num_topics} double-quote delimited lists in a simple sentence and also write down three possible different subthemes. The lists are the result of an algorithm for topic discovery. Do not provide an introduction or a conclusion, only describe the topics. Do not mention the word "topic" when describing the topics. Use the following template for the response.

    1: <<<(sentence describing the topic)>>>
    - <<<(Phrase describing the first subtheme)>>>
    - <<<(Phrase describing the second subtheme)>>>
    - <<<(Phrase describing the third subtheme)>>>

    2: <<<(sentence describing the topic)>>>
    - <<<(Phrase describing the first subtheme)>>>
    - <<<(Phrase describing the second subtheme)>>>
    - <<<(Phrase describing the third subtheme)>>>

    ...

    n: <<<(sentence describing the topic)>>>
    - <<<(Phrase describing the first subtheme)>>>
    - <<<(Phrase describing the second subtheme)>>>
    - <<<(Phrase describing the third subtheme)>>>

    Lists: """{string_lda}""" '''

    topic_chain = LLMChain(
        llm=llm,
        prompt=ChatPromptTemplate.from_template(template_string),
    )
    prompt_inputs = {
        "string_lda": string_lda,
        "num_topics": num_topics,
    }
    return topic_chain.run(prompt_inputs)

In [7]:
def extract_dialogues_from_pdf(file_path):
    """
    Extracts clinician-patient dialogues from a PDF file.

    The text of each page is split on the literal speaker tags ``Clinician: ``
    and ``Patient: ``. The tag that introduces an utterance decides its speaker
    label, so pages that start with a patient turn, contain consecutive turns
    by the same speaker, or hold an odd number of turns are handled correctly.
    Any text that precedes the first tag on a page is ignored.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        list: A list of ``(speaker, utterance)`` tuples in reading order, where
        ``speaker`` is ``"Clinician"`` or ``"Patient"`` and ``utterance`` is the
        whitespace-stripped text that follows the tag.
    """
    # The capture group makes re.split keep the speaker names, giving
    # [preamble, speaker, utterance, speaker, utterance, ...].
    speaker_tag = re.compile(r'(Clinician|Patient): ')

    dialogues = []
    with open(file_path, 'rb') as file:
        # PdfReader / .pages / .extract_text() replace the PdfFileReader /
        # getPage / extractText names that PyPDF2 3.0 removed.
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            parts = speaker_tag.split(page.extract_text())
            # parts[0] is the preamble; after it, speakers sit at odd indices
            # and their utterances at the following even indices.
            for speaker, utterance in zip(parts[1::2], parts[2::2]):
                dialogues.append((speaker, utterance.strip()))
    return dialogues


In [8]:
def summarize_dialogues(llm, dialogues):
    """
    Produces a language-model summary of clinician-patient dialogues.

    Every ``(speaker, utterance)`` pair is rendered as ``"speaker: utterance"``
    on its own line; the resulting transcript is inserted into a fixed
    summarization prompt and run through an ``LLMChain``.

    Args:
        llm: Language model object handed to ``LLMChain``.
        dialogues: Iterable of ``(speaker, utterance)`` pairs.

    Returns:
        str: A summarized version of the dialogues generated by the language model.
    """
    transcript_lines = []
    for speaker, utterance in dialogues:
        transcript_lines.append("{}: {}".format(speaker, utterance))
    dialogue_text = "\n".join(transcript_lines)

    template_string = '''Summarize the clinician-patient dialogues provided below:

    {dialogue_text}
    '''

    summary_chain = LLMChain(
        llm=llm,
        prompt=ChatPromptTemplate.from_template(template_string),
    )
    return summary_chain.run({"dialogue_text": dialogue_text})

In [None]:
# Example usage: summarize the dialogues in a PDF, then describe its LDA topics.
# Both `openai_key` and `file_path` are empty placeholders and must be filled in
# before this cell is run. Avoid saving the notebook with a real key in it.
openai_key = ""
# NOTE(review): max_tokens=-1 presumably lets the wrapper size the completion
# automatically — confirm against the langchain OpenAI documentation.
llm = OpenAI(openai_api_key=openai_key, max_tokens=-1)

# Extract the (speaker, utterance) pairs from the PDF and print an LLM summary of them.
file_path = ""
dialogues = extract_dialogues_from_pdf(file_path)
summary = summarize_dialogues(llm, dialogues)
print(summary)

# Fit LDA topics on the same PDF and print the LLM's description of each topic.
num_topics = 6
words_per_topic = 30
topics_summary = topics_from_pdf(llm, file_path, num_topics, words_per_topic)
print(topics_summary)