<a href="https://colab.research.google.com/github/dscott-tantustech/ab_test_guide_in_python/blob/master/Sprint_26_V2__Text_Ingest_1900pp_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# MODULE 0: File Uploads
# An interactive menu will pop up to choose the files.
# 2 Files to upload:
#                   1.'Reference_Workbook_CY_Provisions_CY2024_DRS_Topics_Subtopics.xlsx'
#                   2.'CMS-2023-0121-0001_content_1920pp.pdf'

from google.colab import files
uploaded = files.upload()



Saving CMS-2023-0121-0001_content_1920pp.pdf to CMS-2023-0121-0001_content_1920pp.pdf
Saving Reference_Workbook_CY_Provisions_CY2024_DRS_Topics_Subtopics.xlsx to Reference_Workbook_CY_Provisions_CY2024_DRS_Topics_Subtopics.xlsx


In [79]:
#MODULE 1: Create data frame from the attached Topic/Subtopic file
import pandas as pd
Topic_SubTopic_df = pd.read_excel('Reference_Workbook_CY_Provisions_CY2024_DRS_Topics_Subtopics.xlsx')

# print the output
print(Topic_SubTopic_df.columns)
Topic_only_df = Topic_SubTopic_df[Topic_SubTopic_df['Subtopic'].isnull()]
Topic_only_df.drop(columns=['Subtopic', 'Topic-Subtopic'], inplace=True)

Topic_only_df

Index(['Topic', 'Subtopic', 'Topic-Subtopic', 'Start_Page', 'End Page'], dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Topic_only_df.drop(columns=['Subtopic', 'Topic-Subtopic'], inplace=True)


Unnamed: 0,Topic,Start_Page,End Page
0,II.B Determination of Practice Expense (PE) RVUs,12,48
4,II.C Potentially Misvalued Services Under the PFS,49,67
5,II.D Payment for Telehealth Services Under Sec...,68,129
48,II.F Evaluation and Management Visits,240,256
51,II.G Geographic Practice Cost Indices (GPCI),257,260
54,II.H Payment for Skin Substitutes,261,264
57,II.I Supervision of Outpatient Therapy Service...,265,276
63,II.J Advancing Access to Behavioral Health Ser...,277,307
71,II.K Proposals on Medicare Parts A and B Payme...,308,350
78,III.A Drugs and Biological Products Paid Under...,351,385


In [None]:
# MODULE 2: Convert Proposed Rules PDF to Text
!pip install PyPDF2
import PyPDF2

def convert_pdf_to_txt(file):
    pdf_file = open(file, 'rb')
    read_pdf = PyPDF2.PdfReader(pdf_file)
    number_of_pages = len(read_pdf.pages)

    segments = []
    for page_number in range(number_of_pages):
        page = read_pdf.pages[page_number]
        page_content = page.extract_text()
        segments.append(page_content)

    return segments

segments = convert_pdf_to_txt('CMS-2023-0121-0001_content_1920pp.pdf')

#ALT METHOD:
"""
!pip install pdfminer.six
from pdfminer.high_level import extract_text

def convert_pdf_to_txt(file):
    text_pdf = extract_text(file)
    return text_pdf.split("\n")

segments = convert_pdf_to_txt('CMS-2023-0121-0001_content_1920pp.pdf')
"""

In [None]:
print("The document has ",len(segments), " PAGES.")
print()
print("Here is the first page: ")
print("---------------------------------------------------------------------------------------------")
print(segments[0] )
print("---------------------------------------------------------------------------------------------")


In [None]:
#MODULE 3
#installations for the model below
!pip install transformers
!pip install -U sentence-transformers
!pip install nltk
!pip install openai
!pip install tiktoken

In [69]:
#MODULE 4: Separate Document into Sentences with Page Information and then add the Topic
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

sentences_with_pages = []
page_number = 1

for page_content in segments:
    page_sentences = sent_tokenize(page_content)
    for sentence in page_sentences:
        sentences_with_pages.append(f"from p. {page_number}: [{sentence}]")
    page_number += 1




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
print("The document has ",len(sentences_with_pages), " SENTENCES. (The tool uses a period, '.', as the sentence delimeter.)")
print()
print("Here is the first sentence: ")
print("---------------------------------------------------------------------------------------------")
print(sentences_with_pages[1] )
print("---------------------------------------------------------------------------------------------")

The document has  18988  SENTENCES. (The tool uses a period, '.', as the sentence delimeter.)

Here is the first sentence: 
---------------------------------------------------------------------------------------------
from p. 1: [ACTION:  Proposed rule.]
---------------------------------------------------------------------------------------------


In [None]:
#MODULE 5
#Build embedding for sentences using SentenceTransformer.
#NOTE: THIS CAN RUN FOR SEVERAL (~10-15) MINUTES

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

# This might take a while as it's processing a big document
embeddings = model.encode(sentences_with_pages, convert_to_tensor=True)

In [89]:
#Module 5b: Add Topics to segments
import pandas as pd
from nltk.tokenize import sent_tokenize

def merge_with_topic(segments, Topic_only_df):
  """
  Merges segments with topics based on page numbers and adds a 'Topic' column.

  Args:
      segments (list): List of page content strings.
      Topic_only_df (pd.DataFrame): Dataframe with topics, start pages, and end pages.

  Returns:
      pd.DataFrame: Dataframe with segments, page numbers, and a 'Topic' column.
  """

  # Initialize variables
  page_number = 1
  all_sentences = []

  for page_content in segments:
    # Tokenize sentences
    page_sentences = sent_tokenize(page_content)
    for sentence in page_sentences:
      # Create a dictionary for each sentence with page and topic (initially None)
      all_sentences.append({
          'text': f"from p. {page_number}: [{sentence}]",
          'page': page_number
      })
    page_number += 1



  # Convert list of dictionaries to a dataframe
  df_sentences = pd.DataFrame(all_sentences)

  # Merge Dataframes
  merged_df = df_sentences.merge(Topic_only_df, how='left', left_on='page', right_on='Start_Page')

  # Forward fill 'Topic' and handle sentences exceeding last topic's end page
  merged_df['Topic'] = merged_df['Topic'].fillna(method='ffill')
  merged_df.loc[merged_df['page'] > Topic_only_df['End Page'].max(), 'Topic'] = 'NA'

  return merged_df

merged_df = merge_with_topic(segments, Topic_only_df)

merged_df = merged_df[merged_df['Topic'].notnull()]


### THIS OPTIONAL BLOCK OF CODE CAN ADD THE TOPIC TO THE ACTUAL TEXT BEING CONSIDERED.
#merged_df['text'] = 'Topic: ' + merged_df['Topic'] + ', from p. ' + merged_df['page'].astype(str) + ': [' + merged_df['text'] + ']'
#merged_df = merged_df.loc[:, ['text', 'Topic']]
###
merged_df = merged_df.iloc[:,:3]
len(merged_df.text)
merged_df.head()

# Aggregate text for each Topic into a single entry
merged_df['text'] = merged_df['text'].astype(str)
merged_df_Topic_text_combined = merged_df.groupby('Topic')['text'].agg(lambda x: ' '.join(x)).reset_index()

merged_df_Topic_text_combined.head(33)  # Display the head of the summarized dataframe


Unnamed: 0,Topic,text
0,II.B Determination of Practice Expense (PE) RVUs,from p. 12: [In accordance with section 1848 o...
1,II.C Potentially Misvalued Services Under the PFS,from p. 49: [individual practitioners (or prac...
2,II.D Payment for Telehealth Services Under Sec...,from p. 68: [D. Payment for Medicare Teleheal...
3,II.F Evaluation and Management Visits,from p. 240: [F. Evaluation and Management (E...
4,II.G Geographic Practice Cost Indices (GPCI),from p. 257: [other than critical care visits ...
5,II.H Payment for Skin Substitutes,"from p. 261: [Additionally, we noted that we w..."
6,II.I Supervision of Outpatient Therapy Service...,from p. 265: [groupings) under any combination...
7,II.J Advancing Access to Behavioral Health Ser...,from p. 277: [injection training for insulin-d...
8,II.K Proposals on Medicare Parts A and B Payme...,from p. 308: [● Are there barriers to digital...
9,III.A Drugs and Biological Products Paid Under...,from p. 351: [and encouraging continued feedba...


In [None]:
# MODULE 6a: Find Sentences Related to a given Comment
from torch.nn import CosineSimilarity
import torch

cos_sim = CosineSimilarity()

# Function to find related sentences with page information
def find_related_sentences(comment):
    comment_embedding = model.encode([comment], convert_to_tensor=True)
    similarities = cos_sim(comment_embedding, embeddings)
    top_related_idx = torch.topk(similarities, k=10).indices
    related_sentences = [sentences_with_pages[idx] for idx in top_related_idx]
    return related_sentences

# Test it with an example comment
comment="""[BEGIN MC COMMENT: Soliciting Public Comment on Strategies for Updates to Practice Expense Data Collection Methodology
In the Proposed Rule, CMS included five questions related to the AMA PPI Survey:

a.	If CMS should consider aggregating data for certain physician specialties to generate indirect allocators so that PE/HR calculations based on PPI survey data would be less likely to over- allocate (or under-allocate) indirect PE to a given set of services, specialties, or practice types. Further, what thresholds or methodological approaches could be employed to establish such aggregations?
The AMA PPI survey uses stratification to control the distribution of sampled cases, either to match the distribution of the population or to differ from it in a controlled way. The use of stratification will improve the precision of estimates, both overall and within subgroups defined by the stratification. The AMA recommends that CMS postpone any consideration of the level of granularity of specialty-level data until after the PPI demonstrates the differences and similarities of practice costs by specialty. The AMA and Mathematica could consider recommendations related to this question once the study is completed.

b.	Whether aggregations of services, for purposes of assigning PE inputs, represent a fair, stable, and accurate means to account for indirect PE across various specialties or practice types?
The AMA believes that it is important for the CMS practice expense methodology to have a sufficient level of granularity to reflect actual practice costs incurred by physician practices. Ambulatory payment classification (APC) codes from the OPPS, for example, would not represent a fair, stable, and accurate means to account for indirect practice expense for the MFS due to lack of granularity.

Resource costs in the MFS are developed through an extremely granular “bottom-up” methodology in which the necessary resource costs are added line-by-line to achieve the actual costs for the physician to provide the care. In contrast, payment to facilities under the OPPS is calculated on the geometric mean of the costs of services in the same APC codes. To equate the rigorously developed line-item costs associated with services performed in the non-facility setting, with charges that are intended to be an average of “similar” services when performed in the facility is severely flawed because the two systems are making payments under vastly different assumptions.
While hospital charge information is updated on a rolling basis, it does not mean that these cost data are more accurate. Under the OPPS, each APC is assigned a cost weight based on the geometric mean costs of all the procedures assigned to that APC. These estimated costs are derived from hospital charges adjusted to costs using each hospital’s cost to charge ratio (CCR). Rather than estimating the costs of each resource on a per line-item basis, this ratio is an average at the hospital department level. Since the creation of the OPPS, this averaging mechanism has consistently resulted in charge compression. CMS defines charge compression as the “practice of applying a lower charge markup to higher cost services and a higher charge markup to lower cost services.” As a result, the cost-based weights may reflect some aggregation bias, undervaluing high-cost items and overvaluing low-cost items when an estimate of average markup, embodied in a single CCR, is applied to items of widely varying costs in the same cost center.


For the over 8,000 CPT/Healthcare Common Procedure Coding System (HCPCS) codes that have “Active” or “Restricted Coverage” status the CY2024 MFS NPRM Addendum B, there are only 162 unique APC codes in the CY2024 OPPS NPRM addendum B. Over 3,000 of the CPT/HCPCS codes that are “Active” or have “Restricted Coverage” status do not even have an assigned APC code.

c.	If and how CMS should balance factors that influence indirect PE inputs when these factors are likely driven by a difference in geographic location or setting of care, specific to individual practitioners (or practitioner types) versus other specialty/practice specific characteristics (for example, practice size, patient population served)?
In the PPI study, the AMA and Mathematica are controlling the number of sampled practices within strata defined by (1) specialty, (2) proportion of time in the facility setting, (3) practice size, (4) ownership type (individual ownership vs. more complex ownership types), (5) geographic region, and among practices with complex ownership, whether (6) the practice is part of a vertically integrated health system, and (7) private equity ownership.

The AMA and Mathematica are using these criteria for our Initial sampling, and if there is variance in the response rates between different practice types, we will also use these criteria to adjust the sampling midway through the data collection period. Finally, the AMA and Mathematica will develop final analysis weights to adjust for the probability of selection, practice eligibility, and cooperation, ensuring selected weighted totals match marginal population totals from the sample frame. In the survey itself, participating practices are asked to split out their provider compensation and time, staffing and other direct and indirect practice expenses at the Medicare specialty level, if possible.
The AMA and Mathematica could provide recommendations related to this question once the PPI survey is completed. The PPI sampling and weighting methodology should account for most of these factors.

d.	What possible unintended consequences may result if CMS were to act upon the respondents’ recommendations for any of highlighted considerations above?
Medicare payment differentials between the MFS and the OPPS are significant and have been growing, and this may be a factor in the decline in private practice. In fact, physician survey data indicate that payment and practice costs are two of the three leading reasons for private practices selling to hospitals or health systems.3 It is important to ensure that any potential changes to CMS practice expense methodology do not further exacerbate this relationship and instead work towards correcting site of service inconsistencies.

In last year’s NPRM, CMS provided an impact table related to the initiative of rebasing and revising the MEI weights. CMS noted that implementation of that change in the PE methodology would have shifted payment weights from physician work to practice expense principally favoring Diagnostic Testing Facility (+13 percent), Portable X-Ray Supplier (+13 percent), Independent Laboratory (+10 percent), and Radiation Therapy Centers (+6 percent) to the detriment of Cardiothoracic Surgery (-8 percent), Neurosurgery (-8 percent), Emergency Medicine (-8 percent), and Anesthesiology (-5 percent). Modest increases occur to specialties who provide services in the office with extremely expensive disposable supplies embedded into physician payment. Primary Care would face decreases (Family Medicine (-1

3 Kane CK. Recent Changes in Physician Practice Arrangements: Shifts Away from Private Practice and Towards Larger Practice Size Continue Through 2022. Chicago (IL): American Medical Association; 2023. Policy Research Perspective 2023. https://www.ama-assn.org/system/files/2022-prp-practice-arrangement.pdf


percent), Geriatrics (-2 percent), Internal Medicine (-2 percent) and Pediatrics (-2 percent)). Similar to that separate policy change, other changes to the PE methodology would cause massive shifts between specialties, as well as within specialties, and put the solvency of many physician practices and other health care organizations in jeopardy. Any changes that are considered should be made carefully to ensure they reflect actual practice costs incurred by physician practices. All changes that impact physician practices should be phased in.

e.	Whether specific types of outliers or non-response bias may require different analytical approaches and methodological adjustments to integrate refreshed data?
The AMA and Mathematica will develop final analysis weights to adjust for probability of selection, practice eligibility, and cooperation, ensuring selected weighted totals match marginal population totals from the sample frame. The AMA and Mathematica will evaluate the potential for nonresponse bias by conducting a nonresponse bias analysis. The AMA and Mathematica are using strata for our initial sampling, as described above. Also, if there is variance in the response rates between different practice types, these criteria will be utilized to adjust the sampling midway through the data collection period. END MC COMMENT]
"""
related_sentences = find_related_sentences(comment)
text = " ".join(related_sentences) + comment
text
print(comment)

In [97]:
# MODULE 7A
# Generate and present the final summary output in HTML format
#!pip install openai
#!pip install tiktoken
from IPython.display import HTML
import openai
import tiktoken

def generateRulesSummary(text, comment):

    openai.api_key = "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"

    def num_tokens_from_string(string: str, encoding_name: str) -> int:
        encoding = tiktoken.encoding_for_model(encoding_name)
        num_tokens = len(encoding.encode(string))
        return num_tokens

    def get_gpt4_summary(text, prompt=""):
        input_text = f"{text}\n\nPrompt: {prompt}"
        Char_length = len(input_text)
        model_choice = "gpt-4-turbo-preview" if num_tokens_from_string(input_text, "gpt-4") > 8192 else "gpt-4"
        response = openai.chat.completions.create(
            model=model_choice,
            messages=[{"role": "user", "content": input_text}],
            max_tokens=600,
            temperature=0.5,
            n=1,
        )
        generated_summary = response.choices[0].message.content.strip()
        return generated_summary

    #TopicSummary = get_gpt4_summary(text, f"""This is a summarization task. The input text provided here is in two parts: first, several sentences retreived from a very large reference document. And second, at the end of this string, there is a comment that was a response to the entire reference document. This response to the reference document is the last part of the comment and is clearly bracketed. PLease write a summary that brefly restates this last comment in brackets, then summarizes the various sentences from the reference document. Make your whole summary 150 words or less. Make the final output a concise but coherent analysis of the comment, and what the reference sentences say in relation to the comment.
  #You should have the following format for the output: [here you will place a VERY brief summary of the comment, about 10-20 words, NO MORE. This should have a Header: "COMMENT SUMMARY" Refer to it as the Major Commentor comment. This section should STAND ALONE in the output text, CLEARLY dlineated from the second part of your response, with at least 1-2 blank lines to separate it from what follows. Then you will complete your response with the text for Part 2:first, the following header: "RELEVANT TEXT from PROPOSED RULE (CMS-2023-0121-0001):", then finish with the rest of the summary. The balance of your output, which will be about 85% of its words count, will explain why these sentences from the reference document (refered to as the "Regulations Propsoal") are relevant to the MC comment. Also, BE SURE TO INCLUDE THE REGULATIONS PAGE NUMBER for any material referenced from the text. These page numbers must be HIGHLY ACCURATE, so that whatever is said to be discussed on that page is actually on that page. (Be VERY careful to compare the text to the page number.) It will be noted in each input sentence]""")
    TopicSummary = get_gpt4_summary(text, f"""This is a summarization task. The input text provided here is in two parts: first part is labeled 'COMMENT' and the second is a set of several sentences retreived from a very large reference document. The first part, the COMMENT, is a response to the entire reference document from which the second part was extracted. This comment (reponse to the reference document) is clearly bracketed. PLease write a summary that briefly restates this comment in brackets first, then goes on to summarize the various sentences from the reference document. Make your whole summary 150 words or less. Make the final output a concise but coherent analysis of the comment (15-25 words), and then ending with what the reference sentences say in relation to the comment (125-140 words).
  You should have the following format for the output: [here you will place the VERY brief summary of the comment, again: about 15-25 words, NO MORE. THe narrative of your summarization should refer to the 'COMMENT' as the Major Commentor (MC) comment in your first reference, and then MC comment after that. The second part of your response follows. Then you will complete your response with the text for the rest of the summary: first, the following header: "RELEVANT TEXT from PROPOSED RULE (CMS-2023-0121-0001):", then finish with the rest of the summary. The balance of your output after the first part will explain why these sentences from the reference document (refered to as the "Regulations Propsoal") are relevant to the MC comment. Also, BE SURE TO INCLUDE THE REGULATIONS PAGE NUMBER for any material referenced from the text. These page numbers must be HIGHLY ACCURATE, so that whatever is said to be discussed on that page is actually on that page. (Be VERY careful to compare the text to the page number.) THe page number infomration is given for each sentence, so you will be able to be sure. It will be noted in each input sentence you are given.""")

    # Generate the HTML format for the final output
    html_output = f'''
    <div style="font-family: Arial, sans-serif;">
        <h2>Summary of Relevant Text from Rules Proposal</h2>
        <p><strong>Summary of relevant text from PROPOSED RULE (CMS-2023-0121-0001)</strong></p>
        <p>{TopicSummary}</p>
        <p><strong>Complete Comment:</strong></p>
        <p><i>{comment}</i></p>
    </div>
    '''

    return HTML(html_output)

# Test the function using your comment
final_out = generateRulesSummary(text, comment)
final_out

In [101]:
#Module 6b (alternate approach: read data frame and iterate on each input text, writing out a data frame.)
#!pip install openai --upgrade
from IPython.display import HTML

TEST_merged_df_Topic_text_combined=merged_df_Topic_text_combined.head()


# Setup OpenAI API key
openai.api_key = "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"

# MODULE 7B - ALT Summary as Topic Data Frame
# Generate and present the final summary output in HTML format
#!pip install openai
#!pip install tiktoken
import pandas as pd
import openai
from IPython.display import HTML

def num_tokens_from_string(string: str, encoding_name: str='gpt-4') -> int:
    # Simulating encoding length function, adjust as needed
    return len(string)

def get_gpt4_summary(text, prompt=""):
        input_text = f"{text}\n\nPrompt: {prompt}"
        Char_length = len(input_text)
        model_choice = "gpt-4-turbo-preview" if num_tokens_from_string(input_text, "gpt-4") > 8192 else "gpt-4"
        response = openai.chat.completions.create(
            model=model_choice,
            messages=[{"role": "user", "content": input_text}],
            max_tokens=600,
            temperature=0.5,
            n=1,
        )
        generated_summary = response.choices[0].message.content.strip()
        return generated_summary

def summarize_topics(row):
    prompt = """This is a summarization task. The input text is several sentences retrieved from a very large reference document.
                Make your whole summary 500 words or less. Make the final output a VERY concise but coherent analysis of the question,
                and what the text says, picking up the themes and patterns in the overall text."""
    return get_gpt4_summary(row['text'], prompt)

# Apply summarization function to each row
TEST_merged_df_Topic_text_combined['Topic Summary'] = TEST_merged_df_Topic_text_combined.apply(summarize_topics, axis=1)

# Create final DataFrame
Final_Topic_Summary_df = TEST_merged_df_Topic_text_combined[['Topic', 'Topic Summary', 'text']]

def generate_html_output(df):
    html_content = '''
    <style>
        body {
            font-family: "Arial", sans-serif;
        }
        .summary-container {
            margin: 20px;
            padding: 15px;
            border-radius: 8px;
            background-color: #f9f9f9;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .topic-title {
            color: #333;
            font-size: 18px;
        }
        .summary-text {
            color: #555;
            font-size: 14px;
            margin-top: 5px;
        }
    </style>
    <div class="summary-container">
        <h2 style="color: #4B0082;">Summary Overview</h2>'''

    for _, row in df.iterrows():
        html_content += f'''
            <div>
                <h3 class="topic-title">Topic: {row["Topic"]}</h3>
                <p class="summary-text"><strong>Summary:</strong> {row["Topic Summary"]}</p>
            </div>'''

    html_content += '</div>'
    return HTML(html_content)


# Call the function to display HTML output
html_output = generate_html_output(Final_Topic_Summary_df)
html_output
# Display the final DataFrame
#Final_Topic_Summary_df.head()



###########################################################################################################################
# write out html file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  TEST_merged_df_Topic_text_combined['Topic Summary'] = TEST_merged_df_Topic_text_combined.apply(summarize_topics, axis=1)


In [100]:
html_output


In [None]:
#Module 6c (PLAYGROUND for experimenting on different text inputs and input and output formats.)
from torch.nn import CosineSimilarity
import torch

cos_sim = CosineSimilarity()

# Function to find related sentences with page information
def find_related_sentences(comment):
    comment_embedding = model.encode([comment], convert_to_tensor=True)
    similarities = cos_sim(comment_embedding, embeddings)
    top_related_idx = torch.topk(similarities, k=15).indices
    related_sentences = [sentences_with_pages[idx] for idx in top_related_idx]
    return related_sentences

# Test it with an example comment
comment=""" What does this document have to say about Medicare Dental Services? """
related_sentences = find_related_sentences(comment)
text = " ".join(related_sentences) + comment
print(text)
#related_sentences

In [None]:
# MODULE 7B - ALT Summary as Q&A
# Generate and present the final summary output in HTML format
#!pip install openai
#!pip install tiktoken
from IPython.display import HTML
import openai
import tiktoken

def generateRulesSummary(text, comment):

    openai.api_key = "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"

    def num_tokens_from_string(string: str, encoding_name: str) -> int:
        encoding = tiktoken.encoding_for_model(encoding_name)
        num_tokens = len(encoding.encode(string))
        return num_tokens

    def get_gpt4_summary(text, prompt=""):
        input_text = f"{text}\n\nPrompt: {prompt}"
        Char_length = len(input_text)
        model_choice = "gpt-4-turbo-preview" if num_tokens_from_string(input_text, "gpt-4") > 8192 else "gpt-4"
        response = openai.chat.completions.create(
            model=model_choice,
            messages=[{"role": "user", "content": input_text}],
            max_tokens=600,
            temperature=0.5,
            n=1,
        )
        generated_summary = response.choices[0].message.content.strip()
        return generated_summary

    TopicSummary = get_gpt4_summary(text, f"""This is a summarization task. The input text is several sentences retreived from a very large reference document. At the end of this string, there is a question that was asked about this large document.
    Make your whole summary 150 words or less. Make the final output a VERY concise but coherent analysis of the question, and what the reference sentences say in relation to the comment.
  You should have the following format for the output: [here you will place a VERY brief summary of the questio , about 10-12 words, NO MORE.
  This section should STAND ALONE in the output text, CLEARLY dlineated from the second part of your response. Then you will complete your
  response with  the rest of the summary. The
  balance of your output, which will be about 90-95% of its words count, will explain what these sentences say in asnwer to the question, answering by re-stating the reference document (refered to
  as the "Regulations Propsoal") in terms relevant to the question. Also, BE SURE TO INCLUDE THE REGULATIONS PAGE NUMBER for any material referenced from the text. These page numbers must be HIGHLY ACCURATE, so that whatever is said to be discussed on that page is actually on that page. (Be VERY careful to compare the text to the page number.) It will be noted in each input sentence]""")

    # Generate the HTML format for the final output
    html_output = f'''
    <div style="font-family: Arial, sans-serif;">
        <h2>Rules Proposal Q&A </h2>
        <p><strong>Question:</strong></p>
        <p><i>{comment}</i></p>
        <p><strong>Text from PROPOSED RULE (CMS-2023-0121-0001)</strong></p>
        <p>{TopicSummary}</p>

    </div>
    '''

    return HTML(html_output)

# Test the function using your comment
final_out = generateRulesSummary(text, comment)
final_out

In [None]:
df.head()
len(df['title'])

1

In [None]:
len(df.text)
#df.head(1)
#df[2000:2010]


def get_last_word_or_chars(text, i=1,num_chars=10):  # Default: last 10 characters
    """Extracts the last word (or last `num_chars` characters) from a string."""
    if text:  # Handle empty strings
        words = text.strip().split()
        if words:
            return words[-i]  # Last word
        else:
            return text[-num_chars:]  # Last characters if no words
    else:
        return None  # Return None for empty strings
########################################
wordcount=10
for j in range(wordcount):
    print(get_last_word_or_chars(text,wordcount-j))

[FR
Doc.
2023-14624
Filed:
7/13/2023
4:15
pm;
Publication
Date:
8/7/2023]


In [68]:
# CHATGPT (REQUIRES A FEE)
#!pip uninstall openai  # Remove if already installed
#!pip install --upgrade openai

# !pip install --upgrade tiktoken  # Remove if already installed

import openai
import tiktoken
import time
import os

# Set your OpenAI API key (avoid exposing it in the script)
openai.api_key = "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"

# Pick model for encoding to count to
def get_summary(text, prompt, max_tokens=500):
    # Combine the input text and prompt for summarization
    input_text = f"{text}\n\nPrompt: {prompt}"

    # Make a request to the OpenAI API
    response = openai.chat.completions.create(model="gpt-4",  # Change to "gpt-4"
    messages=[
        {
            "role": "user",
            "content": f"{text}\n\nPrompt: {prompt}"
        }
    ],
    max_tokens=200,
    temperature=0.5,
    n=1)

    # Extract the generated summary from the API response
    generated_summary = response.choices[0].message.content.strip()

    return generated_summary

# Example usage
#my_long_text = "The world is not so much made for man as he thinks it is. FOr example, 70% of the surface of th earth is salt water. And that doesn't even mention that 99.99% of the earth itself is rock."

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

my_long_text = "The AMA appreciates CMS for accepting the RUC/HCPAC values as recommended for Caregiver Training Services, a series of three new CPT codes established to capture functional caregiver training services   provided   to   caregivers   without   the   patient   present.   These   new   codes   will   enhance   communication between therapy practitioners, physicians, and caregivers, and reduce risk of patient injury and increase patient outcomes. CMS coverage of these services acknowledges the importance of caregiver training to alleviate the significant burden that falls greatest on caregivers in lower socioeconomic groups and diverse populations and supports both CMS and HHS initiatives on diversity, equity, and inclusion for family caregivers."
num_tokens_from_string(my_long_text, "cl100k_base")

prompt = "Summarize this in 6 sentences"
my_chatgpt_summary = get_summary(text=my_long_text, prompt=prompt)
#print(my_chatgpt_summary)

print(num_tokens_from_string(my_long_text, "cl100k_base"))


132


In [None]:
# PRIOR Groq
# SEE https://console.groq.com/playground for Groq notes and examples
#my Groq_api_key:  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
#!pip install groq openai

from groq import Groq

client = Groq(
    api_key="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"
    #api_key=os.environ.get("GROQ_API_KEY"),
)

chat_completion = client.chat.completions.create(
    #
    # Required parameters
    #
    messages=[
        # Set an optional system message. This sets the behavior of the
        # assistant and can be used to provide specific instructions for
        # how it should behave throughout the conversation.
        {
            "role": "system",
            "content": "you are a helpful assistant."
        },
        # Set a user message for the assistant to respond to.
        {
            "role": "user",
            "content": "Explain the importance of low latency LLMs",
        }
    ],

    # The language model which will generate the completion.
    model="mixtral-8x7b-32768",

    #
    # Optional parameters
    #

    # Controls randomness: lowering results in less random completions.
    # As the temperature approaches zero, the model will become deterministic
    # and repetitive.
    temperature=0.5,

    # The maximum number of tokens to generate. Requests can use up to
    # 2048 tokens shared between prompt and completion.
    max_tokens=1024,

    # Controls diversity via nucleus sampling: 0.5 means half of all
    # likelihood-weighted options are considered.
    top_p=1,

    # A stop sequence is a predefined or user-specified text string that
    # signals an AI to stop generating content, ensuring its responses
    # remain focused and concise. Examples include punctuation marks and
    # markers like "[end]".
    stop=None,

    # If set, partial message deltas will be sent.
    stream=False,
)

# Print the completion returned by the LLM.
print(chat_completion.choices[0].message.content)


Sure, I'd be happy to explain!

LLM stands for "Low-Latency Logistics Management," which refers to the process of managing the movement of goods and materials with a focus on minimizing delays and response times. Low latency is critical in LLMs for several reasons:

1. Improved Efficiency: Low latency enables real-time communication and decision-making, which can significantly improve the efficiency of logistics operations. For example, in a warehouse, low latency can help ensure that orders are picked, packed, and shipped quickly and accurately.
2. Enhanced Customer Satisfaction: In today's fast-paced world, customers expect quick and reliable delivery of their orders. Low latency in LLMs can help ensure that products are delivered on time, which can lead to higher customer satisfaction and loyalty.
3. Better Inventory Management: Low latency can help improve inventory management by enabling real-time tracking of inventory levels and locations. This can help prevent stockouts, oversto

### Gemma
(for more see https://blog.google/technology/developers/gemma-open-models/)


Using the Gemma ready-to-use Colab notebooks for text summarization is a great way to get started with Gemma's capabilities without any local setup. Here's how:

**1. Accessing the Notebook:**

* Search for "Gemma Colab notebooks" in your web browser. This should lead you to the relevant Google Colab search results.
* Alternatively, you can visit the official Gemma GitHub repository and navigate to the "examples" folder. Within this folder, you'll find Colab notebooks for various tasks, including text summarization. [https://github.com/google-deepmind/gemma](https://github.com/google-deepmind/gemma)

**2. Opening the Notebook:**

* Once you locate the desired notebook (e.g., "text_summarization.ipynb"), click on it to open it in Colab. This will launch a new Colab environment pre-configured with the necessary libraries and Gemma models.

**3. Running the Notebook:**

* Carefully review the code cells within the notebook. These cells typically explain the steps involved in the text summarization process and demonstrate how to use Gemma for this task.
* You can run each code cell individually by clicking the "Run" button (play icon) next to the cell. Alternatively, you can run the entire notebook by clicking "Runtime" -> "Run all" from the Colab menu.

**4. Providing Input Text:**

* The notebook might have designated sections where you can provide your input text for summarization. This could be a text string directly inserted into a code cell or uploading a text file.
* Follow the instructions within the notebook for providing your input.

**5. Generating Summary:**

* Once you've provided the input text and run the relevant code cells, the notebook will utilize Gemma to generate a summary of the text. This summary will be displayed within the notebook's output cells.

**Additional Tips:**

* The Colab notebooks often come with comments and explanations within the code cells. These can be helpful in understanding the process and modifying the code for different use cases.
* Feel free to explore different notebooks available for other LLM tasks offered by Gemma.

Remember, these notebooks are a starting point, and you can adapt and modify them based on your specific needs and exploration goals.

In [None]:
#!pip install bardapi
'''!pip install --upgrade bardapi

#Import the Bard class:
from bardapi import Bard

# Create a Bard instance with my API credentials:
# (see here to get an API key for Bard: https://console.cloud.google.com/apis/credentials/key/0f8c198a-85dc-4b93-9d83-5b8c92dac19f?project=regal-campaign-329818)
bard = Bard(token="AIzaSyBV2Oj1UV7nsD1OcTAZ0DPOwUge95emObs")

topicData = ['Request for Information (RFI): Drugs and Biologicals that are Not Usually Self-Administered by the Patient, and Complex Drug Administration Coding', 'The ACR applauds CMS’s undertaking to conduct a comprehensive review of the administration of biologicals not usually self-administered by the patient and complex drug administration coding, issues that have posed significant challenges to rheumatologists and other specialists in the House of Medicine. The ACR believes the critical components of providing best practice treatments options with biologics and immunomodulatory therapies and protecting the patient and care team relationship are at the highest level of ethical responsibility to our patients and their access to quality healthcare in treating their rheumatologic conditions.', 'CMS plays a crucial role in working with the specialties affected by these policies to identify long- term solutions to the current issues of down coding for services billed for the administration of biologics and the drugs added to the Self-Administered Drug (SAD) list. The ACR is concerned that the criteria used for these two policies contradict the proposed nondiscrimination rules created for Medicare Fee-for-Service.', '“Downcoding” The downcoding of complex chemotherapy services has reached a deep concern as it relates to the billing of biologics for the treatment of most non-oncologic conditions. This is mainly due to flawed billing and coding articles created by the Medicare Administrative Contractors (MAC) that have restricted which complex therapies will be reimbursed using the “chemotherapy” administration codes, forcing rheumatologists and other specialists (except for hematology and oncology) to bill these services with the therapeutic drug administration code. The advent and evolution of biologics and other immunomodulating therapies have revolutionized outcomes for patients with auto-immune diseases such as rheumatoid arthritis (RA), psoriasis, systemic lupus erythematosus, and vasculitis, which carry significant morbidity, mortality, and associated healthcare and societal costs. Despite the up-front costs, adding biologics to other treatment modalities has been cost-effective in appropriate patient populations.1', 'The American Medical Association (AMA) Current Procedural Terminology (CPT) states, “Chemotherapy administration codes 96401-96549 apply to parenteral administration of non- radionuclide antineoplastic drugs; and also to anti-neoplastic agents provided for treatment of noncancer diagnoses (e.g., cyclophosphamide for auto-immune conditions) or to substances such as certain monoclonal antibody agents, and other biologic response modifiers.” Yet, the MACs continue to utilize unsubstantiated criteria to determine which drugs should be defined as complex and warrant the use of complex administration codes.', 'In 2013, the American College of Rheumatology released its position outlining the common clinical situations that require the use of intravenous biologics as opposed to self-administered biologics and outlined the FDA indications, appropriate use, safety, and off-label use for biologics, which are far more complicated at the molecular level than traditional chemically synthesized drugs.2 Also, based on the evolution of biologics and monoclonal antibody treatments on the market for auto-immune diseases in rheumatology, gastroenterology, infectious disease, dermatology, neurology, and other key areas of medicine, it is appropriate to review the definition of “chemotherapy,” which is no longer a useful term, as indications and toxicity should distinguish drugs. The CPT manual also states that “The highly complex infusion of chemotherapy or other drug or biologic agents requires a physician or other qualified health care professional work and/or clinical staff monitoring well beyond that of therapeutic drug agents (96360-96379) because the incidence of severe adverse patient reactions are typically greater. These services can be provided by any physician or other qualified health care professional.” Accordingly, the ACR’s position statement indicates that given the complexity associated with the design, manufacturing, and storage of biologics and differences over time in the structure, efficacy, and safety of biologics, these treatments should be supervised and carried out by specially trained physicians and advanced practitioners who have the required knowledge, training, and experience to administer biologic agents and monitor adverse reactions. The ACR recommends that CPT work with the key stakeholders to change the terminology in the manual from “chemotherapy” to “immunomodulatory” therapies, which is more in line with drug indications.', 'As a reminder, the 2003 Medicare Modernization Act (MMA) Congress included language allowing the use of the chemotherapy administration code by physicians who administer non-oncologic medications in their offices. The policy also indicated that the same level of supervision was required, and there are no significant differences between earlier biologics and currently proposed biologics in their level of risk in administration. Another key point in the MMA also outlined that these treatments should cost the same to administer, including clinical labor costs, and that no specialty should be reimbursed more than the other. Unfortunately, this has also caused another issue in how “chemotherapy” is assigned with the J-codes versus monoclonal antibody and biologic therapies through HCPCS. There are more than one example of a particular drug used as cancer therapy and an immunomodulator to treat different types of arthritis and vasculitis. To reimburse differently based on specialty is not consistent with the MMA language. Also, the toxicity issues related to these infusions do not differ based on the indication of use.', 'There has been a great deal of progress in the use of biologics in many disciplines, with the expectation for continuous advancement in the future. The ACR believes that policies related to access and reimbursement for biologic treatment should be transparent and prioritize the well-being and health of patients across disease processes with a focus on reducing morbidity and mortality. We recommend that the agency convene stakeholder roundtables or workgroups to explore regulatory and legislative solutions to these policies to avoid unintended consequences with deleterious impacts on access and coverage for beneficiaries and their healthcare team.', 'Self-Administered Drug (SAD) List Reimbursement is inadequate for drugs and biologics placed on the existing Self-Administered Drug (SAD) exclusion list policy, and the process used to determine if a drug is usually self-administered by the patient lacks transparency. The ACR is deeply concerned about barriers that limit the ability of patients with rheumatoid arthritis or other autoimmune diseases to obtain affordable, high-quality, high-value healthcare, which includes appropriate treatment. Additionally, beneficiaries who lose access to medication by virtue of inclusion on the SAD list are at risk of delay and deprivation of therapeutic benefits. Arthritis is the leading cause of disability in the United States, and modern treatment approaches have revolutionized outcomes for patients with these diseases. Early aggressive therapy with various drugs, including biologics, has been shown to reduce joint damage and deformities and improve function, reducing work absenteeism, disability, death, costly procedures/surgeries, and hospitalizations.', "The ACR affirms the ethical responsibility of the healthcare team to place the welfare of the patient above all other considerations, as well as the importance of safeguarding the patient's relationship with their healthcare team. The ACR’s goal is to preserve patients’ access to care from rheumatology specialty care teams and the therapies necessary to treat their rheumatologic conditions. The ACR believes this is an important opportunity for CMS to work with the rheumatology community and other key stakeholders as trusted voices on the critical issue of an equitable approach to the criteria for the SAD list and the billing of the complex administration services, especially as it applies to inadvertent harmful consequences related to poor access to future medications."]
#topicName = "3A02 Use of JW modifier and use of JZ modifier"

#Call the get_answer method with your text and a question like "Summarize this in 6 sentences":
bard_summary = bard.get_answer(text=topicData, prompt="Summarize this in 6 sentences")
'''

'!pip install --upgrade bardapi\n\n#Import the Bard class:\nfrom bardapi import Bard\n\n# Create a Bard instance with my API credentials:\n# (see here to get an API key for Bard: https://console.cloud.google.com/apis/credentials/key/0f8c198a-85dc-4b93-9d83-5b8c92dac19f?project=regal-campaign-329818)\nbard = Bard(token="AIzaSyBV2Oj1UV7nsD1OcTAZ0DPOwUge95emObs")\n\ntopicData = [\'Request for Information (RFI): Drugs and Biologicals that are Not Usually Self-Administered by the Patient, and Complex Drug Administration Coding\', \'The ACR applauds CMS’s undertaking to conduct a comprehensive review of the administration of biologicals not usually self-administered by the patient and complex drug administration coding, issues that have posed significant challenges to rheumatologists and other specialists in the House of Medicine. The ACR believes the critical components of providing best practice treatments options with biologics and immunomodulatory therapies and protecting the patient and

In [None]:
# MEDICAL Summary with FalconsAI
# !pip install transformers
# !pip install sentencepiece
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd

model_name = "Falconsai/medical_summarization"

MEDICAL_DOCUMENT = summary_df_EachTopic.head()["Text"].tolist()

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model_name)

Medical_summary_DF = pd.DataFrame(columns=["Text", "Topic/Subtopic", "Summary"])

#THis needs to be fixed to correctly pick up the Topic/Subtopic
Medical_summary_DF['Text'] = MEDICAL_DOCUMENT
Medical_summary_DF['Topic/Subtopic'] = summary_df_EachTopic["Topic/Subtopic"][1:]

# Extract only the summary text from the dictionary
Medical_summary_DF['Summary'] = [summary['summary_text'] for summary in summarizer(MEDICAL_DOCUMENT, max_length=230, min_length=30, do_sample=False)]

Medical_summary_DF.to_excel('Medical_Summary_output.xlsx', index=False)
Medical_summary_DF


Your max_length is set to 230, but your input_length is only 165. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=82)
Token indices sequence length is longer than the specified maximum sequence length for this model (1238 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 230, but your input_length is only 131. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=65)


Unnamed: 0,Text,Topic/Subtopic,Summary
0,The AMA appreciates that CMS adopted 91 percen...,,the AMA appreciates that CMS adopted 91 percen...
1,Conversion Factor Recommendation : To ens...,"Topic/Sub-topic 04: 7C1_Resource-Based Work, ...",background : to ensure Medicare patients maint...
2,Determination of Practice Expense (PE)...,Topic/Sub-topic 05: 2B_Determination of Pract...,the AMA strongly supports the CMS proposal to ...
3,Soliciting Public Comment on Strategie...,Topic/Sub-topic 06: 2B5_Solicitation: Updates...,the AMA and Mathematica are using strata for o...
4,The AMA appreciates CMS for accepting the RUC/...,Topic/Sub-topic 07: 2E26_Payment for Caregive...,the AMA appreciates CMS for accepting the RUC/...


In [None]:
#Another GROQ example script that works

#my Groq_api_key:  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
#!pip install groq #openai
# Import the necessary modules.
import pandas as pd
from groq import Groq

client = Groq(
    #api_key="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"
    #api_key="gsk_muVrSn1gk3R5lPmmCJDMWGdyb3FYV8UMtMM2qMgY6hAeNQ0VhkrEz"
    api_key="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"
)

# Define a function to send a request to the LLM
def ask_llm(prompt, text):
  # Send the request to the LLM
   response = client.chat.completions.create(
      messages=[
          {
              "role": "user",
              "content": prompt + text,
          }
      ],
      model="mixtral-8x7b-32768",
    )
  # Return the response from the LLM
   return response.choices[0].message.content

question = """Please Provide a 15–200-word summary of the following text blocks. I will give each of the 31 text blocks to you one at a time. Make sure that the summaries are always less than or equal to the size of the input text. Also, please make sure that each output sentence is complete, coherent, and correctly punctuated.    Finally, please consider the following feedback for prior summarizations done by different models (their strengths and weakness to be addressed in your summaries as identified by an SME) and try to be sure that the feedback is considered in your summarization results.
The feedback is divided into 3 main points:
1.	Sentences recapping what CMS is proposing or that go into deep background on the issue do not need to be included in the summaries. The output summary should summarize what the commenter's position and/or recommendation is without including the rehashing of CMS's proposal and all the back history and evidence that is supporting the recommendation/response.
2.	 Short comments (<6 sentences) are tough to "summarize" because they are so short and copy/pasting the comment just makes the most sense. This probably applies to any original text <6 sentences in length. Generally, we wouldn't spend much time trying to paraphrase such short comment text.
3.	The overall question to consider for the medium and long comments: Is it better to have something there for the reviewers to work with, even if not the right sentences were copied by the AI, than to have no pre-populated draft summaries for the reviewers to start with? This will be answered by how well your summaries capture the key aspects of the original text."""
#text="Continuation   of   Non-Facility   Payment  Rate CMS proposes that telehealth services provided to patients in their homes should be reported with place   of service (POS) code 10, which was established in the 2022 final rule. CMS established policy in the 2023 final rule that,   for calendar year 2023,   Medicare would continue paying for telehealth   services at the non-facility payment rate instead of returning to its pre-PHE policy of paying for these services at the reduced facility payment rates that apply to services provided in hospital settings. (Although the facility rates are lower than the non-facility rates, for services provided in hospitals and other facility settings, Medicare   makes   a   separate   payment   to   the   facility   in   addition   to   the   payment for the   physician   service.) In the current rule, CMS proposes to align with the telehealth-related flexibilities that were extended via the CAA, 2023, by continuing to pay for telehealth services provided to patients in their homes at the non- facility payment rate for 2024 when the services are reported with POS 10. The AMA appreciates CMS’s recognition that physicians who provide both in-person office services and telehealth services need to receive sufficient compensation to cover the expense of maintaining their medical office."

groq_summary_output_df = pd.DataFrame(columns=["Text", "Summary"])
#for i in range(len(summary_df_EachTopic_human["Text"])):
for i in range(20):
    # Extract the text from the current row
    text = summary_df_EachTopic_human["Text"].iloc[i]
    # Perform NLP task
    summary = ask_llm(question, "\n" + text)

    # Append a new row to the DataFrame with both text and summary
    groq_summary_output_df = groq_summary_output_df.append({"Text": text, "Summary": summary}, ignore_index=True)

#groq_summary_output_df


In [None]:
###################################################
#ALT METHOD
###################################################
# Define the PDF filename
!pip install textract
pdf_filename = "CMS-2023-0121-0001_content_1920pp.pdf"

import pandas as pd
import textract

# Extract text using Textract
text = textract.process(pdf_filename).decode("utf-8")  # Decode for proper string handling

# Function to split text into sections (heuristic approach)
# Function to split text into sections (including page number if possible)
def split_into_sections(text):
  sections = []
  current_section = {"title": "", "text": "", "page_start": None}
  for line in text.splitlines():
    # Identify sections and page numbers based on your PDF structure (adjust)
    if line.startswith("Section: ") and " (Page: " in line:  # Adjust based on your PDF format
      if current_section["text"]:
        sections.append(current_section)
      section_title = line.split("Section: ")[1].split(" (")[0].strip()
      page_start = int(line.split(" (Page: ")[1].split(")")[0])
      current_section = {"title": section_title, "text": "", "page_start": page_start}
    else:
      current_section["text"] += line + "\n"  # Add newline for proper formatting
  if current_section["text"]:
    sections.append(current_section)
  return sections

# Split text into sections
sections = split_into_sections(text)

# Create DataFrame with columns: title, text, page_start
df = pd.DataFrame(sections, columns=["title", "text", "page_start"])

# Print the DataFrame (consider saving to CSV)



Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl (36 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9/106.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chardet==3.* (from textract)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docx2txt~=0.8 (from textract)
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting extract-msg<=0.29.* (from textract)
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m4.4 MB/s[0m eta [36

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

Custom TB Handler failed, unregistering


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-945ba43fe55b>", line 9, in <cell line: 9>
    import textract
ModuleNotFoundError: No module named 'textract'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 248, in wrapped
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/ultratb.py", line 281, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/usr/lib/python3.10/inspect.py", line 1662, in getinnerframes
   