<a href="https://colab.research.google.com/github/dscott-tantustech/ab_test_guide_in_python/blob/master/Summary_of_Summaries_AI_Script_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#MODULE 1: LIBRARY INSTALLATIONS
# Installations for NLP Work to create Summary of Summaries below
!pip install openai
!pip install tiktoken

In [None]:
#MODULE 2: READ IN EXTERNAL FILE
# Read in the Summary report file for creating a SUMMARY OF SUMMARIES.
# This example file contains 310 distinct MC summaries for 5 different subtopics selected by Ferhat for
# evaluation of the Summary of Summaries AI output:
#   2D2c_Telephone E/M Services
#   2F2_O/O E/M Visit Complexity Add-on
#   2F3_Split or Shared Visits
#   2H3_Payment Approaches for Skin Substitutes
#   2J1_Coverage of MFT Services and MHC Services
import pandas as pd

# Input workbook. The commented-out name is the smaller 5-subtopic example file described
# above; swap the two assignments to run against it instead.
#filename = 'summary_report_example_summaries_5_subtopics.xlsx'
filename = 'summary_report_all_summaries.xlsx'

# Load the workbook into a DataFrame. Module 3 reads its 'Topic', 'Subtopic' and
# 'Summary' columns.
summaries_df = pd.read_excel(filename)

# Show the last rows as a quick sanity check of the load (optional)
summaries_df.tail()

In [None]:
# MODULE 3: AGGREGATE SUMMARIES BY SUBTOPIC and add gpt-4 based token count
import tiktoken

#create token counter function. Either "gpt-4" or "gpt-4-turbo-preview" are used in summary models in the next module.
def num_tokens_from_string(string: str, encoding_name: str="gpt-4") -> int:
  """Return the number of tokens `string` occupies for a given OpenAI model.

  Args:
    string: Text to measure.
    encoding_name: Despite its name, this is an OpenAI *model* name (for
      example "gpt-4" or "gpt-4-turbo-preview"); the matching tokenizer is
      looked up with `tiktoken.encoding_for_model`.

  Returns:
    The token count of `string`.
  """
  try:
    encoding = tiktoken.encoding_for_model(encoding_name)
  except KeyError:
    # Model name unknown to the installed tiktoken: fall back to cl100k_base,
    # the encoding used by the gpt-4 family, instead of aborting the run.
    encoding = tiktoken.get_encoding("cl100k_base")
  # disallowed_special=() makes literal strings such as "<|endoftext|>" inside
  # the comment text count as ordinary text rather than raising ValueError.
  return len(encoding.encode(string, disallowed_special=()))

grouped_df = summaries_df.groupby('Subtopic')

# Wrapper placed around every individual summary so that each one stays
# identifiable inside the combined text (the trailing padding is intentional).
row_indicator_format = "SUMMARY {}: [begin summary] {} [end summary]             "  # Added extra spaces

# Build one record per subtopic: its topic, its name, and all of its summaries
# numbered from 1 and joined with newlines.
summary_rows = []
for Subtopic, group_df in grouped_df:
  # Take the topic from the first row of the group
  Topic = group_df['Topic'].iloc[0]

  # str.join avoids repeated string re-allocation and leaves no trailing newline
  combined_text = "\n".join(
      row_indicator_format.format(i, summary)
      for i, summary in enumerate(group_df['Summary'].astype(str), start=1)
  )

  summary_rows.append({
      'Topic': Topic,
      'Subtopic': Subtopic,
      'combined_text': combined_text,
  })

# One row per subtopic
summary_df_interim = pd.DataFrame(summary_rows, columns=['Topic', 'Subtopic', 'combined_text'])

# Drop the "(None)" placeholder subtopic.
# NOTE(review): the comparison is an exact match on this padded literal; confirm
# it matches the value exactly as it appears in the input workbook.
# .copy() gives an independent DataFrame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning.
summary_df_EachTopic = summary_df_interim[summary_df_interim['Subtopic'].ne(" (None)                                                                                           ")].copy()
#summary_df_EachTopic['Char_length']=summary_df_EachTopic['combined_text'].str.len()
summary_df_EachTopic['token_count'] = summary_df_EachTopic['combined_text'].apply(num_tokens_from_string)

# Write a temporary .xlsx file so the per-subtopic input can be inspected row by row
summary_df_EachTopic.to_excel('this_doc_comments.xlsx', index=False)

summary_df_EachTopic.tail()


In [52]:
#MODULE 4: openai/gpt4
# create gpt-4 summaries
#!pip install openai (this is taken care of in Module 1 but is also here for convenience)
import pandas as pd
import openai

# Set your OpenAI API key (get new key at https://platform.openai.com/account/api-keys).
# An OPENAI_API_KEY environment variable takes precedence, so a real key never has to be
# saved in the notebook; the placeholder below is used only when that variable is not set.
import os
openai.api_key = os.environ.get("OPENAI_API_KEY", 'need_openai_api_key_here')

def generate_gpt4_summaries_for_all_comments(df, max_output_tokens=675,
                                             gpt4_context_tokens=8192,
                                             prompt_reserve_tokens=256):
    """Generate one gpt-4 / gpt-4-turbo-preview summary per row of `df`.

    Args:
        df: DataFrame with the columns "Topic", "Subtopic", "combined_text"
            and "token_count" (token count of "combined_text").
        max_output_tokens: Cap on the generated summary length
            (500 words ~ 675 tokens).
        gpt4_context_tokens: Context window of the "gpt-4" model, shared by
            input and output tokens.
        prompt_reserve_tokens: Tokens held back for the instruction prompt and
            chat-message framing, which "token_count" does not include. This
            is a conservative estimate, not a measured value.

    Returns:
        DataFrame with the columns "Topic", "Subtopic", "Combined_Text",
        "token_count", "Summarizer_type" and "Summary". Rows whose API call
        failed are reported with print() and left out of the result.
    """
    output_columns = ["Topic", "Subtopic", "Combined_Text", "token_count", "Summarizer_type", "Summary"]

    long_prompt = """Summarize the following set of text blocks into a 500-word summary. The text blocks are from many different Medical
                                Professionals (and organizations representing them). Each block of text to be summarized represents several different
                                comments from multiple  commentors which have been collated together into a single text string. Please summarize the
                                overall perspective of all commentors considered in aggregation.
"""

    # Largest input that still fits gpt-4 once the summary and the instruction
    # prompt are accounted for. Without the prompt reserve, inputs just under
    # the limit would be sent to gpt-4 and rejected for exceeding its context.
    gpt4_input_budget = gpt4_context_tokens - max_output_tokens - prompt_reserve_tokens

    summary_rows = []
    for index, row in df.iterrows():
        combined_text = row["combined_text"]
        token_count = row["token_count"]

        # The text may span multiple lines; flatten it to a single line
        full_text = ' '.join(combined_text.split('\n'))

        # Use the larger-context model only when the input does not fit gpt-4
        row_model_choice = "gpt-4-turbo-preview" if token_count > gpt4_input_budget else "gpt-4"

        try:
            summary = get_gpt4_summary(full_text, long_prompt, row_model_choice, max_output_tokens)
        except Exception as e:
            # Best effort: report the failing row and keep processing the rest
            print(f"Error processing row {index}: {e} Check that text input is less than 128k tokens (approximately 200 pages.)")
            continue

        summary_rows.append({
            "Topic": row["Topic"],
            "Subtopic": row["Subtopic"],
            "Combined_Text": combined_text,  # Original (multi-line) combined text
            "token_count": token_count,
            "Summarizer_type": row_model_choice,
            "Summary": summary,
        })

    # Build the result once instead of growing a DataFrame row by row
    return pd.DataFrame(summary_rows, columns=output_columns)

def get_gpt4_summary(text, prompt, row_model_choice, max_output_tokens):
    """Ask an OpenAI chat model to summarize `text` and return the summary.

    The text and the instruction are sent together as a single user message,
    with the instruction appended after the text under a "Prompt:" label.
    `row_model_choice` is the model name and `max_output_tokens` caps the
    length of the reply. The returned string has surrounding whitespace
    stripped.
    """
    user_message = {
        "role": "user",
        "content": f"{text}\n\nPrompt: {prompt}",
    }

    # Single completion (n=1) at a moderate sampling temperature
    completion = openai.chat.completions.create(
        model=row_model_choice,
        messages=[user_message],
        max_tokens=max_output_tokens,
        temperature=0.5,
        n=1,
    )

    # The summary is the content of the first (only) returned choice
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

# Summarize every subtopic in 'summary_df_EachTopic' (built in Module 3) and save the
# result to an Excel file.
gpt4_summary_df = generate_gpt4_summaries_for_all_comments(summary_df_EachTopic)
gpt4_summary_df.to_excel("gpt4_summary_df.xlsx", index=False)

#### THE END of the Summary of Summaries Script

(The code below allows for an alternative LLM model (Claude-3), which has been evaluated positively by L&M, in the event of any issues with OpenAI.)

In [47]:
# MODULE 5 (OPTIONAL): Anthropic/Claude-3
#ALTERNATIVE APPROACH IF ANY ISSUES WITH OPENAI CALL: Claude-3
# for tier-1 API plan, can handle up to 4,000 requests per minute
#!pip install anthropic
import anthropic
import pandas as pd
import os

# A paid Anthropic account and API key are required; create a key at
# https://console.anthropic.com/settings/keys.
# setdefault keeps an ANTHROPIC_API_KEY that is already exported in the environment;
# the placeholder below is used only when that variable is not set.
os.environ.setdefault("ANTHROPIC_API_KEY", "need_anthropic_api_key")
client = anthropic.Anthropic()


def generate_claude3_summaries_for_all_comments(df):
  """Generate one Claude-3 summary per row of `df`.

  Args:
    df: DataFrame with the columns "Topic", "Subtopic", "combined_text" and
      "token_count".

  Returns:
    DataFrame with the columns "Topic", "Subtopic", "Combined_Text",
    "token_count", "Summarizer_type" (always "Claude-3") and "Summary".

  Any error raised by the API call propagates to the caller.
  """
  output_columns = ["Topic", "Subtopic", "Combined_Text", "token_count", "Summarizer_type", "Summary"]

  system_prompt = """Summarize the following set of text blocks into a 500-word summary. The text blocks are from many different Medical
                                Professionals (and organizations representing them). Each block of text to be summarized represents several different
                                comments from multiple  commentors which have been collated together into a single text string. Please summarize the
                                overall perspective of all commentors considered in aggregation.
"""

  # One client for the whole run, constructed without an explicit api_key as
  # in the per-row construction this replaces. The previous extra client built
  # with a hard-coded placeholder key was never used and has been removed.
  client = anthropic.Anthropic()

  summary_rows = []
  for _, row in df.iterrows():
    combined_text = row["combined_text"]

    # Instructions go in the system prompt; the text to summarize is the user turn
    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=4000,
        temperature=0,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": combined_text
                    }
                ]
            }
        ]
    )

    summary_rows.append({
        "Topic": row["Topic"],
        "Subtopic": row["Subtopic"],
        "Combined_Text": combined_text,
        "token_count": row["token_count"],
        "Summarizer_type": "Claude-3",
        "Summary": message.content[0].text.strip(),  # Strip leading/trailing whitespace
    })

  # Build the result once instead of growing a DataFrame row by row
  return pd.DataFrame(summary_rows, columns=output_columns)

# Summarize every subtopic in 'summary_df_EachTopic' (built in Module 3) with Claude-3 and
# save the result to an Excel file.
claude3_summary_df = generate_claude3_summaries_for_all_comments(summary_df_EachTopic)
claude3_summary_df.to_excel("claude3_summary_df.xlsx", index=False)


In [None]:
# MODULE 6(OPTIONAL): COMBINE 2 different model outputs for comparison.
import pandas as pd

# Stack the two model outputs (gpt4_summary_df from Module 4, claude3_summary_df from
# Module 5) into one DataFrame. They are concatenated directly: seeding the concat with
# an empty DataFrame adds nothing and triggers pandas' deprecation warning about
# empty entries in concat.
combined_df = pd.concat([gpt4_summary_df, claude3_summary_df], ignore_index=True)

# Character lengths, computed once and reused for all metrics below
combined_text_chars = combined_df['Combined_Text'].str.len()
summary_chars = combined_df['Summary'].str.len()

# Convert character length to pages (3625 characters per page), and add the
# summary-to-input length ratio formatted to 3 decimal places (stored as text)
combined_df['Combined_Text_page_length'] = round(combined_text_chars / 3625, 1)
combined_df['Summary_page_length'] = round(summary_chars / 3625, 1)
combined_df['ratio'] = (summary_chars / combined_text_chars).apply('{:.3f}'.format)

# Desired column order (token_count is intentionally not carried over)
desired_columns = ["Topic", "Subtopic", "Combined_Text", "Combined_Text_page_length", "Summarizer_type", "Summary",
                   "Summary_page_length", "ratio"]
combined_df = combined_df.reindex(columns=desired_columns)

combined_df.to_excel('combined_df.xlsx', index=False)
combined_df.tail(6)


Unnamed: 0,Topic,Subtopic,Combined_Text,Combined_Text_page_length,Summarizer_type,Summary,Summary_page_length,ratio
4,2J_Advancing Access to Behavioral Health Services,2J1_Coverage of MFT Services and MHC Services,SUMMARY 1: [begin summary] Premier Inc. applau...,7.8,gpt-4,The proposed changes by the Centers for Medica...,0.8,0.102
5,2D_Telehealth Services,2D2c_Telephone E/M Services,SUMMARY 1: [begin summary] The American Academ...,2.8,Claude-3,The majority of the medical professionals and ...,0.6,0.195
6,2F_E&M Visits,2F2_O/O E/M Visit Complexity Add-on,SUMMARY 1: [begin summary] The undersigned org...,23.6,Claude-3,Here is a 500-word summary of the key perspect...,0.8,0.035
7,2F_E&M Visits,2F3_Split or Shared Visits,SUMMARY 1: [begin summary] Premier Inc. suppor...,11.2,Claude-3,Here is a 500-word summary of the overall pers...,0.7,0.059
8,2H_Payment for Skin Substitutes,2H3_Payment Approaches for Skin Substitutes,SUMMARY 1: [begin summary] LifeNet Health (LNH...,3.1,Claude-3,The medical professionals and organizations wh...,0.6,0.199
9,2J_Advancing Access to Behavioral Health Services,2J1_Coverage of MFT Services and MHC Services,SUMMARY 1: [begin summary] Premier Inc. applau...,7.8,Claude-3,Here is a 500-word summary of the overall pers...,0.9,0.109
