In [1]:
! pip install --quiet openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m262.9/262.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import nltk
import re
import gensim
import matplotlib.pyplot as plt
import pandas as pd
import logging
import warnings
import numpy as np
import seaborn as sns
from typing import Optional
import tqdm
import time
import psutil
import os
from typing import Optional
from openai import OpenAI


# Fetch the NLTK resources used by this notebook. 'stopwords' and 'wordnet' back the
# stop-word list and the WordNet lemmatizer used in normalize_corpus.
# NOTE(review): 'punkt' is not referenced by name in the cells shown (tokenizing is done
# with a RegexpTokenizer) - confirm whether it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Global matplotlib look: fivethirtyeight style sheet with a white figure background.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.facecolor'] = 'white'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## config.py

In [3]:
class CONFIG:
    """Hyper-parameters shared by the bigram, dictionary and LDA steps.

    Every value is a plain instance attribute, so an instance can be tweaked
    after creation without touching the class.
    """

    def __init__(self):
        # Bigram detection (gensim Phrases): min_count / threshold.
        self.MIN_COUNT, self.THRESHOLD = 20, 10

        # Vocabulary pruning (Dictionary.filter_extremes): no_below / no_above.
        self.NO_BELOW, self.NO_ABOVE = 5, 0.95

        # LDA training: chunk size, iterations, passes (epochs).
        # The original trailing comments listed 2000 / 100 / 20 next to these values.
        self.CHUNKSIZE, self.ITERATIONS, self.PASSES = 1700, 80, 15


cfg = CONFIG()

## topic_modeling.py
### scripts

In [4]:
import nltk
import re
import gensim
import pandas as pd
import logging
import warnings
import numpy as np
import tqdm

# Module-level config instance read by gensim_build_bigrams_bow and
# topic_modeling_by_coherence (an identical instance is also created in the config cell).
cfg = CONFIG()

# Same NLTK resources as the import cell, fetched with quiet=True.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

def normalize_corpus(conversations):
    """
    Normalize raw conversations into lists of cleaned tokens.

    Each conversation is lower-cased, stripped of ``{{...}}`` placeholders,
    split into word-character runs, lemmatized with WordNet, and filtered to
    drop purely numeric tokens, one-character tokens and English stop words.

    Parameters:
    conversations (list): List of conversation strings to be normalized.

    Returns:
    list: One token list per conversation that still has tokens after cleaning.
          Conversations that end up with no tokens are omitted, so the result
          can be shorter than the input and positions may no longer line up.
    """
    # A set makes the per-token stop-word test O(1) instead of scanning a list.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    placeholder_pattern = re.compile(r'\{\{.*?\}\}')
    norm_conversations = []

    print("Normalizing conversations: ")
    for conversation in tqdm.tqdm(conversations):
        cleaned_text = placeholder_pattern.sub('', conversation.lower())

        conversation_tokens = []
        for token in tokenizer.tokenize(cleaned_text):
            if token.isnumeric():
                continue
            # Length and stop-word checks apply to the lemma, not the raw token.
            lemma = lemmatizer.lemmatize(token)
            if len(lemma) > 1 and lemma not in stop_words:
                conversation_tokens.append(lemma)

        if conversation_tokens:
            norm_conversations.append(conversation_tokens)

    return norm_conversations


def gensim_build_bigrams_bow(norm_conversations):
    """
    Detect bigrams and build the Bag of Words corpus for the conversations.

    Phrase detection and vocabulary pruning are driven by the module-level
    ``cfg`` (MIN_COUNT, THRESHOLD, NO_BELOW, NO_ABOVE).

    Parameters:
    norm_conversations (list): List of normalized conversations (token lists).

    Returns:
    tuple: (bow_corpus, dictionary, norm_conversations_bigrams) - the Bag of
           Words corpus, the pruned gensim dictionary, and the conversations
           with detected bigrams joined by '_'.
    """
    phrase_detector = gensim.models.Phrases(
        norm_conversations,
        min_count=cfg.MIN_COUNT,
        threshold=cfg.THRESHOLD,
        delimiter='_',
    )
    phraser = gensim.models.phrases.Phraser(phrase_detector)

    norm_conversations_bigrams = []
    for tokens in norm_conversations:
        norm_conversations_bigrams.append(phraser[tokens])

    dictionary = gensim.corpora.Dictionary(norm_conversations_bigrams)
    dictionary.filter_extremes(no_below=cfg.NO_BELOW, no_above=cfg.NO_ABOVE)

    bow_corpus = list(map(dictionary.doc2bow, norm_conversations_bigrams))
    return bow_corpus, dictionary, norm_conversations_bigrams

def topic_modeling_by_coherence(bow_corpus, conversations, dictionary, start_topic_count=2, end_topic_count=10, step=1):
    """
    Train one LDA model per candidate topic count and score each of them.

    Side effect: the 'gensim' logger level is set to ERROR and is not restored.

    Parameters:
    bow_corpus (list): Bag of Words corpus.
    conversations (list): Conversations with bigrams (token lists).
    dictionary (gensim.corpora.Dictionary): Gensim dictionary.
    start_topic_count (int): Starting number of topics (inclusive).
    end_topic_count (int): Ending number of topics (inclusive).
    step (int): Step size for the number of topics.

    Returns:
    tuple: (lda_models, coherence_df). ``lda_models`` holds one fitted model
           per candidate topic count, in order. ``coherence_df`` has one row
           per model with the columns 'Number of Topics', 'C_v Score',
           'UMass Score', 'Perplexity Score' (value of ``log_perplexity``)
           and 'Warnings' (first captured "updated prior is not positive"
           warning message, or None).
    """
    # Built once so the training loop and the result frame cannot disagree.
    topic_counts = range(start_topic_count, end_topic_count + 1, step)

    lda_models = []
    scores = {"coherence_c_v_scores": [], "coherence_umass_scores": [], "perplexity_scores": [], "warnings": []}

    gensim_logger = logging.getLogger('gensim')
    gensim_logger.setLevel(logging.ERROR)

    print("Fitting the n-topics iteration: ")
    for num_topics in tqdm.tqdm(topic_counts):
        with warnings.catch_warnings(record=True) as caught_warnings:
            # Record every warning so the prior-update ones can be reported per model.
            warnings.simplefilter("always")
            lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary, chunksize=cfg.CHUNKSIZE,
                                               alpha='auto', eta='auto', random_state=7, iterations=cfg.ITERATIONS,
                                               num_topics=num_topics, passes=cfg.PASSES, eval_every=None)
            lda_models.append(lda_model)

            # Coherence evaluations (same inputs, different measure).
            for measure, score_key in (('c_v', "coherence_c_v_scores"), ('u_mass', "coherence_umass_scores")):
                coherence = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, texts=conversations,
                                                         dictionary=dictionary, coherence=measure).get_coherence()
                scores[score_key].append(coherence)

            scores["perplexity_scores"].append(lda_model.log_perplexity(bow_corpus))

            # Keep only the first prior-related warning, if any was raised for this model.
            prior_warnings = [str(warning.message) for warning in caught_warnings
                              if "updated prior is not positive" in str(warning.message)]
            scores["warnings"].append(prior_warnings[0] if prior_warnings else None)

    # Dataframe for coherence scores
    coherence_df = pd.DataFrame({
        'Number of Topics': topic_counts,
        'C_v Score': np.round(scores["coherence_c_v_scores"], 4),
        'UMass Score': np.round(scores["coherence_umass_scores"], 4),
        'Perplexity Score': np.round(scores["perplexity_scores"], 4),
        'Warnings': scores["warnings"]
    })

    return lda_models, coherence_df

## llm_topic_labeling.py

In [5]:
def create_openai_client(api_key):
    """Return an ``OpenAI`` client constructed with the given ``api_key``."""
    return OpenAI(api_key=api_key)

def _capitalize_words(text):
    """Collapse whitespace and capitalize each word (the rest of each word is lower-cased)."""
    return ' '.join(word.capitalize() for word in text.split())


def generate_topic_labels(api_key, topics_keywords_as_list, context="chatbot conversations",
                          client=None, model="gpt-3.5-turbo"):
    """
    Ask an LLM for a short label per topic, then review every label in context.

    Pass 1 generates one label per topic from its keywords, telling the model
    which labels already exist. Pass 2 shows the model all labels with their
    first ten keywords and asks, topic by topic, for an improved label or the
    same one.

    Parameters:
    api_key: Key used to build a client when ``client`` is not supplied.
    topics_keywords_as_list (dict): Maps topic name to its ordered keyword list.
    context (str): Domain description interpolated into the prompts.
    client: Optional pre-built client exposing ``chat.completions.create``;
            when None, one is created with ``create_openai_client(api_key)``.
    model (str): Chat model name used for both passes.

    Returns:
    dict: Maps each topic name to its reviewed label. Empty input yields ``{}``.
    """
    if client is None:
        client = create_openai_client(api_key)

    existing_labels = []
    topic_labels = {}

    # Pass 1: one label per topic, aware of the labels generated so far.
    for topic, keywords in topics_keywords_as_list.items():
        system_prompt = (
            "You are designed to generate concise labels for topics. "
            "These topics are derived from chatbot conversations about {context} using LDA Topic Modeling. "
            "You will be provided with keywords which are the most representative words from the topic. "
            "The first keywords in the list are way more significant for topic assignment of the chat conversation, "
            "while the latter ones decrementally reduce their importance and should be used to provide additional context. "
            "Existing generated labels for other topics are: {existing}. "
            "Your task is to provide a single, pertinent label for each set of keywords representing a topic, "
            "ensuring the label accurately reflects the {context} context of the conversation."
            ).format(
                context=context,
                existing=', '.join(existing_labels) if existing_labels else "None"
            )

        prompt = f"Based on these keywords: {', '.join(keywords)}, suggest a concise topic label."

        chat_completion = client.chat.completions.create(
            model=model,
            max_tokens=10,
            temperature=0.4,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        )

        # The API may return no text content; treat that as an empty label.
        generated_label = _capitalize_words((chat_completion.choices[0].message.content or "").strip())

        topic_labels[topic] = generated_label
        existing_labels.append(generated_label)

    # Pass 2: label review and validation.
    # Only the static template goes through str.format; labels and keywords are
    # concatenated afterwards so braces inside them cannot break formatting.
    review_header = (
        "You are designed to review generated labels for topics. "
        "These topics are derived from chatbot conversations about {context} using LDA Topic Modeling. "
        "The LDA modeling gave the most impactful keywords for topic assignment of the chat conversation, "
        "The first keywords in the list of are way more significant for the topic assignment, "
        "while the latter ones decrementally reduce their importance and should be used to provide additional context only. "
        "The AI generated topics are as follows: \n"
    ).format(context=context)
    review_footer = (
        "Holistically considering all these previously generated labels and their keywords, "
        "for each topic suggest an improved label or keep the current label if it's optimal. "
        "Only answer with the new suggested label name or the initial label name if no improvement is needed. "
    )

    # Iterate over a snapshot of the keys because labels are updated in the loop.
    for topic in list(topic_labels):
        # Rebuilt each round so later reviews see the labels already revised.
        topic_details = "\n".join(
            f"{name}: '{label}' generated with these keywords: {', '.join(topics_keywords_as_list[name][:10])}"
            for name, label in topic_labels.items()
        )
        # The newline keeps the last topic line from running into the instructions.
        review_prompt = review_header + topic_details + "\n" + review_footer

        prompt = f"Considering all the topic labels and their keywords, suggest an improved label or keep the current label for {topic}. Only respond with the label name."

        review_completion = client.chat.completions.create(
            model=model,
            max_tokens=10,
            temperature=0.4,
            messages=[
                {"role": "system", "content": review_prompt},
                {"role": "user", "content": prompt}
            ]
        )

        reviewed_label = (review_completion.choices[0].message.content or "").strip()
        reviewed_label = reviewed_label.replace('"', '').replace("'", "").strip()
        reviewed_label = _capitalize_words(reviewed_label)
        # An empty review answer keeps the label from pass 1.
        if reviewed_label:
            topic_labels[topic] = reviewed_label

    return topic_labels

## optibot_modeling.py

In [6]:
class OptiBotModeling:
    """
    End-to-end topic modeling pipeline for a frame of conversations.

    ``fit`` normalizes the conversations, trains one LDA model per candidate
    topic count, keeps the model with the highest C_v score, assigns a
    dominant topic to every conversation and requests LLM-generated labels.
    Results are exposed through properties that raise ``ValueError`` until
    ``fit`` has produced them.
    """

    def __init__(self,
                 df: pd.DataFrame,
                 api_key,
                 context: str = "chatbot conversations",
                 start_topic_count: int = 3,
                 end_topic_count: int = 10,
                 step: int = 1):
        """
        Parameters:
        df (pd.DataFrame): Frame with a 'conversation' column.
        api_key: Key forwarded to ``generate_topic_labels``.
        context (str): Domain description forwarded to the labeling prompts.
        start_topic_count (int): First candidate number of topics (inclusive).
        end_topic_count (int): Last candidate number of topics (inclusive).
        step (int): Increment between candidate topic counts.
        """
        self.df_conversation = df["conversation"].to_list()
        self.api_key = api_key
        self.context: Optional[str] = context
        self.start_topic_count = int(start_topic_count)
        self.end_topic_count = int(end_topic_count)
        self.step = int(step)
        self._best_lda_model = None
        self._bow_corpus = None
        self._norm_conversations_bigrams = None
        self._topics_df: Optional[pd.DataFrame] = None
        self._topics_df_as_list: Optional[pd.DataFrame] = None
        self._coherence_df: Optional[pd.DataFrame] = None
        self._corpus_topic_df: Optional[pd.DataFrame] = None
        # Last figure built by show_coherence_plot; read by the coherence_plot property.
        self._coherence_plot = None
        self.best_number_topics = None
        self.best_coherence_score = None
        self.execution_time = None
        self.resource_usage = None

    def fit(self):
        """
        Run the full pipeline and populate every result attribute.

        Also records ``execution_time`` (seconds) and ``resource_usage``
        (change in process RSS, in MB) for the whole run.

        Raises:
        ValueError: If normalization drops any conversation, because topic
                    assignments could then not be paired with the originals.
        """
        start_time = time.time()
        initial_memory_use = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

        norm_conversations = normalize_corpus(self.df_conversation)
        # Fail before the expensive LDA runs: per-document topics are later paired
        # with self.df_conversation by position, which needs equal lengths.
        if len(norm_conversations) != len(self.df_conversation):
            dropped = len(self.df_conversation) - len(norm_conversations)
            raise ValueError(
                f"{dropped} conversation(s) had no usable tokens after normalization; "
                "remove them from the input before calling 'fit'."
            )

        self._bow_corpus, dictionary, self._norm_conversations_bigrams = gensim_build_bigrams_bow(norm_conversations)

        lda_models, self._coherence_df = topic_modeling_by_coherence(
            bow_corpus=self._bow_corpus,
            conversations=self._norm_conversations_bigrams,
            dictionary=dictionary,
            start_topic_count=self.start_topic_count,
            end_topic_count=self.end_topic_count,
            step=self.step  # previously omitted, so a custom step was ignored
        )

        best_model_idx = self._coherence_df['C_v Score'].idxmax()
        self._best_lda_model = lda_models[best_model_idx]
        self.best_number_topics = self._coherence_df['Number of Topics'].iloc[best_model_idx]
        self.best_coherence_score = self._coherence_df['C_v Score'].iloc[best_model_idx]

        del lda_models  # drop references to the models that were not selected
        self._fit_topics_on_data()

        end_memory_use = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
        self.execution_time = round(time.time() - start_time, 3)
        self.resource_usage = round(end_memory_use - initial_memory_use, 3)  # in MB

    def _fit_topics_on_data(self):
        """
        Build the topic tables and per-conversation assignments from the best model.

        Raises:
        ValueError: If no model has been selected yet.
        """
        if self._best_lda_model is None:
            raise ValueError("Model not fitted. Call 'fit' before this method.")

        num_topics = self._best_lda_model.num_topics
        topic_index = ['Topic' + str(t) for t in range(1, num_topics + 1)]

        # Top 20 terms of every topic, in the order returned by the model.
        topic_terms = [[term for term, _ in self._best_lda_model.show_topic(n, topn=20)]
                       for n in range(num_topics)]

        self._topics_df = pd.DataFrame([', '.join(terms) for terms in topic_terms],
                                       columns=['Terms per Topic'],
                                       index=topic_index)

        # Same terms, kept as one Python list per topic.
        self._topics_df_as_list = pd.DataFrame([[terms] for terms in topic_terms],
                                               columns=['Terms per Topic'],
                                               index=topic_index)

        # Highest-weighted (topic id, weight) pair per document; the first one wins ties.
        tm_results = self._best_lda_model[self._bow_corpus]
        corpus_topics = [max(doc_topics, key=lambda record: record[1])
                         for doc_topics in tm_results]

        # Integrate topic modeling results into original conversations.
        # Topic ids are shifted by one to match the 'Topic1'-style names.
        self._corpus_topic_df = pd.DataFrame({
            'Dominant Topic': [item[0] + 1 for item in corpus_topics],
            'Contribution %': [round(item[1] * 100, 2) for item in corpus_topics],
            'Conversation': self.df_conversation,
        })

        # Generate topic labels
        topics_keywords_as_list = self._topics_df_as_list.to_dict()["Terms per Topic"]
        topic_labels = generate_topic_labels(self.api_key, topics_keywords_as_list, self.context)

        def map_topic_label(topic_number):
            return topic_labels.get(f"Topic{topic_number}", "Unknown Topic")

        self._corpus_topic_df.insert(
            1, 'Topic Label', self._corpus_topic_df['Dominant Topic'].apply(map_topic_label))
        self._topics_df['Topic Label'] = self._topics_df.index.map(topic_labels)

    def show_coherence_plot(self, save=False):
        """
        Plot the C_v score against the number of topics and circle the best one.

        Parameters:
        save (bool): When True, also write the figure to 'coherence_plot.png'.

        Returns:
        The matplotlib figure.

        Raises:
        ValueError: If ``fit`` has not produced coherence scores yet.
        """
        coherence_df = self.coherence_df  # raises ValueError when not fitted

        # Read the x values from the frame so they always match the scores.
        coherence_scores = coherence_df["C_v Score"]
        topic_counts = coherence_df["Number of Topics"]
        max_score_index = coherence_scores.idxmax()
        max_score = coherence_scores[max_score_index]
        max_score_topic = topic_counts[max_score_index]

        # Create the plot
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(topic_counts, coherence_scores, c='r')
        ax.axhline(y=0.5, c='k', linestyle='--', linewidth=2)
        ax.set_xlabel('Number of Topics')
        ax.set_ylabel('Coherence C_v Score')
        ax.set_title('Topic Coherence')
        ax.set_facecolor('#f0f0f0')
        fig.patch.set_facecolor('white')
        ax.grid(True)
        ax.scatter(max_score_topic, max_score, s=500, edgecolors='blue', facecolors='none', linewidths=5, zorder=5)

        if save:
            fig.savefig('coherence_plot.png', bbox_inches='tight')

        self._coherence_plot = fig
        return fig

    @property
    def topics_df(self) -> pd.DataFrame:
        """Topics with their comma-joined terms and labels."""
        if self._topics_df is None:
            raise ValueError("Topics not generated. Call 'fit' to generate topics.")
        return self._topics_df

    @property
    def topics_df_as_list(self) -> pd.DataFrame:
        """Topics with their terms kept as Python lists."""
        if self._topics_df_as_list is None:
            raise ValueError("Topics not generated. Call 'fit' to generate topics.")
        return self._topics_df_as_list

    @property
    def corpus_topic_df(self) -> pd.DataFrame:
        """Per-conversation dominant topic, label and contribution."""
        if self._corpus_topic_df is None:
            raise ValueError("Corpus topic not generated. Call 'fit' to generate corpus topics.")
        return self._corpus_topic_df

    @property
    def coherence_df(self) -> pd.DataFrame:
        """Scores of every candidate topic count."""
        if self._coherence_df is None:
            raise ValueError("Coherence scores not generated. Call 'fit' to generate coherence scores.")
        return self._coherence_df

    @property
    def coherence_plot(self):
        """Most recent coherence figure, built on first access after ``fit``."""
        if self._coherence_plot is None:
            # show_coherence_plot raises ValueError when the model is not fitted.
            self._coherence_plot = self.show_coherence_plot()
        return self._coherence_plot

## load_data.py

In [7]:
def merge_multiple_qa_columns(df, question_cols, answer_cols):
    """
    Add a 'conversation' column built from several question and answer columns.

    For each row the question columns are cast to str and space-joined, the
    answer columns likewise, and the two parts are combined as
    "Question: <questions> Answer: <answers>". ``df`` is modified in place and
    also returned.
    """
    def join_row(row):
        return ' '.join(row.astype(str))

    question_text = df[question_cols].apply(join_row, axis=1)
    answer_text = df[answer_cols].apply(join_row, axis=1)

    df['conversation'] = "Question: " + question_text + " Answer: " + answer_text
    return df

def extract_unified_column(df, col):
    """
    Expose an existing column under the name 'conversation'.

    ``df`` is modified in place (the values of ``col`` are assigned to
    'conversation') and also returned.
    """
    unified = df[col]
    df["conversation"] = unified
    return df


## clients.py

In [24]:
def create_openai_client(api_key):
    """Return an ``OpenAI`` client constructed with the given ``api_key``.

    Same definition as the one in the llm_topic_labeling cell.
    """
    return OpenAI(api_key=api_key)

## Satisfaction Analysis with LLMs

In [8]:
! pip install --quiet datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
from google.colab import userdata

api_key = userdata.get('OPENAI_API_KEY')

In [10]:
from datasets import load_dataset

# Size and seed of the subsample used for topic modeling. The seed is fixed so a
# Restart & Run All picks the same rows.
SAMPLE_SIZE = 50000
SAMPLE_SEED = 7

dataset = load_dataset("ruslanmv/ai-medical-chatbot")
medical_ai_df = dataset['train'].to_pandas()
# Sample first: the row-wise merge below then runs on the subsample only,
# and with a fixed seed the selected rows do not depend on the order of the two steps.
medical_ai_df = medical_ai_df.sample(SAMPLE_SIZE, random_state=SAMPLE_SEED).reset_index(drop=True)
medical_ai_df = merge_multiple_qa_columns(medical_ai_df, ["Description", "Patient"], ["Doctor"])
medical_ai_df.head()

Downloading readme:   0%|          | 0.00/863 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

Unnamed: 0,Description,Patient,Doctor,conversation
0,Can gallbladder removal be the cause of stabbi...,Had my gallbladder removed 28 August. When I t...,hi. noted history of cholecystectomy and abdom...,Question: Can gallbladder removal be the cause...
1,Does nuchal fold in fetus at 19 weeks cause Do...,Hi my name is sarah I have a few questions. At...,"Hi, Nasal bone is not confirmatory. If suspici...",Question: Does nuchal fold in fetus at 19 week...
2,Suggest treatment for testicle pain post mastu...,"Hi Doctors, I'm 26 and I feel a mild pain in m...",Hello and Welcome to ‘Ask A Doctor’ service. I...,Question: Suggest treatment for testicle pain ...
3,What is the dull aching pain on my right side?,Hi i am a 44yr old female & i have a dull achi...,Hello! Welcome to HCM.This seems to be urinary...,Question: What is the dull aching pain on my r...
4,Is it normal for the child to have loose motio...,Hi my son aged 4 has nephrotic syndrome but ha...,antibiotics are not indicated for every respir...,Question: Is it normal for the child to have l...


In [11]:
# Domain description interpolated into the labeling and evaluation prompts.
context = "General medical questions"

# Reuse the `api_key` loaded in the secrets cell instead of fetching the secret again,
# so this cell and the later evaluation cell use the same value.
medical_ai_topics = OptiBotModeling(
    medical_ai_df,
    api_key,
    context=context,
    start_topic_count=5,
    end_topic_count=20,
)
medical_ai_topics.fit()

Normalizing conversations: 


100%|██████████| 50000/50000 [01:33<00:00, 534.25it/s]


Fitting the n-topics iteration: 


100%|██████████| 1/1 [07:49<00:00, 469.21s/it]


In [18]:
medical_ai_topics.topics_df.head(4)

Unnamed: 0,Terms per Topic,Topic Label
Topic1,"take, medicine, taking, doctor, day, medicatio...",Medication Management
Topic2,"chest, cough, thanks, lung, question, asthma, ...",Respiratory Health Concerns
Topic3,"erection, luck, query_available, understand_co...",Mens Sexual Health Concerns
Topic4,"pain, back, cause, question, get, right, wa, a...",Back Pain Causes And Management


In [19]:
medical_ai_topics.topics_df.tail(3)

Unnamed: 0,Terms per Topic,Topic Label
Topic15,"conceive, day, suggest, infertility, contracep...",Fertility Treatment Options
Topic16,"penis, like, skin, bump, question, could, spot...",Genital Skin Condition Evaluation
Topic17,"rash, skin, cream, use, itching, hair, face, t...",Skin Rash Evaluation And Treatment


In [21]:
medical_ai_topics.corpus_topic_df.head(5)

Unnamed: 0,Dominant Topic,Topic Label,Contribution %,Conversation
0,5,Medical Consultation Inquiries,35.59,Question: Can gallbladder removal be the cause...
1,10,Pediatric Fever And Infection Evaluation,31.8,Question: Does nuchal fold in fetus at 19 week...
2,4,Back Pain Causes And Management,33.37,Question: Suggest treatment for testicle pain ...
3,4,Back Pain Causes And Management,44.49,Question: What is the dull aching pain on my r...
4,10,Pediatric Fever And Infection Evaluation,50.41,Question: Is it normal for the child to have l...


In [28]:
def create_openai_client(api_key):
    """Return an ``OpenAI`` client constructed with the given ``api_key``.

    Same definition as the one in the llm_topic_labeling cell.
    """
    return OpenAI(api_key=api_key)

def evaluate_response(api_key, corpus_row, context="chatbot conversations", client=None, model="gpt-3.5-turbo"):
    """
    Ask an LLM to score one question/answer conversation on five criteria.

    Parameters:
    api_key: Key used to build a client when ``client`` is not supplied.
    corpus_row: Mapping or row with 'Conversation' and 'Topic Label' entries.
    context (str): Domain description interpolated into the system prompt.
    client: Optional pre-built client exposing ``chat.completions.create``;
            pass one to avoid creating a new client for every row.
    model (str): Chat model name.

    Returns:
    str: The model's reply with surrounding whitespace removed. The prompt
         requests five "n: score" lines followed by an "Assessment:" line.
         An empty string is returned when the reply has no text content.
    """
    if client is None:
        client = create_openai_client(api_key)
    conversation = corpus_row['Conversation']
    topic = corpus_row['Topic Label']

    system_prompt = (
        "You are designed to review question and answer pairs from chatbot conversations about {context}. "
        "You will evaluate the chatbot's response based on the following criteria: "
        "1. Relevance (does the answer address the question?), "
        "2. Accuracy (is the information provided correct? perform a fact-check), "
        "3. Completeness (does the answer cover all necessary aspects of the question?), "
        "4. Conciseness (is the response easy to understand? think Flesch-Kincaid Readability), "
        "5. Tone (is the response engaging and appropriately toned?). "
        # Sentence break added: the two instructions used to run together as "assessmentRespnse".
        "Rate each criterion on a scale of 1 to 5 and give a very short and concise assessment. "
        "Response format should strictly be as the following example: "
        "1: score \n"
        "2: score \n"
        "3: score \n"
        "4: score \n"
        "5: score \n"
        "Assessment: short and concise assessment. "
    ).format(
        context = context
    )

    # Conversation text is passed as a format value, so braces inside it are kept verbatim.
    prompt = (
        "Review the following conversation related to the topic '{topic}': "
        "'{conversation}'"
    ).format(
        topic = topic,
        conversation = conversation
    )

    gpt_response = client.chat.completions.create(
        model=model,
        temperature=0.7,
        max_tokens=300,
        messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
    )

    response = (gpt_response.choices[0].message.content or "").strip()

    return response

In [43]:
%%time
df_assessed_conversations = medical_ai_topics.corpus_topic_df.head(100).copy()
df_assessed_conversations['LLM Evaluation'] = df_assessed_conversations.apply(lambda row: evaluate_response(api_key, row, context=context), axis=1)

CPU times: user 9.7 s, sys: 230 ms, total: 9.93 s
Wall time: 2min 53s


In [38]:
# Example of prompt result
print(df_assessed_conversations["LLM Evaluation"].iloc[0])

1: 5
2: 5
3: 5
4: 4
5: 5
Assessment: The response is relevant, accurate, complete, and engaging, providing appropriate advice and guidance for the situation described. The tone is reassuring and informative.


In [44]:
def parse_llm_evaluation(row):
    """
    Parse the text produced by ``evaluate_response`` into separate fields.

    Scores are located by their criterion number at the start of a line
    ("1: 5", "2: 4/5", ...) instead of by line position, so preambles and
    blank lines in the reply do not break parsing. Parsing is best-effort:
    whatever cannot be found stays None and a message is printed.

    Parameters:
    row: Mapping or row with an 'LLM Evaluation' entry holding the reply text.

    Returns:
    pd.Series: Always indexed by 'Relevance', 'Accuracy', 'Completeness',
               'Conciseness', 'Tone', 'Assessment', 'Average Score'.
               'Average Score' is set only when all five scores were found.
    """
    criteria = ('Relevance', 'Accuracy', 'Completeness', 'Conciseness', 'Tone')
    # Every key is present from the start so all rows yield the same columns.
    scores = dict.fromkeys(criteria + ('Assessment', 'Average Score'))

    text = row.get('LLM Evaluation')
    if not isinstance(text, str):
        print(f"Error parsing LLM evaluation: expected text, got {type(text).__name__}")
        return pd.Series(scores)

    # Criterion number 1-5 at a line start, then the first integer after it on that line.
    for number, value in re.findall(r'^\s*([1-5])\b[^\d\n]*(\d+)', text, flags=re.MULTILINE):
        criterion = criteria[int(number) - 1]
        if scores[criterion] is None:  # the first occurrence wins
            scores[criterion] = int(value)

    assessment = re.search(r'Assessment:\s*(.*)', text)
    if assessment:
        scores['Assessment'] = assessment.group(1).strip()
    else:
        print("Error parsing LLM evaluation: no 'Assessment:' line found")

    missing = [criterion for criterion in criteria if scores[criterion] is None]
    if missing:
        print(f"Error parsing LLM evaluation: missing score(s) for {', '.join(missing)}")
    else:
        score_values = [scores[criterion] for criterion in criteria]
        scores['Average Score'] = sum(score_values) / len(score_values)

    return pd.Series(scores)

# One row of parsed scores per assessed conversation.
score_columns = df_assessed_conversations.apply(parse_llm_evaluation, axis=1)

# Append the parsed fields and remove the raw reply they were parsed from.
# Not safe to re-run: the 'LLM Evaluation' column no longer exists after this cell.
df_assessed_conversations = (
    pd.concat([df_assessed_conversations, score_columns], axis=1)
    .drop(columns='LLM Evaluation')
)


Unnamed: 0,Dominant Topic,Topic Label,Contribution %,Conversation,Relevance,Accuracy,Completeness,Conciseness,Tone,Assessment,Average Score
0,5,Medical Consultation Inquiries,35.59,Question: Can gallbladder removal be the cause...,5,5,5,4,4,"The response is relevant, accurate, and comple...",4.6
1,10,Pediatric Fever And Infection Evaluation,31.80,Question: Does nuchal fold in fetus at 19 week...,2,3,2,2,3,The response partially addresses the question ...,2.4
2,4,Back Pain Causes And Management,33.37,Question: Suggest treatment for testicle pain ...,1,1,1,1,1,The response does not address the question abo...,1.0
3,4,Back Pain Causes And Management,44.49,Question: What is the dull aching pain on my r...,2,1,1,2,3,The response is not relevant as it does not ad...,1.8
4,10,Pediatric Fever And Infection Evaluation,50.41,Question: Is it normal for the child to have l...,3,3,2,2,2,The response provides some relevant informatio...,2.4
...,...,...,...,...,...,...,...,...,...,...,...
95,10,Pediatric Fever And Infection Evaluation,24.89,Question: 3 years old with swollen gums and pu...,1,2,2,1,2,"The response is not relevant, inaccurate, inco...",1.6
96,5,Medical Consultation Inquiries,54.69,Question: Is gun shooting safe during pregnanc...,5,5,5,3,4,"The response is relevant, accurate, and provid...",4.4
97,4,Back Pain Causes And Management,55.09,Question: I had some cartilage from my left ri...,3,2,2,2,2,The response provided some relevant advice but...,2.2
98,8,Blood Pressure Management,45.05,Question: Should further tests be done for lef...,2,2,1,3,3,The response lacks relevance as it does not di...,2.2


## Plotting Distribution

In [40]:
df_plot = medical_ai_topics.corpus_topic_df.copy()

In [41]:
import plotly.express as px
import numpy as np

# Conversations per (topic number, topic label), with their share of the total
# and a combined label used as the categorical x value; largest share first.
topic_group = (
    df_plot.groupby(['Dominant Topic', 'Topic Label'])
    .size()
    .reset_index(name='Count')
    .assign(
        Percentage=lambda g: (g['Count'] / g['Count'].sum()) * 100,
        Label=lambda g: g['Dominant Topic'].astype(str) + ' ' + g['Topic Label'],
    )
    .sort_values(by='Percentage', ascending=False)
)

total_count = topic_group['Count'].sum()

fig = px.bar(
    topic_group,
    x='Label',
    y='Count',
    text=np.round(topic_group['Percentage'], 2),
    labels={'Count': 'Count', 'Label': 'Topic'},
    title='',
)

# Hover text, one entry per displayed line.
hover_lines = [
    '<b>Topic Number</b>: %{x}',
    '<b>Topic Label</b>: %{customdata}',
    '<b>Count</b>: %{y} of ' + str(total_count),
    '<b>Percentage</b>: %{text}%',
]

fig.update_traces(
    texttemplate='%{text}%',
    textposition='outside',
    hovertemplate='<br>'.join(hover_lines),
    customdata=topic_group['Topic Label'],
    hoverlabel=dict(font=dict(size=17)),
    marker_color='#4C72B0',  # Set the bar color
)

# Ticks sit on the combined labels but show only the topic number.
fig.update_layout(
    xaxis_title='Topic Number',
    yaxis_title='Count',
    plot_bgcolor='#f0f0f0',
    paper_bgcolor='white',
    margin=dict(l=40, r=40, t=40, b=40),
    xaxis={
        'tickmode': 'array',
        'tickvals': topic_group['Label'],
        'ticktext': topic_group['Dominant Topic'].astype(str).tolist(),
        'title_font': {'color': 'black'},
        'tickfont': {'color': 'black'},
    },
    yaxis={
        'title_font': {'color': 'black'},
        'tickfont': {'color': 'black'},
    },
    title={'text': 'Distribution of Topics by Label', 'font': {'color': 'black'}},
)

fig.show()