In [1]:
from itertools import permutations
import pandas as pd
import random
from transformers import BertTokenizer

In [24]:
probes_df = pd.read_csv('../data/Catalog - Probes.csv')
probes_df.head()



Unnamed: 0,Manufacturer,Probe_Model,Connection_Type,Compatible_Systems,Array_Type,Frequency_Range,Applications,Stock,Description
0,ATL,C3,,HDI 5000,Convex Array,3 MHz,"abdominal, general",0,The ATL C3 is a convex array ultrasound transd...
1,ATL,C4-2,,HDI 5000,Convex Array,2-4 MHz,"abdominal, obstetric, gynecologic, general",2,The ATL C4-2 is a convex array ultrasound prob...
2,ATL,C5-2,,"HDI 1500, HDI 3000, HDI 3500, HDI 5000",Convex Array,2-5 MHz,abdominal,7,The ATL C5-2 convex Array transducer is a vers...
3,ATL,C5-IVT,,"UM9 HDI, HDI 1500, HDI 3000, HDI 3500, HDI 5000",Convex Array,4-6 MHz,"intracavitary, endovaginal, gynecological",0,The ATL C5-IVT convex linear ultrasound transd...
4,ATL,C7-4,,"UM9 HDI, HDI 1500, HDI 3000, HDI 5000",Convex Array,4-7 MHz,"cardiac, vascular, general",0,The ATL C7-4 convex linear ultrasound transduc...


In [2]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def create_context(question, row, relevant_columns):
    context_parts = [f"Question: {question}"]
    for col in relevant_columns:
        value = row[col]
        if isinstance(value, str):
            context_parts.append(value)
        elif isinstance(value, list):
            context_parts.append(' '.join(value))
        else:
            context_parts.append(str(value))
    context = ' Context: '.join(context_parts)
    return context

In [25]:
# Extract distinct manufacturers from the DataFrame
all_manufacturers = probes_df['Manufacturer'].dropna().unique().tolist()

# Extract all distinct ultrasound systems from the Compatible_Systems column
compatibility_df = probes_df.copy()
compatibility_df['Compatible_Systems'] = compatibility_df['Compatible_Systems'].str.split(',')
compatibility_df = compatibility_df.explode('Compatible_Systems')
compatibility_df['Compatible_Systems'] = compatibility_df['Compatible_Systems'].str.strip()
all_systems = set(compatibility_df['Compatible_Systems'].dropna().unique())

# Extract distinct manufacturers from the DataFrame
all_manufacturers = probes_df['Manufacturer'].dropna().unique().tolist()

# Create a dictionary to map Probe_Model to a list of manufacturers and connection types
model_details = {}
for index, row in probes_df.iterrows():
    model = row['Probe_Model']
    manufacturer = row['Manufacturer']
    connection_type = row['Connection_Type']
    if model not in model_details:
        model_details[model] = {}
    if manufacturer not in model_details[model]:
        model_details[model][manufacturer] = set()
    model_details[model][manufacturer].add(connection_type)

# Initialize an empty list to store the QA data
qa_data = []

# Loop through each row in the DataFrame to generate QA pairs
for index, row in probes_df.iterrows():
    manufacturer = row['Manufacturer']
    probe_model = row['Probe_Model']
    connection_type = row['Connection_Type']
    connection_info = f" with a {connection_type} connector" if connection_type == "Cartridge" else "" # Include connection type in the question if it's not empty
    compatible_systems = row['Compatible_Systems'].split(';') if pd.notna(row['Compatible_Systems']) else []
    compatible_systems = [system.strip() for system in compatible_systems] # Clean and update compatible systems list
    array_type = row['Array_Type'] if 'Array_Type' in row else 'Unknown'
    frequency_range = row['Frequency_Range'] if 'Frequency_Range' in row else 'Unknown'
    application_types = row['Applications'].split(';') if 'Applications' in row else []
    stock = row['Stock'] if 'Stock' in row else 0

    # Q1 with random replacement of "manufacturer" with "maker"
    manufacturer_term = random.choice(["manufacturer", "maker"])
    suffix = random.choice(["", " transducer", " probe"])
    question1 = f"Who is the {manufacturer_term} of {probe_model}{connection_info}{suffix}?"
    if connection_type == "Cartridge":
        answer1 = f"The manufacturer of the {probe_model} probe {connection_info} is {manufacturer}."
    else:
        answer1 = f"The manufacturer of the {probe_model} probe {connection_info} is {manufacturer}."
    # Create the context
    context = create_context(question1, row, ['Manufacturer'])
    # Tokenize the context and answer
    encoding = tokenizer.encode_plus(context, answer1, truncation=True, max_length=512, padding='max_length', return_offsets_mapping=True)
    # Find the start and end token positions of the answer within the context
    answer_start = encoding.char_to_token(len(context), 0)[1]
    answer_end = encoding.char_to_token(len(context) + len(answer1), 0)[1]
    qa_data.append({"Question": question1, "Answer": answer1, "Categories": ["Manufacturer"], "Tags": [manufacturer, probe_model]})

    # Q3
    manufacturers_for_model = list(model_details[probe_model].keys())
    if len(manufacturers_for_model) > 1:
        false_manufacturer = random.choice([fm for fm in manufacturers_for_model if fm != manufacturer])
        answer3 = f"The {probe_model}{connection_info} is made by both {manufacturer} and {false_manufacturer}."
    else:
        false_manufacturer = random.choice([fm for fm in all_manufacturers if fm != manufacturer])
        answer3 = f"The {probe_model}{connection_info} probe is made by {manufacturer}."
    question3 = f"Is {probe_model}{connection_info} made by {manufacturer} or {false_manufacturer}?"
    qa_data.append({"Question": question3, "Answer": answer3, "Categories": ["Manufacturer"], "Tags": [manufacturer, probe_model]})

    # Q4 about system compatibility
    compatible_systems_str = ', '.join([system for sublist in compatible_systems for system in sublist.split(',')])
    question4 = f"What systems is {manufacturer} {probe_model}{connection_info} compatible with?"
    answer4 = f"The {manufacturer} {probe_model}{connection_info} is compatible with {compatible_systems_str} ultrasound systems."
    qa_data.append({"Question": question4, "Answer": answer4, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model, ', '.join(compatible_systems)]})

    # Q5 about specific system compatibility (always "Yes")
    action_word = random.choice(["Can", "Does"])
    if compatible_systems:
        compatible_systems_flat = [system for sublist in compatible_systems for system in sublist.split(',')]
        system = random.choice(compatible_systems_flat)
        question5 = f"{action_word} the {manufacturer} {probe_model}{connection_info} work with the {system}?"
        additional_systems = [s for s in compatible_systems_flat if s != system]
        if additional_systems:
            additional_systems_str = ", ".join(additional_systems)
            answer5 = f"Yes, the {manufacturer} {probe_model}{connection_info} is compatible with the {system} ultrasound system, as well as {additional_systems_str}."
        else:
            answer5 = f"Yes, the {manufacturer} {probe_model}{connection_info} is compatible with the {system} ultrasound system."
        qa_data.append({"Question": question5, "Answer": answer5, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model, system]})

    # Q6 about specific system compatibility (always "No")
    non_compatible_systems = list(all_systems - set([system for sublist in compatible_systems for system in sublist.split(',')]))
    if non_compatible_systems:
        system = random.choice(list(non_compatible_systems))
        question6 = f"{random.choice(['Can', 'Does'])} the {manufacturer} {probe_model}{connection_info} work with the {system}?"
        compatible_systems_str = ', '.join([system for sublist in compatible_systems for system in sublist.split(',')])
        answer6 = f"No, the {manufacturer} {probe_model}{connection_info} is not compatible with the {system} ultrasound system."
        qa_data.append({"Question": question6, "Answer": answer6, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model]})

    # Q7 about probe type and characteristics
    question7 = f"What type of probe is the {manufacturer} {probe_model}{connection_info}?"
    answer7 = f"The {manufacturer} {probe_model} is a {array_type} probe with a frequency range of {frequency_range}."
    qa_data.append({"Question": question7, "Answer": answer7, "Categories": ["Array Type", "Frequency Range", "Probe Type"], "Tags": [manufacturer, probe_model, array_type]})

    # Q8 about application type
    formatted_application_types = ", ".join(application_types[:-1]) + ", and " + application_types[-1] if len(application_types) > 1 else ", ".join(application_types)
    question8 = f"What can the {manufacturer} {probe_model}{connection_info} be used for?"
    answer8 = f"The {manufacturer} {probe_model}{connection_info} can be used for {formatted_application_types} applications."
    qa_data.append({"Question": question8, "Answer": answer8, "Categories": ["Applications"], "Tags": [manufacturer, probe_model, *application_types]})

    # Q9 about availability for sale or in stock
    sale_or_stock = random.choice(["for sale", "in stock"])
    question9 = f"Do you have any {manufacturer} {probe_model}{connection_info} {sale_or_stock}?"
    if stock > 0:
        answer9 = f"Yes, we currently have {manufacturer} {probe_model}{connection_info} available for sale."
    else:
        answer9 = f"We currently do not have any {manufacturer} {probe_model}{connection_info} in stock."
    qa_data.append({"Question": question9, "Answer": answer9, "Categories": ["Stock"], "Tags": [manufacturer, probe_model]})

    # Q10 
    if compatible_systems:
        # Split each string in compatible_systems by comma and flatten the list
        all_compatible_systems = [system.strip() for sublist in compatible_systems for system in sublist.split(',')]
        
        if all_compatible_systems:
            selected_system = random.choice(all_compatible_systems)
            question10 = f"Do you have any {probe_model}{connection_info} for {selected_system} for sale?"
            
            # Determine the answer based on the stock
            if stock > 0:
                answer10 = f"Yes, we currently have the {manufacturer} {probe_model}{connection_info} for the {selected_system} available for sale."
            else:
                answer10 = f"Unfortunately, we do not have any {manufacturer} {probe_model}{connection_info} for the {selected_system} available for sale."
            qa_data.append({"Question": question10, "Answer": answer10, "Categories": ["Stock", "Compatibility"], "Tags": [manufacturer, probe_model, selected_system]})
        else:
            qa_data.append({"Question": "", "Answer": "", "Categories": [], "Tags": []})

# Q11
query_array_types = ['linear', 'convex']

# Extract all distinct ultrasound systems from the Compatible_Systems column
distinct_systems = set()
probes_df['Compatible_Systems'].dropna().str.split(';').apply(
    lambda x: distinct_systems.update([system.strip() for sublist in x for system in sublist.split(',')]))

# Loop through each distinct system and array type
for system in distinct_systems:
    for array_type in query_array_types:
        # Generate the question
        question11 = f"Do you have any {array_type} array probes for {system}?"

        # Check for matching conditions in the DataFrame
        matches = probes_df[(probes_df['Compatible_Systems'].str.contains(system, na=False)) &
                            (probes_df['Array_Type'].str.lower() == array_type) &
                            (probes_df['Stock'] > 0)]

        # Determine the answer based on whether there are matches
        if not matches.empty:
            # Collect matching manufacturer and probe model names
            available_probes = ", ".join([f"{row['Manufacturer']} {row['Probe_Model']}" for index, row in matches.iterrows()])
            answer11 = f"Yes, we currently have the following {array_type} probes available for sale: {available_probes}."
        else:
            answer11 = f"No, we currently do not have any {array_type} probes for {system} in stock."

        # Append the QA data to the list
        qa_data.append({"Question": question11, "Answer": answer11, "Categories": ["Compatibility", "Stock"], "Tags": [system]})

# Convert the list of dictionaries to a DataFrame
qa_dataset = pd.DataFrame(qa_data)

# Display the first few rows of the dataset to verify
qa_dataset

Unnamed: 0,Question,Answer,Categories,Tags
0,Who is the maker of C3?,The C3 probe is made by ATL.,[Manufacturer],"[ATL, C3]"
1,Is C3 made by ATL or Philips?,The C3 probe is made by ATL.,[Manufacturer],"[ATL, C3]"
2,What systems is ATL C3 compatible with?,The ATL C3 is compatible with HDI 5000 ultraso...,[Compatibility],"[ATL, C3, HDI 5000]"
3,Can the ATL C3 work with the HDI 5000?,"Yes, the ATL C3 is compatible with the HDI 500...",[Compatibility],"[ATL, C3, HDI 5000]"
4,Does the ATL C3 work with the Sequoia?,"No, the ATL C3 is not compatible with the Sequ...",[Compatibility],"[ATL, C3]"
...,...,...,...,...
460,Do you have any convex array probes for Voluso...,"No, we currently do not have any convex probes...","[Compatibility, Stock]",[Voluson 730]
461,Do you have any linear array probes for Voluso...,"No, we currently do not have any linear probes...","[Compatibility, Stock]",[Voluson E8]
462,Do you have any convex array probes for Voluso...,"No, we currently do not have any convex probes...","[Compatibility, Stock]",[Voluson E8]
463,Do you have any linear array probes for HD11 XE?,"No, we currently do not have any linear probes...","[Compatibility, Stock]",[HD11 XE]


In [None]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize an empty list to store the QA data
qa_data = []

# Extract distinct manufacturers from the DataFrame
all_manufacturers = probes_df['Manufacturer'].dropna().unique().tolist()

# Extract all distinct ultrasound systems from the Compatible_Systems column
compatibility_df = probes_df.copy()
compatibility_df['Compatible_Systems'] = compatibility_df['Compatible_Systems'].str.split(',')
compatibility_df = compatibility_df.explode('Compatible_Systems')
compatibility_df['Compatible_Systems'] = compatibility_df['Compatible_Systems'].str.strip()
all_systems = set(compatibility_df['Compatible_Systems'].dropna().unique())

# Create a dictionary to map Probe_Model to a list of manufacturers and connection types
model_details = {}
for index, row in probes_df.iterrows():
    model = row['Probe_Model']
    manufacturer = row['Manufacturer']
    connection_type = row['Connection_Type']
    if model not in model_details:
        model_details[model] = {}
    if manufacturer not in model_details[model]:
        model_details[model][manufacturer] = set()
    model_details[model][manufacturer].add(connection_type)

# Loop through each row in the DataFrame to generate QA pairs
for index, row in probes_df.iterrows():
    manufacturer = row['Manufacturer']
    probe_model = row['Probe_Model']
    connection_type = row['Connection_Type']
    connection_info = f" with a {connection_type} connector" if connection_type == "Cartridge" else "" # Include connection type in the question if it's not empty
    compatible_systems = row['Compatible_Systems'].split(';') if pd.notna(row['Compatible_Systems']) else []
    compatible_systems = [system.strip() for system in compatible_systems] # Clean and update compatible systems list
    array_type = row['Array_Type'] if 'Array_Type' in row else 'Unknown'
    frequency_range = row['Frequency_Range'] if 'Frequency_Range' in row else 'Unknown'
    application_types = row['Applications'].split(';') if 'Applications' in row else []
    stock = row['Stock'] if 'Stock' in row else 0

    # Q1 with random replacement of "manufacturer" with "maker"
    manufacturer_term = random.choice(["manufacturer", "maker"])
    suffix = random.choice(["", " transducer", " probe"])
    question1 = f"Who is the {manufacturer_term} of {probe_model}{connection_info}{suffix}?"
    if connection_type == "Cartridge":
        answer1 = f"The {manufacturer} {probe_model}{connection_info} is made by {manufacturer}."
    else:
        answer1 = f"The {manufacturer} {probe_model} is made by {manufacturer}."
    qa_data.append({"Question": question1, "Answer": answer1})

    # Q3
    manufacturers_for_model = list(model_details[probe_model].keys())
    if len(manufacturers_for_model) > 1:
        false_manufacturer = random.choice([fm for fm in manufacturers_for_model if fm != manufacturer])
        answer3 = f"The {probe_model}{connection_info} is made by both {manufacturer} and {false_manufacturer}."
    else:
        false_manufacturer = random.choice([fm for fm in all_manufacturers if fm != manufacturer])
        answer3 = f"The {probe_model}{connection_info} probe is made by {manufacturer}."
    question3 = f"Is {probe_model}{connection_info} made by {manufacturer} or {false_manufacturer}?"
    qa_data.append({"Question": question3, "Answer": answer3, "Categories": ["Manufacturer"], "Tags": [manufacturer, probe_model]})

    # Q4 about system compatibility
    compatible_systems_str = ', '.join([system for sublist in compatible_systems for system in sublist.split(',')])
    question4 = f"What systems is {manufacturer} {probe_model}{connection_info} compatible with?"
    answer4 = f"The {manufacturer} {probe_model}{connection_info} is compatible with {compatible_systems_str} ultrasound systems."
    qa_data.append({"Question": question4, "Answer": answer4, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model, ', '.join(compatible_systems)]})

    # Q5 about specific system compatibility (always "Yes")
    action_word = random.choice(["Can", "Does"])
    if compatible_systems:
        compatible_systems_flat = [system for sublist in compatible_systems for system in sublist.split(',')]
        system = random.choice(compatible_systems_flat)
        question5 = f"{action_word} the {manufacturer} {probe_model}{connection_info} work with the {system}?"
        additional_systems = [s for s in compatible_systems_flat if s != system]
        if additional_systems:
            additional_systems_str = ", ".join(additional_systems)
            answer5 = f"Yes, the {manufacturer} {probe_model}{connection_info} is compatible with the {system} ultrasound system, as well as {additional_systems_str}."
        else:
            answer5 = f"Yes, the {manufacturer} {probe_model}{connection_info} is compatible with the {system} ultrasound system."
        qa_data.append({"Question": question5, "Answer": answer5, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model, system]})

    # Q6 about specific system compatibility (always "No")
    non_compatible_systems = list(all_systems - set([system for sublist in compatible_systems for system in sublist.split(',')]))
    if non_compatible_systems:
        system = random.choice(list(non_compatible_systems))
        question6 = f"{random.choice(['Can', 'Does'])} the {manufacturer} {probe_model}{connection_info} work with the {system}?"
        compatible_systems_str = ', '.join([system for sublist in compatible_systems for system in sublist.split(',')])
        answer6 = f"No, the {manufacturer} {probe_model}{connection_info} is not compatible with the {system} ultrasound system."
        qa_data.append({"Question": question6, "Answer": answer6, "Categories": ["Compatibility"], "Tags": [manufacturer, probe_model]})

    # Q7 about probe type and characteristics
    question7 = f"What type of probe is the {manufacturer} {probe_model}{connection_info}?"
    answer7 = f"The {manufacturer} {probe_model} is a {array_type} probe with a frequency range of {frequency_range}."
    qa_data.append({"Question": question7, "Answer": answer7, "Categories": ["Array Type", "Frequency Range", "Probe Type"], "Tags": [manufacturer, probe_model, array_type]})

    # Q8 about application type
    formatted_application_types = ", ".join(application_types[:-1]) + ", and " + application_types[-1] if len(application_types) > 1 else ", ".join(application_types)
    question8 = f"What can the {manufacturer} {probe_model}{connection_info} be used for?"
    answer8 = f"The {manufacturer} {probe_model}{connection_info} can be used for {formatted_application_types} applications."
    qa_data.append({"Question": question8, "Answer": answer8, "Categories": ["Applications"], "Tags": [manufacturer, probe_model, *application_types]})

    # Q9 about availability for sale or in stock
    sale_or_stock = random.choice(["for sale", "in stock"])
    question9 = f"Do you have any {manufacturer} {probe_model}{connection_info} {sale_or_stock}?"
    if stock > 0:
        answer9 = f"Yes, we currently have {manufacturer} {probe_model}{connection_info} available for sale."
    else:
        answer9 = f"We currently do not have any {manufacturer} {probe_model}{connection_info} in stock."
    qa_data.append({"Question": question9, "Answer": answer9, "Categories": ["Stock"], "Tags": [manufacturer, probe_model]})

    # Q10 
    if compatible_systems:
        # Split each string in compatible_systems by comma and flatten the list
        all_compatible_systems = [system.strip() for sublist in compatible_systems for system in sublist.split(',')]
        
        if all_compatible_systems:
            selected_system = random.choice(all_compatible_systems)
            question10 = f"Do you have any {probe_model}{connection_info} for {selected_system} for sale?"
            
            # Determine the answer based on the stock
            if stock > 0:
                answer10 = f"Yes, we currently have the {manufacturer} {probe_model}{connection_info} for the {selected_system} available for sale."
            else:
                answer10 = f"Unfortunately, we do not have any {manufacturer} {probe_model}{connection_info} for the {selected_system} available for sale."
            qa_data.append({"Question": question10, "Answer": answer10, "Categories": ["Stock", "Compatibility"], "Tags": [manufacturer, probe_model, selected_system]})
        else:
            qa_data.append({"Question": "", "Answer": "", "Categories": [], "Tags": []})

# Q11
query_array_types = ['linear', 'convex']

# Extract all distinct ultrasound systems from the Compatible_Systems column
distinct_systems = set()
probes_df['Compatible_Systems'].dropna().str.split(';').apply(
    lambda x: distinct_systems.update([system.strip() for sublist in x for system in sublist.split(',')]))

# Loop through each distinct system and array type
for system in distinct_systems:
    for array_type in query_array_types:
        # Generate the question
        question11 = f"Do you have any {array_type} array probes for {system}?"

        # Check for matching conditions in the DataFrame
        matches = probes_df[(probes_df['Compatible_Systems'].str.contains(system, na=False)) &
                            (probes_df['Array_Type'].str.lower() == array_type) &
                            (probes_df['Stock'] > 0)]

        # Determine the answer based on whether there are matches
        if not matches.empty:
            # Collect matching manufacturer and probe model names
            available_probes = ", ".join([f"{row['Manufacturer']} {row['Probe_Model']}" for index, row in matches.iterrows()])
            answer11 = f"Yes, we currently have the following {array_type} probes available for sale: {available_probes}."
        else:
            answer11 = f"No, we currently do not have any {array_type} probes for {system} in stock."

        # Append the QA data to the list
        qa_data.append({"Question": question11, "Answer": answer11, "Categories": ["Compatibility", "Stock"], "Tags": [system]})

# Convert the list of dictionaries to a DataFrame
qa_dataset = pd.DataFrame(qa_data)

# Convert the list of dictionaries to a DataFrame
qa_dataset = pd.DataFrame(qa_data)
