In [10]:
import pandas as pd 
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting, FinishReason
from google.cloud import aiplatform
import vertexai.preview.generative_models as generative_models
import asyncio
from tqdm import tqdm
import nest_asyncio
nest_asyncio.apply()


# async def generate(input_text, safety_settings, generation_config):
#     vertexai.init(project="mtrx-wg2-modeling-dev-9yj", location="us-east1")
#     model = GenerativeModel(
#         "gemini-1.5-flash-001",
#     )
#     responses = model.generate_content(
#       [input_text],
#       generation_config=generation_config,
#       safety_settings=safety_settings,
#       stream=True,
#     )
    
#     resText = ""
#     for response in responses:
#         resText+=response.text
        
#     return resText

async def generate_content(model: GenerativeModel, prompt: str) -> str:
    response = await model.generate_content_async(prompt)
    return response.text

async def process_responses(responses: list[asyncio.Task]) -> list[str]:
    results = []
    for response in responses:
        result = await response
        results.append(result)
    return results

async def generate_responses(prompts: list[str]) -> list[str]:
    project_id = "mtrx-wg2-modeling-dev-9yj"
    location = "us-east1"
    aiplatform.init(project=project_id, location=location)

    model = GenerativeModel("gemini-1.5-flash-001")
    tasks = [asyncio.create_task(generate_content(model, prompt)) for prompt in prompts]
    responses = await process_responses(tasks)
    return responses


#############################################
## GEMINI STUFF #############################
#############################################
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
    ),
]

drugs_to_contraindications = pd.read_excel("../contraindicationList.xlsx")

contraindicationsData = list(drugs_to_contraindications['contraindications'])
activeIngredientsData = list(drugs_to_contraindications['active ingredient'])
print(len(contraindicationsData), ' contraindications sections found')




50378  contraindications sections found


In [11]:
#############################################
## MAIN SECTION #############################
#############################################
therapyActiveIngredients = []
originalText = []
n_contraindications = len(contraindicationsData)

#def get_structured_list(text: str, index: int, safety_settings, generation_config) -> None:
    # try:
    #     response = generate(text, safety_settings, generation_config)
    # except:
    #     response = "LLM Error"
    # return response.text


def get_input_text(active_ingredient_data, contraindication_text):
    text = "Produce a list of diseases contraindicated for the active ingredient " + str(active_ingredient_data) + " in the following contraindications list:\n" + str(contraindication_text) + "Please format the list as [\'item1\', \'item2\', ... ,\'itemN\']. Do not include any other text in the response. If no diseases are contraindicated for, return an empty list as \'[]\'. If the drug is only used for diagnostic purposes, return \'diagnostic/contrast/radiolabel\'. Do not include hypersensitivity or allergy to the named drug as a contraindication. This code is being deployed in bulk so if the contraindications section is just \<template\> or similar, return an empty list. Be mindful of the distinction between contraindications in patient groups and "
    return text    


async def async_get_structured_contraindications_lists(prompts) -> list[str]:
    responses = await generate_responses(prompts)
    return responses




originalText = []
diseasesContraindicated = []
input_texts = []
tasks = [0] * n_contraindications
results = []
for index, item in tqdm(enumerate(contraindicationsData), total=n_contraindications):
    if index < 100:
        input_texts.append(get_input_text(activeIngredientsData[index], item))
        therapyActiveIngredients.append(activeIngredientsData[index])
        originalText.append(item)

diseasesContraindicated = asyncio.run(async_get_structured_contraindications_lists(input_texts))


data = pd.DataFrame({'active ingredient(s)':therapyActiveIngredients,'original text':originalText, 'diseases contraindicated for ': diseasesContraindicated})
data.to_excel("drugs_to_contraindications.xlsx")

100%|████████████████████████████████| 50378/50378 [00:00<00:00, 4050698.70it/s]


RuntimeError: asyncio.run() cannot be called from a running event loop

In [5]:
print(len(diseasesContraindicated))
print(len(therapyActiveIngredients))
print(len(originalText))

TypeError: object of type 'coroutine' has no len()