# Import packages and load credentials

In [2]:
import os
import asyncio
import base64
import time
import pandas as pd
import vertexai.preview.generative_models as generative_models

from IPython.display import display

from typing import List, Dict

from google.cloud import aiplatform
from google.oauth2 import service_account

from vertexai.generative_models import (GenerativeModel,
                                        GenerationResponse,
                                        Part)

In [3]:
# Initialize VertexAI
# Build credentials from a service-account key file located in the working
# directory and hand them, together with the project id, to aiplatform.init().
# NOTE(review): 'project-id' looks like a placeholder value — confirm the real
# project id is set before running.
credentials = service_account.Credentials.from_service_account_file("service-account.json")

aiplatform.init(project = 'project-id',
                credentials = credentials)

In [4]:
# Get the test documents that are classified as 'No need for additional analysis'
no_pva_path = "C:/Users/llm_agent_pva/Docs/NO_PVA"

# Get the test documents that are classified as 'Yes need for additional analysis'
yes_pva_path = "C:/Users/llm_agent_pva/Docs/YES_PVA"

# File names (not full paths) found in each folder.
# NOTE(review): os.listdir() returns entries in arbitrary order, so any later
# code that indexes into these lists by position depends on that order staying
# the same between runs — consider sorting.
no_pva_docs = os.listdir(no_pva_path)
yes_pva_docs = os.listdir(yes_pva_path)

# Prompts

In [5]:
# Prompts for the summary of the document for PVA_needed 
parties = """Extract the parties involved in this contract. Return the response in the form of a dictionary with the given format.
Format: {"parties": <parties>}"""

effective_date = """Extract the effective date from this contract and pressent it in a dictionary format as shown below.
Date should be formatted in Day-Month-Year.

Format: {"effective_date": <Response>}"""

territory = """Extract the territory details from this contract and present the answer in the below format.
Format: {"territory": <territory>}"""

recitals = """Extract the recitals from this contract. Return the response in the form of a dictionary with the given format.
Format: {"recitals": <recitals>}"""

local_global = """You are a legal expert. Read the contract given below and check if the distributor has rights to distribute in one territory or multiple territories.
Rules:
    - Determine the number of countries the distributor is responsible for.
    - If the partner is resopnsible for distributing in only one country, consider it as a 'Local' or else, consider it as 'Global'.
    - Apart from determining if is 'Local' or 'Global', justify your answer in a single paragraph.
    
Format: {"local_global": <answer>, "local_global_reason": <justification>}"""

In [6]:
# Prompts for the PVA needed or not analysis
# Each prompt labels one aspect of the contract and asks for a one-key
# dictionary answer.
# NOTE(review): several prompts contain the literal placeholder
# "{main_company}". retry_llm_outs() sends these strings as-is, and no visible
# code substitutes the placeholder, so the model receives the raw braces.
# str.format() cannot be applied directly because the "Format:" lines also
# contain braces — confirm how the company name is meant to be injected.

# Commercial vs. Service contract -> key "contract_type"
is_commercial = """You are a legal expert. Read the contract given below and check if the contract is related 'Commercial' or 'Service'. Return the response in the form of a dictionary with the given format.
Rules:
    - Label the contract as 'Commercial' if the contract is related to product supply, sales and distribution, product pricing, payment purchase of products, quantity of products or delivery of products.
    - Label the contract as 'Service' otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"contract_type": <response>}"""

# Direction of the distribution rights -> key "distribution_type"
distribution_in_out = """You are a legal expert. Read the contract given below and determine if {main_company} is giving out the distribution rights or it is taking the distribution rights. Return the response in the form of a dictionary with the given format.
Rules:
    - If {main_company} is giving out the distribution rights, return the answer as "Distribution-Out".
    - Return the answer as "Distribution-In" otherwise.
    - Use strictly only these rules to determine your response.
    
Format: {"distribution_type": <distribution type>}"""

# Marketing Authorization holder -> key "mah_owner"
# NOTE(review): the second "Partner" rule names "Janssen" directly while every
# other rule uses the {main_company} placeholder — presumably an oversight.
mah_owner = """You are a legal expert. Read the contract given below and check if there is a transfer of Marketing Authorization (MAH) Rights to other parties. Return the response in the form of a dictionary with the given format.
Rules:
    - Determine the partner of {main_company}.
    - Mark the contract as "Partner" if the partner is responsible for registering the product with regulatory body.
    - Mark the contract as "Partner" if Janssen is transferring the marketing authorization holder rights to the partner in the distribution territory.
    - Mark the contract as "{main_company}" otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"mah_owner": <mah owner>}"""

# Who is in contact with healthcare professionals -> key "hcp_poc"
# NOTE(review): the rule answers "{main_company}" when the *partner* holds the
# discussions, and "Partner" otherwise. That looks inverted relative to the
# other prompts and to the pva_needed rule "Yes if hcp_poc is the partner" —
# confirm the intended labels.
hcp_poc = """You are a legal expert. Read the contract given below and determine if {main_company} is in direct contact with healthcare professionals (HCP).
Return the response in the form of a dictionary with the given format.
Return your answer in the form of "{main_company}" or "Partner".

Rules:
    - Determine the partner of {main_company}.
    - Mark the contract as "{main_company}" if the partner is responsible for holding discussions with pharmacies, hospitals, or doctors, if is directly communicating with patients, or if is responsible for collecting and reporting adverse events information.
    - Mark the contract as "Partner" otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"hcp_poc": <hcp poc>}"""

# Packaging / labeling / trademark responsibility -> key "packaging_owner"
partner_packaging = """You are a legal expert. Read the contract given below and check if the contract is related to {main_company} partner's responsibility towards packaging, ownership of packaging, labeling, and trademarks. Return the response in the form of a dictionary with the given format.
Rules:
    - Determine the partner of {main_company}.
    - Mark the contract as "Partner" if the partner is responsible for packaging.
    - Mark the contract as "Partner" if the partner is responsible for owning the packaging.
    - Mark the contract as "Partner" if the partner is responsible for labeling, trademarks, and/or leaflets.
    - Mark the contract as "Partner" if the partner is responsible for including their details in the packaging.
    - Mark the contract as "{main_company}" otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"packaging_owner": <owner>}"""

# Regulatory filing and communication responsibility -> key "regulatory_poc"
reg_poc = """You are a legal expert. Read the contract given below and determine if {main_company} is responsible for regulatory filing and communication. Return the response in the form of a dictionary with the given format.
Rules:
    - Determine the partner of {main_company}.
    - Mark the contract as "Partner" if the partner is responsible for the regulatory filing and regulatory communication.
    - Mark the contract as "{main_company}" otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"regulatory_poc": <regulatory poc>}"""

# Final Yes/No decision -> key "pva_needed"
# NOTE(review): this prompt is sent together with the contract only, in
# parallel with the other prompts, so the packaging_owner / regulatory_poc /
# hcp_poc values it refers to are not part of its input.
# NOTE(review): the name `pva_needed` is rebound further down by the coroutine
# `async def pva_needed(...)`; the prompt list must be built before that
# definition runs.
pva_needed = """You are a legal advisor. Use the content given below and identify if the partner is responsible for any of the obligations mentioned on the following rules. Return the response in the form of a dictionary with the given format.
Rules:
    - ONLY respond with "Yes" or "No".
    - Mark the contract as "Yes" if packaging_owner is the partner.
    - Mark the contract as "Yes" if regulatory_poc is the partner.
    - Mark the contract as "Yes" if hcp_poc is the partner.
    - Mark the contract as "No" otherwise.
    - Use strictly only these rules to determine the label of the contract.
    
Format: {"pva_needed": <"Yes" or "No">}"""

In [7]:
# Prompt groups, in the order they are sent to the model. The order also fixes
# the key order of the merged answer dictionary built by format_llm_outs().
summary_prompts = [parties, effective_date, territory, recitals, local_global]
# NOTE(review): `pva_needed` here must still be the prompt string. Once the
# coroutine `pva_needed` is defined later in the notebook, re-running this cell
# would put the function object into the list instead.
pva_prompts = [is_commercial, distribution_in_out, mah_owner, hcp_poc, partner_packaging, reg_poc, pva_needed]

# Auxiliary functions

In [8]:
def extract_function_calls(response:GenerationResponse) -> List[Dict]:
    """Collect the function calls of the first candidate of *response*.

    Returns a list with one ``{function_name: {arg_name: arg_value}}``
    dictionary per function call, or an empty list when the first candidate
    carries no function calls.
    """
    calls = response.candidates[0].function_calls

    # Guard clause: nothing requested by the model.
    if not calls:
        return []

    return [{call.name: dict(call.args.items())} for call in calls]

In [9]:
async def async_call_gemini(prompt, temp=0, max_tkn=8192, p=0.95):
    """Send *prompt* to Gemini 1.5 Flash asynchronously.

    Args:
        prompt: Content passed straight to ``generate_content_async``.
        temp: Sampling temperature.
        max_tkn: Maximum number of output tokens.
        p: Top-p value.

    Returns:
        A tuple ``(text, prompt_token_count, candidates_token_count)`` taken
        from the response and its usage metadata.
    """
    model = GenerativeModel("gemini-1.5-flash-001")

    # Every listed harm category is set to BLOCK_NONE.
    harm = generative_models.HarmCategory
    block_none = generative_models.HarmBlockThreshold.BLOCK_NONE
    safety_settings = {
        category: block_none
        for category in (
            harm.HARM_CATEGORY_UNSPECIFIED,
            harm.HARM_CATEGORY_DANGEROUS_CONTENT,
            harm.HARM_CATEGORY_HATE_SPEECH,
            harm.HARM_CATEGORY_HARASSMENT,
            harm.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        )
    }

    generation_config = {"temperature": temp,
                         "max_output_tokens": max_tkn,
                         "top_p": p}

    response = await model.generate_content_async(prompt,
                                                  generation_config = generation_config,
                                                  safety_settings = safety_settings)

    usage = response.usage_metadata
    prompt_tokens = usage.prompt_token_count
    answer_tokens = usage.candidates_token_count
    return response.text, prompt_tokens, answer_tokens

In [10]:
def create_docs(paths):
    """Load every file in *paths* as a Vertex AI PDF ``Part``.

    Args:
        paths: Iterable of file paths. Each file is read in binary mode and
            wrapped with the ``application/pdf`` MIME type.

    Returns:
        A list of ``Part`` objects, one per path, in the same order.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    docs = []

    for path in paths:
        with open(path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        # The raw bytes are passed directly: encoding them to base64 only to
        # decode them again yields the same bytes at twice the memory cost.
        docs.append(Part.from_data(mime_type = "application/pdf",
                                   data = pdf_bytes))

    return docs

In [11]:
def get_gemini_cost(in_tkns, out_tkns, model="gemini-1.5-flash-001"):
    """Estimate the token total and cost of a set of Gemini calls.

    Prices are quoted per 1k characters, treated here as 250 tokens, as
    described in https://cloud.google.com/vertex-ai/generative-ai/pricing

    Args:
        in_tkns: Number of prompt (input) tokens.
        out_tkns: Number of generated (output) tokens.
        model: Model name. ``"gemini-1.5-flash-001"`` uses the Flash prices;
            any other value is priced as Gemini 1.5 Pro.

    Returns:
        A tuple ``(total_tokens, total_cost)``.
    """
    # Pricing is plain arithmetic, so no model client is instantiated here.
    if model == "gemini-1.5-flash-001":
        cost_in, cost_out = 0.000125, 0.000375
    # Assuming it's Gemini 1.5 Pro
    else:
        cost_in, cost_out = 0.00125, 0.00375

    out_cost = (cost_out * out_tkns) / 250
    in_cost = (cost_in * in_tkns) / 250

    return in_tkns + out_tkns, in_cost + out_cost

In [12]:
def format_json(ans):
    """Parse one LLM answer into a flat dictionary of strings.

    The answer is expected to follow the ``Format: {"key": <value>}`` line of
    the prompts, optionally wrapped in a Markdown code fence. Valid JSON is
    parsed with the ``json`` module; anything else (for example an unquoted
    value such as ``{"pva_needed": Yes}``) goes through a lenient split-based
    parser.

    Args:
        ans: Raw text returned by the model.

    Returns:
        ``{key: value}`` with string values. Non-string JSON values (lists,
        numbers, nested objects) are serialised back to JSON text so that the
        result stays a flat dictionary of strings.

    Raises:
        ValueError: If no ``key: value`` pair can be found in *ans*.
    """
    import json  # function-scope import keeps this cell self-contained

    text = ans.strip()

    # Keep only the outermost {...} span. This drops code fences
    # (```json ... ```) and any text the model added around the object.
    first, last = text.find("{"), text.rfind("}")
    if first != -1 and last > first:
        text = text[first:last + 1]

    # Preferred path: the answer is valid JSON.
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None

    if isinstance(parsed, dict):
        return {
            str(key): value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
            for key, value in parsed.items()
        }

    # Lenient path: strip the outer braces first, so the key comparison below
    # works the same for fenced and unfenced answers.
    body = text
    if body.startswith("{"):
        body = body[1:]
    if body.endswith("}"):
        body = body[:-1]
    body = body.strip()

    if ": " not in body:
        raise ValueError(f"Cannot parse LLM answer as a key/value pair: {ans!r}")

    def unquote(fragment):
        # Drop the first and the last double quote of the fragment.
        return ''.join(fragment.replace('"', '', 1).rsplit('"', 1)).strip()

    key, value = (unquote(part) for part in body.split(": ", 1))

    # The local/global prompt is the only one with two keys in its format.
    if key == "local_global" and '", ' in value:
        label, reason_pair = value.split('", ', 1)
        if ": " in reason_pair:
            raw_reason_key, raw_reason = reason_pair.split(": ", 1)
            reason = raw_reason.strip()
            # The closing quote of the reason was already removed when the
            # whole value was unquoted; only the opening quote is left.
            if reason.startswith('"'):
                reason = reason[1:]
            return {key: label, unquote(raw_reason_key): reason}

    return {key: value}

In [14]:
def format_llm_outs(outs):
    """Merge the parsed answers of several LLM calls into one dictionary.

    Each element of *outs* is parsed with ``format_json``; the resulting
    key/value pairs are folded into a single dictionary in input order, so a
    key that appears in more than one answer keeps the last value seen.
    """
    merged = {}

    for raw_answer in outs:
        merged.update(format_json(raw_answer))

    return merged

In [15]:
async def retry_llm_outs(doc, max_tries=3):
    """Run every summary and PVA prompt against *doc*, retrying on failure.

    All prompts are sent concurrently with ``asyncio.gather``. If any call
    raises, the whole batch is retried after waiting ``60 * attempt`` seconds.

    Args:
        doc: List of document parts appended to each prompt.
        max_tries: Maximum number of batch attempts.

    Returns:
        A tuple ``(answers, in_tkns, out_tkns, elapsed)``: the answer texts in
        prompt order, the summed input and output token counts, and the
        duration in seconds of the successful batch.

    Raises:
        RuntimeError: If no attempt succeeds; chained to the last error.
    """
    last_error = None

    for attempt in range(1, max_tries + 1):
        try:
            start = time.time()
            outs = await asyncio.gather(*[async_call_gemini([prompt] + doc) for prompt in (summary_prompts + pva_prompts)])
            elapsed = time.time() - start
        except Exception as err:
            # `Exception` rather than a bare except, so cancellation and
            # KeyboardInterrupt are not swallowed by the retry loop.
            last_error = err

            # No point in waiting when there is no attempt left.
            if attempt == max_tries:
                break

            wait_seconds = 60 * attempt
            print(f"\t> Trying again. Waiting for {wait_seconds} seconds...")
            # Non-blocking sleep: time.sleep() would freeze the event loop.
            await asyncio.sleep(wait_seconds)
        else:
            answers = [out[0] for out in outs]
            in_tkns = sum(out[1] for out in outs)
            out_tkns = sum(out[2] for out in outs)
            return answers, in_tkns, out_tkns, elapsed

    # Fail explicitly instead of hitting an unbound `answers` at return time.
    raise RuntimeError(f"LLM calls failed after {max_tries} attempt(s)") from last_error

# Main tools

In [22]:
async def pva_needed(path:str):
    """Analyse one contract PDF and return the merged LLM answers.

    The document is loaded, every prompt is run against it, and the parsed
    answers are combined into a single dictionary. Three bookkeeping keys are
    then added: ``total_tokens``, ``llm_cost`` and ``execution_time``.

    Input:
        path: The path for the PDF file that will be analyzed.
    """
    # Step 1: turn the PDF into the parts handed to the model.
    print("Parsing documents...")
    document_parts = create_docs([path])
    print("\t> Documents done!")

    # Step 2: run all prompts (with retries) against the document.
    print("Running prompts...")
    raw_answers, prompt_tokens, answer_tokens, elapsed = await retry_llm_outs(document_parts)
    print("\t> Prompts executed!")

    # Step 3: merge the individual answers into one dictionary.
    result = format_llm_outs(raw_answers)

    # Step 4: token usage and cost of all calls together.
    print("Getting cost of inputs...")
    token_total, cost_total = get_gemini_cost(prompt_tokens, answer_tokens)
    print("\t> Costs measured!")

    # Step 5: append the bookkeeping values after the LLM answers.
    result.update({"total_tokens": token_total,
                   "llm_cost": cost_total,
                   "execution_time": elapsed})

    return result

## Testing the tools

In [None]:
# Smoke test: run the full analysis on a single contract and show the
# resulting dictionary as the cell output (top-level await is notebook-only).
file_path = "Data/test_contract.pdf"
await pva_needed(file_path)

# Testing

## Classifying if PVA is needed or not

In [17]:
# CSV file that the evaluation loops below append their result rows to.
res_file_path = "gemini_pv_results.csv"

In [18]:
# Get the data from the NO_PVA files
for i in range(7, len(no_pva_docs)):
    print(f"Checking file {i+1} of {len(no_pva_docs)}:")

    try:
        llm_outs = await pva_needed(no_pva_path + '/' + no_pva_docs[i])

        llm_outs["file_name"] = no_pva_docs[i].split(".")[0]
        llm_outs["pva_needed_ground_truth"] = "No"

        # Create the dataframe and store on the csv file
        df = pd.DataFrame.from_dict([llm_outs])
        df.to_csv(res_file_path,
                  mode = 'a',
                  index = False,
                  header = False)
    except:
        print(f"Couldn't process file {no_pva_docs[i]}")
    
print("\nDocuments completed!")

Checking file 8 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 9 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 10 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 11 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 12 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 13 of 17:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 14 of 17:
Parsing documents...
	> Documents done!
Running 

In [None]:
# Load the accumulated results and preview the first rows.
# NOTE(review): the loops append rows with header=False, while read_csv by
# default treats the first line as the header — confirm the file already has a
# header row, otherwise pass header=None / names=[...].
df_fullPDF = pd.read_csv(res_file_path)
df_fullPDF.head()

In [20]:
# Get the data from the YES_PVA files
for i in range(len(yes_pva_docs)):
    print(f"Checking file {i+1} of {len(yes_pva_docs)}:")

    try:
        llm_outs = await pva_needed(yes_pva_path + '/' + yes_pva_docs[i])

        llm_outs["file_name"] = yes_pva_docs[i].split(".")[0]
        llm_outs["pva_needed_ground_truth"] = "Yes"

        # Create the dataframe and store on the csv file
        df = pd.DataFrame.from_dict([llm_outs])
        df.to_csv(res_file_path,
                  mode = 'a',
                  index = False,
                  header = False)
    except:
        print(f"Couldn't process file {yes_pva_docs[i]}")

print("\nDocuments completed!")

Checking file 1 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 2 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 3 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 4 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 5 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 6 of 36:
Parsing documents...
	> Documents done!
Running prompts...
	> Prompts executed!
Getting cost of inputs...
	> Input costs measured!
Checking file 7 of 36:
Parsing documents...
	> Documents done!
Running promp

In [None]:
# Reload the results file after the YES_PVA run and preview the first rows.
# NOTE(review): the loops append rows with header=False, while read_csv by
# default treats the first line as the header — confirm the file already has a
# header row, otherwise pass header=None / names=[...].
df_fullPDF = pd.read_csv(res_file_path)
df_fullPDF.head()