In [1]:
#this notebook compares the performance of embeddings extracted by Llama3 models on a sentence similarity task
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import config
import typing
import logging
import tqdm
import torch
import numpy as np
import requests


In [2]:
CFG = config.Config()

In [3]:
#first calculate DiffCSE embeddings:
model_name = "voidism/diffcse-roberta-base-sts"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at voidism/diffcse-roberta-base-sts were not used when initializing RobertaModel: ['aux_bert.embeddings.LayerNorm.bias', 'aux_bert.embeddings.LayerNorm.weight', 'aux_bert.embeddings.position_embeddings.weight', 'aux_bert.embeddings.position_ids', 'aux_bert.embeddings.token_type_embeddings.weight', 'aux_bert.embeddings.word_embeddings.weight', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.bias', 'aux_bert.encoder.layer.0.attention.output.LayerNorm.weight', 'aux_bert.encoder.layer.0.attention.output.dense.bias', 'aux_bert.encoder.layer.0.attention.output.dense.weight', 'aux_bert.encoder.layer.0.attention.self.key.bias', 'aux_bert.encoder.layer.0.attention.self.key.weight', 'aux_bert.encoder.layer.0.attention.self.query.bias', 'aux_bert.encoder.layer.0.attention.self.query.weight', 'aux_bert.encoder.layer.0.attention.self.value.bias', 'aux_bert.encoder.layer.0.attention.self.value.weight', 'aux_bert.encoder.layer.0.intermediate.dense.bias', 'aux

In [7]:
#first try procedure on extracted topic argument sets from simulation data COMPTEXT24:
LANGUAGE: str = 'English'
GROUPER: str = 'topic'
ARG_DIR: str = 'data/arguments'
MODEL: str = "llama3:70b-instruct-q6_K" # "mixtral:8x7b-instruct-v0.1-q6_K"

In [4]:
#function for encoding sentence embeddings based on the DiffCSE approach which is based on (and refers to) SimCSE/evaluation.py 
def DiffCSE_encode_argument(sentence):
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=512, padding='max_length')
    input_ids_tensor = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
    attention_mask = torch.ones_like(input_ids_tensor)  # Creating attention mask 
    with torch.no_grad():
            outputs = model(input_ids_tensor, output_hidden_states=True, return_dict=True, attention_mask=attention_mask)
            argument_embedding = outputs.last_hidden_state[:, 0].cpu()
            return argument_embedding.tolist()

In [8]:
# Load the arguments from the .parquet file
arguments_df = pd.read_parquet(f"{ARG_DIR}/arguments.by.{GROUPER}.{LANGUAGE}.parquet")

In [17]:
arguments_df.head()

Unnamed: 0,arguments,label
0,"[""Access to affordable healthcare is a fundam...",healthcare
1,"""The current healthcare system unfairly benef...",healthcare
2,"""High costs of healthcare create financial bu...",healthcare
3,"""Pre-existing conditions should not exclude i...",healthcare
4,"""Universal healthcare would ensure everyone h...",healthcare


In [22]:
#for DiffCSE: Create embeddings for each argument and store them in a new column
arguments_df.loc[:, 'DiffCSE_embedding'] = arguments_df["arguments"].apply(DiffCSE_encode_argument)

In [9]:
#for Llama3:
embed_Llama3: typing.Dict[str, np.ndarray] = {}


In [19]:
    name = "embed_llama3"
    template = 'You help me get embeddings for a sentence. I provide you a with a context and a sentence and you reply only with that exact sentence. Context = '
    for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
        context = row["label"]
        sentence = row["arguments"]
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + context + '; Sentence: ' + sentence}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_llama3':
            embed_Llama3[index] = embed


100%|██████████| 206/206 [00:54<00:00,  3.77it/s]


In [20]:
dataset_w_embeds = arguments_df.join(pd.Series(embed_Llama3, name="embed_llama3"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,embed_llama3
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[0.5143977403640747, -0.44048359990119934, -0...."
1,"""The current healthcare system unfairly benef...",healthcare,"[0.40946468710899353, -0.4694342613220215, -0...."
2,"""High costs of healthcare create financial bu...",healthcare,"[0.3307845890522003, -0.531212329864502, -0.79..."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[0.2711120843887329, -0.33552631735801697, -0...."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[0.40233278274536133, -0.5556334257125854, -0...."


In [37]:
#reshape embeddings to single dimension:
dataset_w_embeds['DiffCSE_embedding'] = dataset_w_embeds['DiffCSE_embedding'].apply(lambda x: x[0])

In [38]:
grouped_data = dataset_w_embeds.groupby('label')

In [46]:
results: typing.Dict[typing.Tuple[str, str], float] = {}

In [47]:
#for DiffCSE:
dist = torch.nn.PairwiseDistance()

for model_1, c_1 in grouped_data['DiffCSE_embedding']:
    for model_2, c_2 in grouped_data['DiffCSE_embedding']:

        if (
            (model_1, model_2) in results.keys() or 
            (model_2, model_1) in results.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.512138113613706
healthcare:ukraine:4.4383741450897745
ukraine:ukraine:4.4135212572020945


In [48]:
#for Llama3
dist = torch.nn.PairwiseDistance()
results2: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_llama3']:
    for model_2, c_2 in grouped_data['embed_llama3']:

        if (
            (model_1, model_2) in results2.keys() or 
            (model_2, model_1) in results2.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results2[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:3.8460111482448824
healthcare:ukraine:7.980850266694351
ukraine:ukraine:3.7148092856659054


clearly adding context to Llama3 embeddings differentiates the arguments considerably, but this could be due to the embedding extraction overly relying on the context... what if the arguments had a similar context? If we compare the diversity of all healthcare arguments to all ukraine arguments there is hardly any difference, while the DiffCSE embeddings found a larger diversity amond ukraine arguments than among healthcare arguments

In [16]:
#what if we change the context:

#for argument context:
embed_Llama3_arg_context: typing.Dict[str, np.ndarray] = {}

name = "embed_llama3"
template = 'You help me get embeddings for a sentence. I provide you a with a context within []-square brackets and a sentence within <>-chevrons and you reply only with that exact sentence marked with the chevrons. Context = ['
for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
    context = 'arguments about ' + row["label"]
    sentence = row["arguments"]
    try: 
        embed = np.array(requests.post(
            'https://inf.cl.uni-trier.de/embed/',
            json={'model': MODEL, 'prompt': template + context + ']; Sentence: <' + sentence + '>'}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        embed = None
    
    if name=='embed_llama3':
        embed_Llama3_arg_context[index] = embed
        
        

100%|██████████| 206/206 [00:45<00:00,  4.53it/s]


In [21]:
dataset_w_embeds = dataset_w_embeds.join(pd.Series(embed_Llama3_arg_context, name="embed_Llama3_arg_context"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,embed_llama3,embed_Llama3_arg_context
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[0.5143977403640747, -0.44048359990119934, -0....","[0.4892446994781494, -0.565439760684967, -0.88..."
1,"""The current healthcare system unfairly benef...",healthcare,"[0.40946468710899353, -0.4694342613220215, -0....","[0.5180896520614624, -0.5761647820472717, -0.9..."
2,"""High costs of healthcare create financial bu...",healthcare,"[0.3307845890522003, -0.531212329864502, -0.79...","[0.41686248779296875, -0.5384894013404846, -0...."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[0.2711120843887329, -0.33552631735801697, -0....","[0.5203405618667603, -0.5485878586769104, -0.7..."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[0.40233278274536133, -0.5556334257125854, -0....","[0.48462989926338196, -0.5576851963996887, -0...."


In [22]:
    #what if we don't vary the context:

    #for argument context:
    embed_Llama3_constant_context: typing.Dict[str, np.ndarray] = {}

    name = "embed_llama3"
    template = 'You help me get embeddings for a sentence. I provide you a with a context within []-square brackets and a sentence within <>-chevrons and you reply only with that exact sentence marked with the chevrons. Context = ['
    for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
        context = 'viewpoints extracted from tweets'
        sentence = row["arguments"]
        try: 
            embed = np.array(requests.post(
                'https://inf.cl.uni-trier.de/embed/',
                json={'model': MODEL, 'prompt': template + context + ']; Sentence: <' + sentence + '>'}
                ).json()["response"])
        except Exception as _e:
            logging.warning(_e)
            embed = None
        
        if name=='embed_llama3':
            embed_Llama3_constant_context[index] = embed

100%|██████████| 206/206 [00:49<00:00,  4.19it/s]


In [23]:
dataset_w_embeds = dataset_w_embeds.join(pd.Series(embed_Llama3_constant_context, name="embed_Llama3_constant_context"))
#dataset_w_embeds.to_parquet(f'{CFG.report_dir}/dataset.embeds.parquet')
dataset_w_embeds.head()

Unnamed: 0,arguments,label,embed_llama3,embed_Llama3_arg_context,embed_Llama3_constant_context
0,"[""Access to affordable healthcare is a fundam...",healthcare,"[0.5143977403640747, -0.44048359990119934, -0....","[0.4892446994781494, -0.565439760684967, -0.88...","[0.4609181880950928, -0.5663681626319885, -0.7..."
1,"""The current healthcare system unfairly benef...",healthcare,"[0.40946468710899353, -0.4694342613220215, -0....","[0.5180896520614624, -0.5761647820472717, -0.9...","[0.5193580985069275, -0.5377117991447449, -0.8..."
2,"""High costs of healthcare create financial bu...",healthcare,"[0.3307845890522003, -0.531212329864502, -0.79...","[0.41686248779296875, -0.5384894013404846, -0....","[0.46159040927886963, -0.5081135630607605, -0...."
3,"""Pre-existing conditions should not exclude i...",healthcare,"[0.2711120843887329, -0.33552631735801697, -0....","[0.5203405618667603, -0.5485878586769104, -0.7...","[0.47894448041915894, -0.4845818877220154, -0...."
4,"""Universal healthcare would ensure everyone h...",healthcare,"[0.40233278274536133, -0.5556334257125854, -0....","[0.48462989926338196, -0.5576851963996887, -0....","[0.4409814178943634, -0.5037068724632263, -0.8..."


In [24]:
grouped_data = dataset_w_embeds.groupby('label')

In [25]:
#for embed_Llama3_arg_context
dist = torch.nn.PairwiseDistance()
results3: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_Llama3_arg_context']:
    for model_2, c_2 in grouped_data['embed_Llama3_arg_context']:

        if (
            (model_1, model_2) in results3.keys() or 
            (model_2, model_1) in results3.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results3[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:2.2894640520120406
healthcare:ukraine:3.9211152842021
ukraine:ukraine:2.4995592026647424


In [26]:
#for embed_Llama3_constant_context
dist = torch.nn.PairwiseDistance()
results4: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in grouped_data['embed_Llama3_constant_context']:
    for model_2, c_2 in grouped_data['embed_Llama3_constant_context']:

        if (
            (model_1, model_2) in results4.keys() or 
            (model_2, model_1) in results4.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results4[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:2.475380458318616
healthcare:ukraine:3.087622781475221
ukraine:ukraine:2.7385416234923095


if we make the context wording more similar, the difference between topics is smaller
if we don't vary context at all, the difference between topics is very much smaller


In [None]:
#prudent way is to use Llama3 but with constant context
#what would results look like for retrieved source?





In [29]:
#consider testing the effect of adding irrelevant context to sentences on embedding distances:

#prompted chatGPT:can you give me 3 examples of sentences which have the exact same meaning regardless of whether they are used in the context of ukaïne or healthcare?
# Define the sentences
sentences = [
    "The situation remains critical and requires immediate attention.",
    "There is a need for more resources to effectively address the challenges.",
    "Cooperation between different sectors is essential for a successful outcome."
]

# Create a DataFrame
df = pd.DataFrame(sentences, columns=['Sentences'])

# Display the DataFrame
print(df)


                                           Sentences
0  The situation remains critical and requires im...
1  There is a need for more resources to effectiv...
2  Cooperation between different sectors is essen...


In [30]:
#prompted Llama3: can you give me 3 examples of sentences which have the exact same meaning regardless of whether they are used in the context of ukaïne or healthcare?

sentences = [
    "The situation is getting out of control.",
    "We need to take immediate action to address this issue.",
    "The lack of resources is hindering our progress."
]


#Create a dictionary to store the data

df2 = pd.DataFrame(sentences, columns=['Sentences'])
df= df.append(df2, ignore_index=True)


  df= df.append(df2, ignore_index=True)


In [32]:
#duplicate df
df = df.append(df, ignore_index=True)
#add irrelevant context
context =["ukraine", "ukraine", "ukraine", "ukraine", "ukraine", "ukraine", "healthcare", "healthcare", "healthcare", "healthcare", "healthcare", "healthcare"]

df["context"] = context

  df = df.append(df, ignore_index=True)


In [33]:
df

Unnamed: 0,Sentences,context
0,The situation remains critical and requires im...,ukraine
1,There is a need for more resources to effectiv...,ukraine
2,Cooperation between different sectors is essen...,ukraine
3,The situation is getting out of control.,ukraine
4,We need to take immediate action to address th...,ukraine
5,The lack of resources is hindering our progress.,ukraine
6,The situation remains critical and requires im...,healthcare
7,There is a need for more resources to effectiv...,healthcare
8,Cooperation between different sectors is essen...,healthcare
9,The situation is getting out of control.,healthcare


In [34]:
#embed the sentences:
embed_context_indep: typing.Dict[str, np.ndarray] = {}

name = "embed_llama3"
template = 'You help me get embeddings for a sentence. I provide you a with a context within []-square brackets and a sentence within <>-chevrons and you reply only with that exact sentence marked with the chevrons. Context = ['
for index, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    context = row["context"]
    sentence = row["Sentences"]
    try: 
        embed = np.array(requests.post(
            'https://inf.cl.uni-trier.de/embed/',
            json={'model': MODEL, 'prompt': template + context + ']; Sentence: <' + sentence + '>'}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        embed = None
    
    if name=='embed_llama3':
        embed_context_indep[index] = embed

100%|██████████| 12/12 [00:02<00:00,  5.14it/s]


In [36]:
#join data
df = df.join(pd.Series(embed_context_indep, name='embed_context_indep'))
#group data
dfgrouped=df.groupby('context')
df.head()

Unnamed: 0,Sentences,context,embed_context_indep
0,The situation remains critical and requires im...,ukraine,"[0.4959905743598938, -0.8566449284553528, -0.9..."
1,There is a need for more resources to effectiv...,ukraine,"[0.6272997856140137, -0.8298850655555725, -0.9..."
2,Cooperation between different sectors is essen...,ukraine,"[0.5689340829849243, -0.7365984916687012, -0.8..."
3,The situation is getting out of control.,ukraine,"[0.45812100172042847, -0.8505119681358337, -1...."
4,We need to take immediate action to address th...,ukraine,"[0.5990078449249268, -0.8402009010314941, -0.9..."


In [37]:
#calculate distances between sentences

dist = torch.nn.PairwiseDistance()
results5: typing.Dict[typing.Tuple[str, str], float] = {}
for model_1, c_1 in dfgrouped['embed_context_indep']:
    for model_2, c_2 in dfgrouped['embed_context_indep']:

        if (
            (model_1, model_2) in results5.keys() or 
            (model_2, model_1) in results5.keys()
        ):
            continue

        res = sum([
            sum(dist(
                torch.tensor(np.array(v_1)), 
                torch.tensor(np.array(c_2.tolist()))
                )) / len(c_2)
            for v_1 in c_1
        ]) / len(c_1)

        results5[(model_1, model_2)] = res

        print(f'{model_1}:{model_2}:{res.item()}')

healthcare:healthcare:2.040649500684983
healthcare:ukraine:6.05392772452196
ukraine:ukraine:1.9972082912005538


the irrelevant context has a very strong effect on the embeddings of the sentences, even though the meaning should be similar....

In [10]:
#some validity checks
#maybe look at the output that is embedded, did the output yield correctly or are there traces of the prompt left in the output?

output_llama3: typing.Dict[str, np.ndarray] = {}
name = "embed_llama3"
template = 'You help me get embeddings for a sentence. I provide you a with a context and a sentence and you reply only with that exact sentence. Context = '

for index, row in tqdm.tqdm(arguments_df.iterrows(), total=len(arguments_df)):
    context = 'arguments about ' + row["label"]
    sentence = row["arguments"]
    try: 
        output = np.array(requests.post(
            'https://inf.cl.uni-trier.de/',
            json={'model': MODEL, 'prompt': template + context + '; Sentence: ' + sentence}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        output = None
    
    if name=='embed_llama3':
        output_llama3[index] = output


100%|██████████| 206/206 [28:36<00:00,  8.33s/it]


In [11]:
output_llama3

{0: array("Here is the embedding for the sentence:\n\n['access', 'to', 'affordable', 'healthcare', 'is', 'a', 'fundamental', 'right']\n\nLet me know if you need anything else!",
       dtype='<U160'),
 1: array('"The current healthcare system unfairly benefits the wealthy"',
       dtype='<U61'),
 2: array('Here is the sentence with its corresponding embedding:\n\n"High costs of healthcare create financial burden for many families"\n\n[[-0.0329, 0.0452, -0.0314, ..., 0.0123, -0.0411, 0.0167]]',
       dtype='<U184'),
 3: array("Here is the embedding for the given sentence:\n\n`[-0.053, -0.107, 0.141, ..., 0.041, 0.117, -0.085]`\n\nPlease note that the actual embedding values will depend on the specific language model and its parameters used to generate them. The above response is a placeholder, and you may need to use a library like Hugging Face's Transformers or spaCy to get the actual embeddings.\n\nIf you want me to assist with generating embeddings using a specific library or model

...the prompt clearly failed...

In [14]:
#what if we add delimitters to the prompt?:


output_llama3: typing.Dict[str, np.ndarray] = {}
name = "embed_llama3"
template = 'You help me get embeddings for a sentence. I provide you a with a context within []-square brackets and a sentence within <>-chevrons and you reply only with that exact sentence marked with the chevrons. Context = ['

for index, row in tqdm.tqdm(arguments_df[5:25].iterrows(), total=len(arguments_df[5:25])):
    context = 'arguments about ' + row["label"]
    sentence = row["arguments"]
    try: 
        output = np.array(requests.post(
            'https://inf.cl.uni-trier.de/',
            json={'model': MODEL, 'prompt': template + context + ']; Sentence: <' + sentence + '>'}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        output = None
    
    if name=='embed_llama3':
        output_llama3[index] = output


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:34<00:00,  1.75s/it]


In [15]:
output_llama3

{5: array('<"Mental health should be treated equally to physical health in healthcare coverage">',
       dtype='<U85'),
 6: array('< "The pharmaceutical industry has too much influence over healthcare policy">',
       dtype='<U78'),
 7: array('< "Telemedicine is a crucial innovation for improving access to care">',
       dtype='<U70'),
 8: array('<"Preventative care measures can reduce overall healthcare costs">',
       dtype='<U66'),
 9: array('< "Chronic disease management is often neglected in current healthcare models" >',
       dtype='<U80'),
 10: array('< "Rural communities face unique challenges in accessing quality healthcare">',
       dtype='<U77'),
 11: array('< "Health disparities exist along racial and ethnic lines, requiring targeted solutions">',
       dtype='<U89'),
 12: array('<"The COVID-19 pandemic has highlighted the importance of a robust public health infrastructure">',
       dtype='<U97'),
 13: array('< "Greater investment in medical research is needed to 

much better!

In [38]:
output_llama3: typing.Dict[str, np.ndarray] = {}
name = "embed_llama3"
template = 'You help me get embeddings for a sentence. I provide you a with a context within []-square brackets and a sentence within <>-chevrons and you reply only with that exact sentence marked with the chevrons. Context = ['

for index, row in tqdm.tqdm(arguments_df[5:25].iterrows(), total=len(arguments_df[5:25])):
    context = 'viewpoints extracted from tweets'
    sentence = row["arguments"]
    try: 
        output = np.array(requests.post(
            'https://inf.cl.uni-trier.de/',
            json={'model': MODEL, 'prompt': template + context + ']; Sentence: <' + sentence + '>'}
            ).json()["response"])
    except Exception as _e:
        logging.warning(_e)
        output = None
    
    if name=='embed_llama3':
        output_llama3[index] = output

100%|██████████| 20/20 [00:52<00:00,  2.61s/it]


In [39]:
output_llama3

{5: array('< "Mental health should be treated equally to physical health in healthcare coverage">',
       dtype='<U86'),
 6: array('< "The pharmaceutical industry has too much influence over healthcare policy">',
       dtype='<U78'),
 7: array('< "Telemedicine is a crucial innovation for improving access to care">',
       dtype='<U70'),
 8: array('<"Preventative care measures can reduce overall healthcare costs>',
       dtype='<U65'),
 9: array('< "Chronic disease management is often neglected in current healthcare models">',
       dtype='<U79'),
 10: array('<"Rural communities face unique challenges in accessing quality healthcare">',
       dtype='<U76'),
 11: array('< "Health disparities exist along racial and ethnic lines, requiring targeted solutions">',
       dtype='<U89'),
 12: array('< "The COVID-19 pandemic has highlighted the importance of a robust public health infrastructure">',
       dtype='<U98'),
 13: array('< "Greater investment in medical research is needed to a