In [4]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

In [5]:
# Load the API keys from local text files so they are not hard-coded in the notebook.
# `.strip()` removes surrounding whitespace such as a trailing newline.
# NOTE(review): "chohere_api_keys.txt" looks like a misspelling of "cohere" — confirm it
# matches the file name on disk before renaming anything.
with open("chohere_api_keys.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

In [6]:
# Choose the sentence-transformers model used to generate the embeddings.
# This single `model` instance is passed to the helper functions below.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
def load_and_embedd_dataset(
        dataset_name: str = 'ErikCikalleshi/new_york_times_news_2000_2007',
        split: str = 'train',
        model: "SentenceTransformer" = None,
        text_field: str = 'content',
        rec_num: int = 40
) -> tuple:
    """
    Load a dataset and embed the first `rec_num` values of its text field
    using a sentence-transformer model.
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand.
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to embed (taken from the start of the split)
    Returns:
        tuple: (dataset, embeddings) - the full loaded split and the embeddings
        of its first `rec_num` records. Note that the dataset is NOT truncated,
        so it can contain more rows than there are embeddings.
    """
    print("Loading and embedding the dataset")

    # Build the default model lazily. A model instance in the signature default
    # would be constructed once at definition time, even when the caller
    # supplies their own model.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Slice the rows first and only then pick the text column, so that just
    # `rec_num` records are read instead of the whole column.
    texts = dataset[:rec_num][text_field]
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

In [8]:
# Load the dataset and embed its first 100 records with the shared model
DATASET_NAME = 'ErikCikalleshi/new_york_times_news_2000_2007'

dataset, embeddings = load_and_embedd_dataset(dataset_name=DATASET_NAME, model=model, rec_num=100)

# Keep the embedding matrix shape around; shape[1] is used as the index dimension
shape = embeddings.shape

Loading and embedding the dataset


Downloading readme:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/240M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/239M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/160M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/497249 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/55250 [00:00<?, ? examples/s]

Done!


In [9]:
# Quick look at the dataset: convert it to a pandas DataFrame and show the first 5 rows
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,date,title,content
0,2005,"For Ali G, More Fame But Fewer Dupes","Sacha Baron Cohen, the British comedian behind..."
1,2002,Chris Matthews and Me,To the Editor:\nHats off to Russ Baker for art...
2,2000,A Late Goal by Elias Stuns Panthers,The head fake did it.\nPatrik Elias remembered...
3,2006,Malverne Joins Crackdown on Illegal Housing,The Village of Malverne has joined a crackdown...
4,2001,Democrat Vows to Alter Way 'Business Is Done i...,"James E. McGreevey, the Democratic mayor of Wo..."


In [10]:
# Report the shape of the embedding matrix computed above
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (100, 384)


In [11]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
):
    """
    Create a serverless pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index (must match the embedding size)
        metric: The metric to use for the index
        cloud: The cloud provider passed to the serverless spec
        region: The cloud region passed to the serverless spec
    Returns:
        Pinecone: A pinecone client object which can later be used for connecting
        to the index (e.g. `pc.Index(index_name)`) and upserting vectors
    Notes:
        The client is authenticated with the notebook-level PINECONE_API_KEY.
        If an index called `index_name` already exists it is reused as-is;
        `dimension`, `metric`, `cloud` and `region` are then ignored.
    """
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Only create the index when no index with this name exists yet
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region),
        )
    print("Done!")
    return pc

In [12]:
# Name of the Pinecone index that stores the article embeddings
INDEX_NAME = 'new-york-times-news-2000-2007'

# Create the vector database (if needed); the index dimension is the
# width of the embedding matrix
pc = create_pinecone_index(index_name=INDEX_NAME, dimension=shape[1])

Creating a Pinecone index...
Done!


In [13]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'content',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (e.g. the result of `pc.Index(name)`);
            any object exposing `upsert(vectors=...)` works
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]` must
            be sliceable and hold at least as many texts as there are embeddings.
            Row i of `embeddings` is paired with text i.
        text_field: The dataset field stored as metadata (also used as the metadata key)
        batch_size: The batch size to use for upserting
    Returns:
        The same pinecone index handle that was passed in
    Raises:
        ValueError: If `batch_size` is not positive, or the dataset holds fewer
            texts than there are embeddings
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_vectors = embeddings.shape[0]

    # Only the first `num_vectors` texts are needed; do not build metadata for
    # the rest of a (potentially much larger) dataset.
    texts = dataset[text_field][:num_vectors]
    if len(texts) < num_vectors:
        raise ValueError(
            f"dataset['{text_field}'] has {len(texts)} records but {num_vectors} embeddings were given"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # ids are the row positions as strings, vectors are plain Python float lists
    to_upsert = [
        (str(position), vector.tolist(), {text_field: text})
        for position, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    # Slicing clamps at the end of the list, so the last batch may be shorter
    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [14]:
# Connect to the index created above and push the embeddings (with their texts) into it
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index=index, embeddings=embeddings, dataset=dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:00<00:00,  1.67it/s]


In [15]:
# Display the index statistics to check how many vectors it now holds
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 100}},
 'total_vector_count': 100}

**First Query**

In [16]:
# Baseline: send the bare question to the LLM, without any retrieved context
query = "How many jobs were added on Long Island in 2001?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
response.text

"According to data from the New York State Department of Labor, Long Island added 11,500 jobs in 2001. This growth was driven by gains in the service sector, particularly in areas such as education, health services, and leisure and hospitality. The construction industry also added a significant number of jobs, as building activity on Long Island remained strong. However, there were job losses in some sectors, including manufacturing and financial activities, which experienced declines in employment during that year. Overall, the job growth on Long Island in 2001 contributed to the region's economic expansion and helped to offset the losses experienced in other parts of the country due to the dot-com bubble burst."

In [17]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer" = None,
        index=None,
        top_k: int = 3,
        text_field: str = 'content',
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The model used to embed the query. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is created on demand.
        index: The vectorstore object (must expose `query(...)`); required
        top_k: The number of nearest matches to retrieve (defaults to 3)
        text_field: The metadata key holding each match's text
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the prompt to send to the
        LLM, and the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires an index to query, but index=None was given")

    # Build the default model lazily. A model instance in the signature default
    # would be constructed once at definition time, even when the caller
    # supplies their own model.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Embed the query and convert it to plain Python floats for the index client
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is used,
    # so the stored vectors themselves are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata'][text_field] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [18]:
# Ask the same question again, this time with retrieved context in the prompt,
# hoping for a more accurate answer
query = "How many jobs were added on Long Island in 2001?"
augmented_prompt, source_knowledge = augment_prompt(query=query, index=index, model=model)
response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'Long Island added about 6,700 jobs.'

In [19]:
# Show the retrieved passages that were inserted into the augmented prompt
print(source_knowledge)

Long Island added about 6,700 jobs last year, less than a quarter of the growth registered the previous year, while unemployment rose to 4.4 percent in January, up from 3.1 percent in January 2001, according to an employment report released on Tuesday by the state Labor Department.
The results suggest that while Long Island is not in a recession, the pockets of growth that had masked previous economic weakness are no longer absorbing excess workers, said Gary Huth, the state labor market analyst in Hicksville.
Long Island added about 6,700 jobs last year, less than a quarter of the growth registered the previous year, while unemployment rose to 4.4 percent in January, up from 3.1 percent in January 2001, according to an employment report released on Tuesday by the state Labor Department.
The results suggest that while Long Island is not in a recession, the pockets of growth that had masked previous economic weakness are no longer absorbing excess workers, said Gary Huth, the state labo

 **Second Query:**

In [20]:
# Baseline: send the bare question to the LLM, without any retrieved context
query = "In 2003 Turkey were waiting for the Bush administration to answer their demand for an economic aid package to ensure their participation in a war with Iraq. What was the amount of the economic aid package?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
response.text

"The amount of the economic aid package that Turkey was seeking from the Bush administration in 2003 to ensure their participation in the Iraq War was $30 billion. Turkey requested this aid to help offset the potential economic impact of the war on its economy, as it anticipated significant costs associated with hosting U.S. troops, refugees, and potential disruption to its trade and tourism industries.\n\nThe United States recognized the importance of Turkey's support in the war effort and understood the potential economic challenges Turkey could face. However, the U.S. administration also wanted to ensure that any aid package provided was reasonable and justified. Negotiations took place between the two countries, and ultimately, a compromise was reached.\n\nIn March 2003, just before the Iraq War began, the U.S. offered Turkey a package of $6 billion in grants and $24 billion in loan guarantees, for a total of $30 billion in assistance. This package was designed to help Turkey with 

In [21]:
# Ask the same question again, this time with retrieved context in the prompt,
# hoping for a more accurate answer
query = "In 2003 Turkey were waiting for the Bush administration to answer their demand for an economic aid package to ensure their participation in a war with Iraq. What was the amount of the economic aid package?"
augmented_prompt, source_knowledge = augment_prompt(query=query, index=index, model=model)
response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'The amount of the economic aid package that Turkey was demanding from the Bush administration in 2003 to ensure their participation in a war with Iraq was up to $32 billion.'

In [22]:
# Show the retrieved passages that were inserted into the augmented prompt
print(source_knowledge)

Turkish and American officials continued their diplomatic brinkmanship today, as the Turks said they were waiting for the Bush administration to answer their demand for an economic aid package worth as much as $32 billion to ensure their participation in a war with Iraq.
The American ambassador to Turkey, Robert Pearson, was summoned to the Turkish Foreign Ministry after 10 p.m. on Monday and handed the proposal, which he forwarded to Washington, American officials said.
Turkish officials demand an economic aid package worth as much as $32 billion to ensure their participation in a war with Iraq.
Turkish and American officials continued their diplomatic brinkmanship today, as the Turks said they were waiting for the Bush administration to answer their demand for an economic aid package worth as much as $32 billion to ensure their participation in a war with Iraq.
The American ambassador to Turkey, Robert Pearson, was summoned to the Turkish Foreign Ministry after 10 p.m. on Monday and 

**Third Query**

In [23]:
# Baseline: send the bare question to the LLM, without any retrieved context
query = "How much was the prize for the winners of the first Westchester Prize for New Works? and it was shared equally among how much Westchester arts organizations?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
response.text

'The winners of the first Westchester Prize for New Works received a prize of $100,000, which was shared equally among four Westchester arts organizations.'

In [24]:
# Ask the same question again, this time with retrieved context in the prompt,
# hoping for a more accurate answer
query = "How much was the prize for the winners of the first Westchester Prize for New Works? and it was shared equally among how much Westchester arts organizations?"
augmented_prompt, source_knowledge = augment_prompt(query=query, index=index, model=model)
response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'The prize money for the first Westchester Prize for New Works was $50,000, shared equally among three Westchester arts organizations.'

In [25]:
# Show the retrieved passages that were inserted into the augmented prompt
print(source_knowledge)

WINNERS of the first Westchester Prize for New Works, established by the Westchester County Millennium Commission as its cultural legacy, have been named. The $50,000 prize will be shared equally among three Westchester arts organizations and their artistic partners: the Westchester Philharmonic and the clarinetist and composer Derek Bermel; Canticorum Virtuosi, parent organization of the New York Virtuoso Singers and the composer and cantor Gerald Cohen, and Project Ballet Theater and the dancer and choreographer Robert Hill.
The prize, financed by Philip Morris Companies of Rye Brook, requires that the new works -- in this case an orchestral composition, a choral piece and a ballet -- have their first performances in Westchester County.
WINNERS of the first Westchester Prize for New Works, established by the Westchester County Millennium Commission as its cultural legacy, have been named. The $50,000 prize will be shared equally among three Westchester arts organizations and their ar

**Successful answer of the standard QA model**

In [26]:
# Baseline: send the bare question to the LLM, without any retrieved context
query = "In 2002 the prime minister of China leaves from the Communist Party's Central Committee. What was his name?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
response.text

"Zhu Rongji was the premier of the State Council of the People's Republic of China from March 1998 until his retirement in March 2003. He was also a member of the Communist Party's Politburo Standing Committee, China's de facto top decision-making body, from 1997 to 2002."

In [27]:
# Ask the same question again, this time with retrieved context in the prompt,
# hoping for a more accurate answer
query = "In 2002 the prime minister of China leaves from the Communist Party's Central Committee. What was his name?"
augmented_prompt, source_knowledge = augment_prompt(query=query, index=index, model=model)
response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'Zhu Rongji'

In [28]:
# Show the retrieved passages that were inserted into the augmented prompt
print(source_knowledge)

The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China's economy.
The front-runner to replace Mr. Zhu as prime minister in March is Wen Jiabao, who in his post as deputy prime minister has gained a reputation as a capable manager with a more conciliatory style.
The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China'seconomy.
The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China's economy.
The front-runner to replace Mr. Zhu as prime minister in March is Wen Jiabao, who in his post as deputy prime minister has gained 

When asked a more detailed question about this topic, the standard QA model answers it wrong (only half correct).

In [29]:
# Baseline: send the bare question to the LLM, without any retrieved context
query = "In 2002 the prime minister of China leaves from the Communist Party's Central Committee. What was the name of the front-runner to replace him?"

co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(message=query, model='command-r-plus')
response.text

'Hu Jintao'

In [30]:
# Ask the same question again, this time with retrieved context in the prompt,
# hoping for a more accurate answer
query = "In 2002 the prime minister of China leaves from the Communist Party's Central Committee. What was the name of the front-runner to replace him?"
augmented_prompt, source_knowledge = augment_prompt(query=query, index=index, model=model)
response = co.chat(message=augmented_prompt, model='command-r-plus')
response.text

'Wen Jiabao was the front-runner to replace Zhu Rongji as prime minister of China in 2002.'

In [31]:
# Show the retrieved passages that were inserted into the augmented prompt
print(source_knowledge)

The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China's economy.
The front-runner to replace Mr. Zhu as prime minister in March is Wen Jiabao, who in his post as deputy prime minister has gained a reputation as a capable manager with a more conciliatory style.
The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China'seconomy.
The departure of Zhu Rongji, the hard-driving prime minister, from the Communist Party's Central Committee makes him a lame duck for the next four months as he tries to promote his ambitious plan to overhaul China's economy.
The front-runner to replace Mr. Zhu as prime minister in March is Wen Jiabao, who in his post as deputy prime minister has gained 