In [9]:
!pip install sentence_transformers

!pip install datasets
!pip install pinecone-client
!pip install cohere




In [10]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
import pandas as pd
import json
from datasets import Dataset
import ast
import time
from IPython.display import display
warnings.filterwarnings("ignore")

In [11]:
# Read the service credentials from local text files (one key per file) so
# that no secret is hard-coded in the notebook. Surrounding whitespace and the
# trailing newline are stripped.
with open("cohere_api_key.txt") as cohere_key_file:
    COHERE_API_KEY = cohere_key_file.read().strip()

with open("pinecone_api_key.txt") as pinecone_key_file:
    PINECONE_API_KEY = pinecone_key_file.read().strip()

## First Element - Evaluate the optimal source of knowledge and embedding method

In [None]:
# Module-level Cohere client, authenticated with the key loaded above;
# get_response_for_queries calls co.chat(...) on it.
co = cohere.Client(api_key=COHERE_API_KEY)

def format_embedding(embedding_str):
    """
    Normalize a stringified embedding vector into a JSON array string.

    Both the whitespace-separated form produced by printing a NumPy array
    (e.g. "[0.1 0.2\\n 0.3]") and the comma-separated list form
    (e.g. "[0.1, 0.2, 0.3]") are accepted.
    Args:
        embedding_str: The textual representation of the embedding
    Returns:
        str: A JSON array of floats, e.g. "[0.1, 0.2, 0.3]"
    Raises:
        ValueError: If a token cannot be parsed as a float (for example the
            "..." NumPy inserts when it truncates a long array)
    """
    # Brackets and commas are turned into whitespace (not removed) so that
    # every supported separator tokenizes the same way and adjacent numbers
    # can never be fused together.
    cleaned = embedding_str.replace('[', ' ').replace(']', ' ').replace(',', ' ')
    embedding = [float(token) for token in cleaned.split()]
    # Serialize as JSON so the value can later be parsed back into a list
    return json.dumps(embedding)



def load_preembedded_data(df):
    """
    Build a HuggingFace Dataset and an embedding matrix from a pre-embedded DataFrame.

    The input frame is NOT modified: parsing and renaming happen on a copy, so
    the cell calling this function can safely be re-run on the same frame.
    Args:
        df: A DataFrame with an 'embedding' column holding list-like strings
            (e.g. "[0.1, 0.2]") or already-parsed sequences, and a text
            column named 'full_text' (renamed to 'text' in the result)
    Returns:
        tuple: (dataset, embeddings) where dataset is the result of
            Dataset.from_pandas on the prepared frame and embeddings is the
            row-wise stack of the parsed embedding vectors
    Raises:
        ValueError: If the frame is empty or the vectors have different lengths
            (raised by np.stack), or if a string is not a valid literal
    """
    def _parse(raw):
        # ast.literal_eval safely evaluates list-like strings; values that
        # were already parsed (lists/arrays) are passed through unchanged.
        return np.asarray(ast.literal_eval(raw) if isinstance(raw, str) else raw)

    parsed_embeddings = df['embedding'].apply(_parse)

    # assign/rename both return new frames, leaving the caller's df untouched
    prepared = df.assign(embedding=parsed_embeddings).rename(columns={'full_text': 'text'})

    dataset = Dataset.from_pandas(prepared)
    embeddings = np.stack(parsed_embeddings.values)

    return dataset, embeddings

def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
        ready_timeout: float = 120.0,
):
    """
    Create a pinecone index if it does not exist
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider of the serverless index
        region: The cloud region of the serverless index
        ready_timeout: Maximum number of seconds to wait for a newly created
            index to report that it is ready
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Raises:
        TimeoutError: If a newly created index is not ready within ready_timeout seconds
    """
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
            # model works well with!
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )
        # Index creation is asynchronous: wait until the index reports ready,
        # otherwise an upsert issued right after this call can fail.
        deadline = time.monotonic() + ready_timeout
        while not pc.describe_index(index_name).status['ready']:
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Pinecone index '{index_name}' was not ready after {ready_timeout} seconds"
                )
            time.sleep(1)
    print("Done!")
    return pc

def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (anything exposing upsert(vectors=...))
        embeddings: The embeddings to upsert, one row per vector
        dataset: The dataset containing the metadata; dataset[text_field] must
            yield one text per embedding row
        text_field: The dataset column stored as metadata under the same key
        batch_size: The batch size to use for upserting
    Returns:
        An updated pinecone index
    Raises:
        ValueError: If the number of texts differs from the number of embeddings
    """
    print("Upserting the embeddings to the Pinecone index...")
    embeddings = np.asarray(embeddings)
    num_vectors = embeddings.shape[0]

    texts = list(dataset[text_field])
    # Fail loudly instead of silently dropping the unmatched rows.
    if len(texts) != num_vectors:
        raise ValueError(
            f"Got {num_vectors} embeddings but {len(texts)} '{text_field}' entries; they must match"
        )

    for start in tqdm(range(0, num_vectors, batch_size)):
        stop = min(start + batch_size, num_vectors)
        # (id, vector, metadata) tuples for this batch only; the id is the row
        # position as a string and the vector is sent as a plain list of floats
        batch = [
            (str(row), embeddings[row].tolist(), {text_field: texts[row]})
            for row in range(start, stop)
        ]
        index.upsert(vectors=batch)
    return index

def augment_prompt(
        query: str,
        model: "SentenceTransformer" = None,
        index=None,
        top_k: int = 5,
) -> tuple:
    """
    Augment the prompt with the top matching results from the knowledge base
    Args:
        query: The query to augment
        model: The embedding model used to encode the query. When omitted, an
            'all-MiniLM-L6-v2' SentenceTransformer is loaded on first use and
            reused by later calls
        index: The vectorstore object
        top_k: The number of nearest results to retrieve
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the full prompt and the
            retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a vectorstore index to query")

    if model is None:
        # Load the default model lazily (not at function definition time) and
        # cache it on the function so it is only loaded once.
        model = getattr(augment_prompt, "_default_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            augment_prompt._default_model = model

    # Pinecone expects plain Python floats, not NumPy scalars
    results = [float(val) for val in model.encode(query)]

    # get the top_k results from the knowledge base; only the metadata is
    # read below, so the stored vector values are not requested
    query_results = index.query(
        vector=results,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']
    text_matches = [match['metadata']['text'] for match in query_results]

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""
You are tasked with composing a WhatsApp message that you would send directly to the person, maintaining the tone, style, and level of empathy and directness used in the provided source material.

The response should simulate a real-time, casual WhatsApp message.
Ensure the tone is empathetic and conversational, while remaining concise and clear.
Use the human writing style from the source knowledge as a guide, but note that the source knowledge does not contain direct answers to the query

Important Guidelines:
The response must directly answer the query as if you are sending the message right now.
Maintain the casual tone, while ensuring the message is smooth and empathetic, like a typical WhatsApp conversation.
The source knowledge is provided solely to show the desired tone and writing style. It is not to be used as a source of answers or content for the response.
Stick strictly to the format of a direct message, avoiding extra advice or unwarranted sympathy.


Example Query and Response Format:

Query: “I agreed to be a bridesmaid, but now I can’t commit. How can I let the bride know without causing drama?”

Response: “Hey, I don’t know if you’ve noticed, but I’m kind of freaking out about this whole bridesmaid thing. I’m so sorry, but I don’t think I can do it anymore. I know how important your wedding is, and I don’t want to let you down, but I’m just not in the right headspace. I hope you understand, and that we can still be cool.”

Query: {query}
source_knowledge: {source_knowledge}"""
    return augmented_prompt, source_knowledge

def get_response_for_queries(
        df_queries,
        model,
        index,
        delay_seconds: float = 6,
        chat_model: str = 'command-r-plus',
        client=None,
):
    """
    Generate a chat response for every query using retrieval-augmented prompts
    Args:
        df_queries: A DataFrame with a 'query' column
        model: The embedding model passed on to augment_prompt
        index: The vectorstore object passed on to augment_prompt
        delay_seconds: Pause before each chat call, to respect the API rate limit
        chat_model: The name of the Cohere chat model to use
        client: The Cohere client to call; defaults to the module-level `co`
    Returns:
        pd.DataFrame: One row per query with the columns 'query' and 'response'
    """
    if client is None:
        client = co

    results = []
    for query in df_queries['query']:
        # The retrieved source knowledge is already embedded in the prompt,
        # so only the prompt itself is needed here.
        augmented_prompt, _ = augment_prompt(query, model, index)

        # Added a delay between API calls to respect the rate limit
        time.sleep(delay_seconds)

        response = client.chat(
            model=chat_model,
            message=augmented_prompt,
        )
        results.append({'query': query, 'response': response.text})

    # Explicit columns keep the schema stable even when there are no queries
    return pd.DataFrame(results, columns=['query', 'response'])



# Pre-embedded knowledge sources to compare. The file name states which
# embedding model ('BERT' or 'Deberta') produced the stored vectors.
data_set_lists = [
    'Friends_BERT.csv',
    'Friends_Deberta.csv',
    'Google_Emotions_BERT.csv',
    'Google_Emotions_Deberta.csv',
    'Reddit_Relationship_Advice_BERT.csv',
    'Reddit_Relationship_Advice_Deberta.csv',
]

# Loop-invariant configuration: one index per embedding model (the two models
# produce vectors of different dimensions) and the shared evaluation queries.
INDEX_NAME_BERT = "Insert your index name here, ensuring the appropriate dimensions"
INDEX_NAME_deberta = "Insert your index name here, ensuring the appropriate dimensions"
df_queries = pd.read_csv('5_querys.csv')

for data_set in data_set_lists:
    df = pd.read_csv(data_set)
    df['embedding'] = df['embedding'].apply(format_embedding)
    dataset, embeddings = load_preembedded_data(df)
    shape = embeddings.shape
    print(f"The embeddings shape: {embeddings.shape}")

    # The query must be encoded with the same model that embedded the dataset
    if 'BERT' in data_set:
        INDEX_NAME = INDEX_NAME_BERT
        EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
    elif 'Deberta' in data_set:
        INDEX_NAME = INDEX_NAME_deberta
        EMBEDDING_MODEL = 'microsoft/deberta-base'
    else:
        # Without this, the index/model of the previous iteration would be reused
        raise ValueError(f"Cannot infer the embedding model from the file name '{data_set}'")

    model = SentenceTransformer(EMBEDDING_MODEL)
    pc = create_pinecone_index(INDEX_NAME, shape[1])
    index = pc.Index(INDEX_NAME)

    # Several datasets share one index and are upserted under the ids "0".."N-1".
    # Vectors left over from a previous (larger) dataset would otherwise remain
    # in the index and leak into the retrieval results of this dataset.
    # WARNING: this removes every vector stored in the index's default namespace.
    if index.describe_index_stats().total_vector_count > 0:
        index.delete(delete_all=True)

    index_upserted = upsert_vectors(index, embeddings, dataset)
    df_results = get_response_for_queries(df_queries, model, index)
    df_results.to_csv(f'_results_{data_set}', index=False)


## Second Element - RAG with FRIENDS Source Knowledge


In [None]:
# Load the pre-embedded Friends knowledge source and normalize its embeddings
df = pd.read_csv('Friends_BERT.csv')
df['embedding'] = df['embedding'].apply(format_embedding)
dataset, embeddings = load_preembedded_data(df)
shape = embeddings.shape
print(f"The embeddings shape: {embeddings.shape}")

# The query encoder must be the same model that produced the stored embeddings
INDEX_NAME = "Insert your index name here, ensuring the appropriate dimensions"
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

pc = create_pinecone_index(INDEX_NAME, shape[1])
index = pc.Index(INDEX_NAME)

# If the index was already used (e.g. by an earlier evaluation run), vectors of
# other knowledge sources stored under ids beyond this dataset's range would
# remain and pollute the Friends-only retrieval, so start from an empty index.
# WARNING: this removes every vector stored in the index's default namespace.
if index.describe_index_stats().total_vector_count > 0:
    index.delete(delete_all=True)

index_upserted = upsert_vectors(index, embeddings, dataset)

# Generate a response for each of the evaluation prompts and persist the results
df_queries = pd.read_csv('200_prompts.csv')
df_results = get_response_for_queries(df_queries, model, index)
df_results.to_csv('df_results.csv', index=False)
