# Ragas evaluation
Test batch and ragas capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [9]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

from umap import UMAP
import numpy as np

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta, archive_db, get_docs_questions_df

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

True

## Utility functions

In [10]:
def write_dict_to_file(data_dict, filename):
    """write a dictionary as a json line to a file - allowing for appending"""
    with open(filename, "a") as f:
        f.write(json.dumps(data_dict) + "\n")

def read_dicts_from_file(filename):
    """Read a json line file as a generator of dictionaries - allowing to load multiple dictionaries as list."""
    with open(filename, "r") as f:
        for line in f:
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Read a file with cached list of dicts data write it to a dataframe."""
    if Path(file_name).exists():
        cached_answer_correctness = (
            pd.DataFrame(list(read_dicts_from_file(file_name)))
            .drop_duplicates(
                subset=[merge_on],
            )[[column, merge_on]]
            .dropna()
            .reset_index(drop=True)
        )
        return df.merge(
            cached_answer_correctness,
            on=merge_on,
            how="left",
        ).reset_index(drop=True)
    else:
        # Create a copy of the DataFrame
        df_out = df.copy()

        # Add the new column with the name of the variable 'column'
        df_out[column] = None

        # Reorder the columns to place the new column at the end
        columns = list(df_out.columns)
        columns.remove(column)
        columns.append(column)
        df_out = df_out[columns]
        
        return df_out

In [11]:
def rag_responses(index_type, index_name, query_model, llm, QA_model_params, df_questions_answers_in, df_docs):
    df_questions_answers_out=df_questions_answers_in.copy()
    
    # Generate responses using RAG with input parameters
    for i, row in df_questions_answers_out.iterrows():
        if row['answer'] is None or pd.isnull(row['answer']) or row['answer']=='':
            print(f"Processing question {i+1}/{len(df_questions_answers_out)}")

            # Use the QA model to query the documents
            qa_obj=queries.QA_Model(index_type,
                            index_name,
                            query_model,
                            llm,
                            **QA_model_params)
            qa_obj.query_docs(row['question'])
            response=qa_obj.result

            df_questions_answers_out.loc[df_questions_answers_out.index[i], "answer"] = response['answer'].content

            ids=[_stable_hash_meta(source_document.metadata)
                for source_document in response['references']]
            df_questions_answers_out.loc[df_questions_answers_out.index[i], "source_documents"] = ', '.join(ids)

            # Save the response to cache file
            response_dict = {
                "question": row['question'],
                "answer": response['answer'].content,
                "source_documents": ids,
            }
            write_dict_to_file(response_dict, f'rag_response_cache_{index_name}.json')

    # Get the context documents content for each question
    source_documents_list = []
    for cell in df_questions_answers_out['source_documents']:
        cell_list = cell.strip('[]').split(', ')
        context=[]
        for cell in cell_list:
            context.append(df_docs[df_docs["id"] == cell]["document"].values[0])
        source_documents_list.append(context)
    df_questions_answers_out["contexts"]=source_documents_list

    # Addtionaly get embeddings for questions
    if not Path(f'question_embeddings_{index_name}.pickle').exists():
        question_embeddings = [
            query_model.embed_query(question)
            for question in df_questions_answers_out["question"]
        ]
        with open(f'question_embeddings_{index_name}.pickle', "wb") as f:
            pickle.dump(question_embeddings, f)

    question_embeddings = pickle.load(open(f'question_embeddings_{index_name}.pickle', "rb"))
    df_questions_answers_out["embedding"] = question_embeddings
    return df_questions_answers_out

In [12]:
def eval_rag(index_name, df_questions_answers_in):
    df_questions_answers_out=df_questions_answers_in.copy()

    # Add answer correctness column, fill in if it exists
    df_questions_answers_out = add_cached_column_from_file(
        df_questions_answers_out, f'ragas_result_cache_{index_name}.json', "question", "answer_correctness"
    )

    # Unclear why but sometimes ground_truth does not provide a response. Just filter those out.
    df_questions_answers_out = df_questions_answers_out[df_questions_answers_out['ground_truth'].apply(lambda x: isinstance(x, str))]
    df_questions_answers_out

    # Prepare the dataframe for evaluation
    df_qa_eval = df_questions_answers_out.copy()

    # Evaluate the answer correctness if not already done
    fields = ["question", "answer", "contexts", "ground_truth"]
    for i, row in df_qa_eval.iterrows():
        print(i, row["question"])
        if row["answer_correctness"] is None or pd.isnull(row["answer_correctness"]):
            evaluation_result = evaluate(
                Dataset.from_pandas(df_qa_eval.iloc[i : i + 1][fields]),
                [answer_correctness],
            )
            df_qa_eval.loc[i, "answer_correctness"] = evaluation_result[
                "answer_correctness"
            ]

            # optionally save the response to cache
            response_dict = {
                "question": row["question"],
                "answer_correctness": evaluation_result["answer_correctness"],
            }
            write_dict_to_file(response_dict, f'ragas_result_cache_{index_name}.json')

    # write the answer correctness to the original dataframe
    df_questions_answers_out["answer_correctness"] = df_qa_eval["answer_correctness"]

    return df_qa_eval, df_questions_answers_out

In [13]:
def data_viz_prep(index_name,df_qa_eval,df_docs):
    # This section adds a column to df_documents containing the ids of the questions that used the document as source. 

    # add the infos about questions using each document to the documents dataframe

    # Explode 'source_documents' so each document ID is in its own row alongside the question ID
    df_questions_exploded = df_qa_eval.explode("source_documents")

    # Group by exploded 'source_documents' (document IDs) and aggregate
    agg = (
        df_questions_exploded.groupby("source_documents")
        .agg(
            num_questions=("id", "count"),  # Count of questions referencing the document
            question_ids=(
                "id",
                lambda x: list(x),
            ),  # List of question IDs referencing the document
        )
        .reset_index()
        .rename(columns={"source_documents": "id"})
    )

    # Merge the aggregated information back into df_documents
    df_documents_agg = pd.merge(df_docs, agg, on="id", how="left")

    # Use apply to replace NaN values with empty lists for 'question_ids'
    df_documents_agg["question_ids"] = df_documents_agg["question_ids"].apply(
        lambda x: x if isinstance(x, list) else []
    )
    # Replace NaN values in 'num_questions' with 0
    df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

    # Concatenate the two dataframes
    df_visualize = pd.concat([df_qa_eval, df_documents_agg], axis=0)

    df_questions = df_visualize[~df_visualize["question"].isna()]
    umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
        df_questions["embedding"].values.tolist()
    )
    umap_questions = umap.transform(df_visualize["embedding"].values.tolist())


    df_without_questions = df_visualize[df_visualize["question"].isna()]
    umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
        df_without_questions["embedding"].values.tolist()
    )
    umap_docs = umap.transform(df_visualize["embedding"].values.tolist())
    df_visualize["umap_docs"] = umap_docs.tolist()

    umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
        df_visualize["embedding"].values.tolist()
    )
    umap_all = umap.transform(df_visualize["embedding"].values.tolist())
    df_visualize["umap"] = umap_all.tolist()


    # find the nearet question (by embedding) for each document
    question_embeddings = np.array(df_visualize[df_visualize["question"].notna()]["embedding"].tolist())

    df_visualize["nearest_question_dist"] = [  # brute force, could be optimized using ChromaDB
        np.min([np.linalg.norm(np.array(doc_emb) - question_embeddings, axis=1)])
        for doc_emb in df_visualize["embedding"].values
    ]

    # write the dataframe to parquet for later use
    df_visualize.to_parquet(f'df_{index_name}.parquet')

    return df_visualize

# ChromaDB

## Connect to database

In [4]:
persistent_client = chromadb.PersistentClient(path=os.path.join(os.getenv('LOCAL_DB_PATH'),'chromadb'))   
collections=persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-large-0merge-400-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries)]

In [5]:
dbs=[{'index_name':'text-embedding-3-large-2merge-0',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-large-0merge-400',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-large-0merge-400-parent-child',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-large-2merge-0-queries',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))}]

In [6]:
# Inspect the first db, save for synthetic test dataset
db=dbs[0]
docs_vectorstore = Chroma(client=persistent_client,
                        collection_name=db['index_name'],
                        embedding_function=db['query_model'])  
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
lcdocs_chroma = [Document(page_content=doc, metadata=metadata) 
          for doc, metadata in zip(all_docs['documents'], all_docs['metadatas'])]

print(len(lcdocs_chroma))

2222


In [7]:
# Export all collections to pickles to store them
for db in dbs:
    df_temp_chroma=archive_db('ChromaDB',db['index_name'],db['query_model'],export_pickle=True)

In [10]:
df_temp_chroma.head(5)

Unnamed: 0,id,source,page,metadata,document,embedding
0,001889abd34c56d712ba79f905d7b90159f5c354,"['AMS_2001_reocr.pdf', 'AMS_2001_reocr.pdf']","[37, 38]","{'page': '[37, 38]', 'source': '['AMS_2001_reo...",Using a Ball Aerospace-developed lubricant f...,"[0.013638148084282875, -0.023859944194555283, ..."
1,001bc8af131361ed46a8999ade3e7068522572a1,"['AMS_2016_reocr.pdf', 'AMS_2016_reocr.pdf']","[172, 173]","{'page': '[172, 173]', 'source': '['AMS_2016_r...",158 Design Ideology and Testing \n\nEarly Des...,"[0.0009454930550418794, 0.018359793350100517, ..."
2,0030b47c6180d167c2d013c8e7a8b8bb0c16de44,"['AMS_2001_reocr.pdf', 'AMS_2001_reocr.pdf']","[43, 44]","{'page': '[43, 44]', 'source': '['AMS_2001_reo...",The Vertrel XF treated bearing had a slight am...,"[-0.00619348743930459, -0.028075991198420525, ..."
3,00356e19ac534a10e86a774fa22603978622d654,"['AMS_2018_reocr.pdf', 'AMS_2018_reocr.pdf']","[523, 524]","{'page': '[523, 524]', 'source': '['AMS_2018_r...",•Terminate the test at the first onset of unex...,"[-0.036841981112957, -0.022101974114775658, -0..."
4,003edd3f9efa244ccfd87445ebd5edff5c3155b3,"['AMS_2020_reocr.pdf', 'AMS_2020_reocr.pdf']","[197, 198]","{'page': '[197, 198]', 'source': '['AMS_2020_r...","7 oe ~ Of Dual ~187 a highly loaded planet, a ...","[-0.002627367153763771, -0.006074088159948587,..."


## Generate synthetic dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

In [11]:
# Set generator inputs
generator_model="gpt-3.5-turbo-16k"
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])

critic_model='gpt-4o'
synthetic_critic_llm = ChatOpenAI(model=critic_model,tags=[critic_model])

embedding_model='text-embedding-3-large'
synthetic_embeddings = OpenAIEmbeddings(model=embedding_model,api_key=os.getenv('OPENAI_API_KEY'))

In [12]:
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings
)

sample_size=50
percent_total=sample_size/len(lcdocs_chroma)
print(percent_total)

# Get a random sample of lcdocs
lcdocs_random = random.sample(lcdocs_chroma, sample_size)

0.022502250225022502


In [13]:
# Check if testset.csv exists, use, or generate the synthetic dataset.
if not os.path.exists('./testset.csv'):
    run_config=RunConfig(timeout=1000,
                    max_retries=50,
                    max_wait=1000,
                    max_workers=1)

    n_questions=5
    testset = generator.generate_with_langchain_docs(lcdocs_random, 
                                                    test_size=n_questions,
                                                    with_debugging_logs=True,
                                                    is_async=False,
                                                    run_config=run_config,
                                                    raise_exceptions=False)
    df_testset=testset.to_pandas()
    df_testset.to_csv(f"testset_{db['index_name']}.csv", index=False)
else:
    # Import testset.csv into a DataFrame
    df_testset = pd.read_csv(f"testset_{db['index_name']}.csv")

embedding nodes:   0%|          | 0/164 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['REACT mechanism', 'Qualification testing', 'Deployable solar panel', 'Actuator', 'Thermal vacuum testing']
[ragas.testset.evolutions.INFO] seed question generated: How was the REACT mechanism tested for integration within an SSTL HDRM for a small satellite deployable solar panel?
[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the testing of the REACT mechanism for integration within an SSTL HDRM for a small satellite deployable solar panel. It specifies the mechanism (REACT), the context (SSTL HDRM), and the application (small satellite deployable solar panel), making the intent clear. However, it assumes familiarity with specific terms (REACT mechanism, SSTL HDRM) without providing definitions or context. To improve clarity and answerability, the question could inclu

In [14]:
df_testset

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What type of cryogenic sensors were investigat...,"[ besides the planetary gear unit, to transf...",small cryogenic sensors,simple,"[{'page': '[44, 45]', 'source': '['AMS_2002_re...",True
1,What is the purpose of the damping system in t...,[‘elease nd quantifment — threaded~ was 207 P...,The purpose of the damping system in the M10 s...,simple,"[{'page': '[217, 218]', 'source': '['AMS_2020_...",True
2,What material was used for the recommended sen...,"[ besides the planetary gear unit, to transf...",The sensor was contained in a plastic housing.,reasoning,"[{'page': '[44, 45]', 'source': '['AMS_2002_re...",True
3,How was the motor torque recorded and monitore...,"[~oL, a a ~~ ~~ )4/heptane) __ and instal...",The motor torque was recorded at the beginning...,multi_context,"[{'page': '[289, 290]', 'source': '['AMS_2020_...",True
4,"According to Pyroalliance's assessment, what w...",[‘elease nd quantifment — threaded~ was 207 P...,Pyroalliance's assessment identified three mai...,reasoning,"[{'page': '[217, 218]', 'source': '['AMS_2020_...",True


## RAG questions/answers (batch mode)

### Format dataset and database for RAG

In [15]:
df_questions = df_testset[['question', 'ground_truth']].copy()
df_questions['id'] = 'Question ' + df_questions.index.astype(str)
df_questions['question_by'] = generator_model
df_questions = df_questions[['id', 'question', 'ground_truth', 'question_by']]

In [16]:
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df_docs = pd.DataFrame(
    {
        "id": [_stable_hash_meta(metadata) for metadata in all_docs["metadatas"]],
        "source": [metadata.get("source") for metadata in all_docs["metadatas"]],
        "page": [metadata.get("page", -1) for metadata in all_docs["metadatas"]],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [17]:
# Load the cached RAG answers and source_documents ids from a file - or create an empty column
df_questions_answers = add_cached_column_from_file(
    df_questions, f"rag_response_cache_{db['index_name']}.txt", "question", "answer")

df_questions_answers = add_cached_column_from_file(
    df_questions_answers, f"rag_response_cache_{db['index_name']}.txt", "question", "source_documents")

### Use RAG to generate responses

In [27]:
index_type='ChromaDB'
index_name=db['index_name']
query_model=synthetic_embeddings
llm=synthetic_generator_llm

QA_model_params={'rag_type':'Standard',
                 'k':4,
                 'search_type':'similarity',
                 'local_db_path':os.getenv('LOCAL_DB_PATH')}


In [36]:
df_questions_answers_rag=rag_responses(index_type, index_name, query_model, llm, QA_model_params, df_questions_answers, df_docs)

Processing question 1/5
Processing question 2/5
Processing question 3/5
Processing question 4/5
Processing question 5/5


## Ragas eval, visualize

In [37]:
# Evaluate
df_qa_eval, df_questions_answers_rag = eval_rag(index_name, df_questions_answers_rag)

0 What type of cryogenic sensors were investigated for sensing the HWP position?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

1 What is the purpose of the damping system in the M10 separation nut design?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

2 What material was used for the recommended sensor housing by Infineon Technologies?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

3 How was the motor torque recorded and monitored during testing, and what was the significance of the measurements in relation to the bearing failures?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

4 According to Pyroalliance's assessment, what were the main contributors to the induced shock in the separation nut heritage design?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
df_questions_answers_rag

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents,contexts,embedding,answer_correctness
0,Question 0,What type of cryogenic sensors were investigat...,small cryogenic sensors,gpt-3.5-turbo-16k,The study investigated the use of magnetoresis...,"cbfbb0b161515dc8c73e456782988bfb9a65820b, fc15...","[At The University of Chicago, we are designi...","[-0.013885260326153179, -0.001953278271540212,...",0.197826
1,Question 1,What is the purpose of the damping system in t...,The purpose of the damping system in the M10 s...,gpt-3.5-turbo-16k,The damping system in the M10 separation nut d...,"197a4182c91b653cd2e11c7c874b3259dbf786f9, 9298...",[‘elease nd quantifment — threaded~ was 207 P...,"[-0.010085261674414852, -0.008000294874285226,...",0.48548
2,Question 2,What material was used for the recommended sen...,The sensor was contained in a plastic housing.,gpt-3.5-turbo-16k,"According to Infineon Technologies, the advant...","4b2f9047be728770f2feff30dfd09561264f6361, 29e1...",[TQ 471 cable length max. 2m ji =H 1s 134 __q ...,"[-0.016372018581599247, -0.0005170286489279663...",0.367569
3,Question 3,How was the motor torque recorded and monitore...,The motor torque was recorded at the beginning...,gpt-3.5-turbo-16k,The motor torque was recorded and monitored du...,"ec2cc226698a7aef0c03a24ca0e10c008aa2acc3, 54a9...",[taken at various points of the process yielde...,"[0.012493401472138323, -0.010065259356331084, ...",0.852295
4,Question 4,"According to Pyroalliance's assessment, what w...",Pyroalliance's assessment identified three mai...,gpt-3.5-turbo-16k,According to Pyroalliance's assessment of the ...,"197a4182c91b653cd2e11c7c874b3259dbf786f9, dee5...",[‘elease nd quantifment — threaded~ was 207 P...,"[0.011690869174125432, 0.0008642443597386778, ...",0.650764


In [25]:
# Link from documents to questions, that used the document as source. Add UMAP column for visualization purposes.
df_visualize=data_viz_prep(index_name,df_qa_eval,df_docs)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
# concat the df containing the questions and the df containing the documents
df = pd.read_parquet(f'df_{index_name}.parquet')

# show the dataframe with the question and answer in spotlight
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype={x: Embedding for x in df.keys() if "umap" in x},
)

##  UMAP visualization froms cluster of the questions, workaround: UMAP only on documents

# Pinecone

## Connect to database

In [14]:
pinecone_client = pinecone_client(api_key=os.getenv('PINECONE_API_KEY'))
indexes=pinecone_client.list_indexes()
indexes

{'indexes': [{'dimension': 1024,
              'host': 'voyage-large-2-instruct-2merge-0-dj30h8y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'voyage-large-2-instruct-2merge-0',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1024,
              'host': 'voyage-large-2-instruct-0merge-400-dj30h8y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'voyage-large-2-instruct-0merge-400',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [None]:
dbs=[{'index_name':'voyage-large-2-instruct-2merge-0',
     'query_model': VoyageAIEmbeddings(model='voyage-large-2-instruct', 
                                       voyage_api_key=os.getenv('VOYAGE_API_KEY'), truncation=False)},
     {'index_name':'voyage-large-2-instruct-0merge-400',
     'query_model': VoyageAIEmbeddings(model='voyage-large-2-instruct', 
                                       voyage_api_key=os.getenv('VOYAGE_API_KEY'), truncation=False)}]

In [None]:
# Inspect the first db, save for synthetic test dataset
db=dbs[0]
index = pinecone_client.Index(db['index_name'])
ids=[]
for id in index.list():
    ids.extend(id)

docs=[]
chunk_size=200  # Tune to whatever doesn't error out, 200 won't for serverless
for i in range(0, len(ids), chunk_size):
    print(f"Fetching {i} to {i+chunk_size}")
    vector=index.fetch(ids[i:i+chunk_size])['vectors']
    vector_data = []
    for key, value in vector.items():
        vector_data.append(value)
    docs.extend(vector_data)

lcdocs_pinecone = []
for data in docs:
    data=data['metadata']
    lcdocs_pinecone.append(Document(page_content=data['page_content'],
                           metadata={'page':data['page'],'source':data['source']}))
    
print(len(lcdocs_pinecone))

In [None]:
# Export all collections to pickles to store them
for db in dbs:
    df_temp_pinecone=archive_db('Pinecone',db['index_name'],db['query_model'],export_pickle=True)