In [None]:
import pickle
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

import os
# Make the OpenAI API key available to the embedding client.
# setdefault() keeps a key that is already exported in the environment and
# only writes the empty placeholder when the variable is missing, so running
# this cell no longer wipes a valid key. Prefer exporting the key outside the
# notebook; never commit a real key in this literal.
os.environ.setdefault('OPENAI_API_KEY', '')

# Load the dictionary containing the parsed manuscript PDFs

In [None]:
# Deserialize the parsed-manuscript dictionary from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('data_input/scipdf.pkl', 'rb') as pkl_file:
    pdf_dict = pickle.load(pkl_file)

# Generating the manuscript FAISS DBs with different chunk sizes

In [None]:
# Build one FAISS vector store per manuscript for every chunk size and save
# each under data_faiss/faiss_chunked/faiss_db_<chunk_size>/<manuscript>.
# Each pdf_dict value is read as {"sections": [{"text": ..., "heading": ...}, ...]}.

### The embedding function to use (independent of chunk size and manuscript,
### so it is created once instead of once per manuscript)
embeddings = OpenAIEmbeddings()

for chunk_size in [250, 500, 750, 1000]:
    # Consecutive chunks overlap by a tenth of the chunk size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        keep_separator=False,
    )

    # Running chunk index; it continues across manuscripts and restarts for
    # every chunk size.
    count = 0
    for key, val in pdf_dict.items():
        docs = []
        metadata = []
        for section in val["sections"]:
            # Only parsing the main text of each manuscript (sections)
            splits = text_splitter.split_text(section["text"])
            docs.extend(splits)
            source = key + "|" + section["heading"]
            # One metadata record per chunk: "<file>|<heading>_<n>" plus the
            # running index.
            metadata.extend(
                {"source": source + "_" + str(i), "index": count + i}
                for i in range(len(splits))
            )
            count += len(splits)

        if not docs:
            # No text chunks means there is nothing to index; skip the
            # manuscript instead of passing an empty list to FAISS.from_texts
            # and aborting the whole run.
            print("Skipping {} (chunk size {}): no section text".format(key, chunk_size))
            continue

        db = FAISS.from_texts(docs, embeddings, metadatas=metadata)

        ### Write the vector store object to disk
        db.save_local("data_faiss/faiss_chunked/faiss_db_{}/{}".format(chunk_size, key.replace(".pdf", "")))

# Generating the Abstract FAISS DBs

In [None]:
# Build a single-document FAISS store for each manuscript abstract (saved to
# data_faiss/abstracts/<manuscript>) and one combined store holding every
# abstract (saved to data_faiss/abstracts).

# The embedding function to use (created once and shared by all stores)
embeddings = OpenAIEmbeddings()

docs = []
metadata = []
for key, val in pdf_dict.items():
    key = key.replace(".pdf", "")

    # Flatten line breaks once, when the abstract is collected, rather than
    # re-processing the whole list on every iteration.
    abstract = val["abstract"].replace("\n", " ")
    source = key + "|abstract"
    docs.append(abstract)
    metadata.append({"source": source})

    db = FAISS.from_texts([abstract], embeddings, metadatas=[metadata[-1]])

    # Write the vector store object to disk
    db.save_local("data_faiss/abstracts/{}".format(key))

if docs:
    # Combined store over all abstracts; note this embeds every abstract a
    # second time.
    db = FAISS.from_texts(docs, embeddings, metadatas=metadata)

    # Write the vector store object to disk
    db.save_local("data_faiss/abstracts")
else:
    # With no manuscripts there is nothing to index, so no combined store is
    # written.
    print("No abstracts found; combined abstract store not written")