In [None]:
import os
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.decomposition import PCA
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS

# Never hard-code the API key in the notebook. A key that is already exported
# in the environment is kept as-is; the empty placeholder is only used when no
# key is set at all (a real key must be provided before any OpenAI request).
os.environ.setdefault('OPENAI_API_KEY', '')

# Load the dictionary containing the parsed manuscript PDFs

In [None]:
# Read the pickled dictionary of parsed manuscripts (keyed by PDF file name).
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data_input/scipdf.pkl', 'rb') as pkl_file:
    pdf_dict = pickle.load(pkl_file)

# Manuscript id (file name without the ".pdf" suffix) -> DOI of that manuscript.
pdf_doi_dict = {
    file_name.replace(".pdf", ""): record["doi"]
    for file_name, record in pdf_dict.items()
}

# Load the FAISS embedding vector database

In [None]:
# Embedding function handed to the vector store when it is loaded
embeddings = OpenAIEmbeddings()

# Load the saved FAISS vector store from disk.
# allow_dangerous_deserialization=True opts in to unpickling the stored
# docstore, so only point this at index folders you created yourself.
db = FAISS.load_local(
    folder_path="data_faiss/abstracts/",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

# Part 1 - LLM-based trait-category classification

In [None]:
# Curated reference labels: manuscript id (column "PDF") -> trait category.
trait_class = pd.read_csv("data_figures/36.abstract.curated.tsv", sep="\t")
col_name = "Secondary" # "Traits"
trait_class_dict = trait_class.set_index('PDF')[col_name].to_dict()

# The prompt is the same for every manuscript and every model: build it once.
query = """You are provided with an abstract of a research paper dealing with genetic mapping results. \
                    Your goal is to categorize the research paper based on the types of traits that were measured. \
                    You are to choose from the following categories:
                        - Agronomic \
                        - Abiotic \
                        - Biotic \
                    Select only a single category that best describes the manuscript. \
                    Your response should only include the name of the category and no other information.
                """

# Loop-invariant: a single embedding function serves every per-manuscript index.
embeddings = OpenAIEmbeddings()

# Create the output folder up front so a long run cannot fail at the final write.
os.makedirs("data_output/abstracts", exist_ok=True)

for model in ["gpt-3.5-turbo-1106", "gpt-4"]:
    # temperature is a sampling setting of the chat model. Passing it to
    # RetrievalQA.from_chain_type only forwards it to the chain constructor,
    # which has no such field, so it is configured on the LLM instead.
    llm = ChatOpenAI(model_name=model, temperature=0.7)

    response_list = []
    for pdf_name in pdf_dict:
        key = pdf_name.replace(".pdf", "")

        # Every manuscript has its own FAISS index under data_faiss/abstracts/<id>
        db = FAISS.load_local("data_faiss/abstracts/{}".format(key), embeddings,
                              allow_dangerous_deserialization=True)
        retriever = db.as_retriever()

        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
        abstract_response = qa(query)

        # One row per manuscript: id, DOI, curated label, model prediction
        response_list.append([key, pdf_doi_dict[key], trait_class_dict[key], abstract_response['result']])

    # Passing the column names to the constructor also works for an empty result list
    output_df = pd.DataFrame(response_list, columns=["id", "doi", "label", "pred"])
    output_df.to_csv(f"data_output/abstracts/36.abstracts.{model}.tsv", sep="\t")

In [None]:
# Bold 13 pt fonts for all text elements of the figure
plt.rcParams.update({'font.weight': 'bold', 'font.size': 13,
                     'axes.labelweight': 'bold', 'axes.titleweight': 'bold'})
# figure size in inches
fig, ax = plt.subplots(figsize=(3, 4))

results = pd.read_csv("data_figures/36.abstract.tsv", sep="\t")
# Scale the "correct" column by 100 so that its mean reads as a percentage
results["correct"] = results["correct"] * 100

# Average "correct" per (Model, doi). Only that column is aggregated, so any
# additional column in the TSV can neither break mean() nor shift the
# column names assigned afterwards.
results = (
    results.groupby(["Model", "doi"])["correct"]
    .mean()
    .reset_index()
    .rename(columns={"correct": "% correct"})
)
print(results[["Model", "% correct"]].groupby("Model").mean())

# Draw explicitly on the axes created above instead of relying on pyplot state
sns.barplot(data=results, x="Model", y="% correct", color="black", ax=ax)

# Part 2 - Classification based on PCA of abstract embedding vectors

# Load the abstract class data

In [None]:
import random

# Curated reference labels: manuscript id (column "PDF") -> trait category.
trait_class = pd.read_csv("data_figures/36.abstract.curated.tsv", sep="\t")
col_name = "Secondary" # "Traits"
trait_class_dict = trait_class.set_index('PDF')[col_name].to_dict()

# Distinct categories, in order of first appearance in the table
trait_classes = trait_class[col_name].unique().tolist()
n = len(trait_classes)

# One random hex colour per category. The colours come from a dedicated,
# seeded generator so that they are identical on every "Restart & Run All"
# and the global `random` state is left untouched.
COLOR_SEED = 0
color_rng = random.Random(COLOR_SEED)
color_dict = {
    class_name: '#%06X' % color_rng.randint(0, 0xFFFFFF)
    for class_name in trait_classes
}

# Get the embedding vector for all abstracts

In [None]:
def get_faiss_embeddings(faiss_db_name):
    """Return the vectors stored in a FAISS vector store as a DataFrame.

    Parameters
    ----------
    faiss_db_name : str
        Folder of the saved FAISS vector store to load.

    Returns
    -------
    pandas.DataFrame
        One row per stored vector, indexed by manuscript id (the keys of the
        global ``pdf_dict`` with ".pdf" removed).

    Raises
    ------
    ValueError
        If the number of stored vectors differs from the number of
        manuscripts in ``pdf_dict``.

    Notes
    -----
    Rows are labelled purely by position. This assumes the vectors were added
    to the store in the same order as the keys of ``pdf_dict`` - TODO confirm
    against the code that built the index.
    """
    key_list = [key.replace(".pdf", "") for key in pdf_dict.keys()]

    # The store does not depend on the manuscript, so load and read it once
    # instead of once per manuscript.
    embeddings = OpenAIEmbeddings()
    db = FAISS.load_local(faiss_db_name, embeddings,
                          allow_dangerous_deserialization=True)

    n_vectors = db.index.ntotal
    if n_vectors != len(key_list):
        raise ValueError(
            "FAISS store '{}' holds {} vectors but pdf_dict has {} manuscripts; "
            "cannot label the embeddings by position".format(
                faiss_db_name, n_vectors, len(key_list))
        )

    # reconstruct_n(0, n) returns all n stored vectors as one (n, dim) array
    emb = db.index.reconstruct_n(0, n_vectors) if n_vectors else []
    emb_df = pd.DataFrame(emb)
    emb_df.index = key_list
    return emb_df

emb_df = get_faiss_embeddings("data_faiss/abstracts")

# Generate the PCA plot

In [None]:
#https://stackoverflow.com/questions/20126061/creating-a-confidence-ellipse-in-a-scatterplot-using-matplotlib

def _covariance_ellipse(x, y, n_std=2.0, n_points=1000):
    """Outline of the covariance ellipse of a 2-D point cloud.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points (same length).
    n_std : float
        Half-axis length in standard deviations along each principal axis.
    n_points : int
        Number of points used to trace the outline.

    Returns
    -------
    numpy.ndarray or None
        Array of shape (2, n_points) with the x coordinates in row 0 and the
        y coordinates in row 1, or None when fewer than two points are given
        (the sample covariance is undefined in that case).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.size < 2:
        return None
    cov = np.cov(x, y)
    # The covariance matrix is symmetric: eigh returns real eigenpairs
    val, rot = np.linalg.eigh(cov)
    # Round-off can make a zero eigenvalue slightly negative; clip before sqrt
    val = np.sqrt(np.clip(val, 0.0, None))
    center = np.array([[x.mean()], [y.mean()]])
    t = np.linspace(0, 2.0 * np.pi, n_points)
    xy = np.stack((np.cos(t), np.sin(t)), axis=-1)
    # Scale the unit circle by the std devs, rotate onto the principal axes, shift to the mean
    return n_std * rot @ (val * xy).T + center


# Project the abstract embeddings onto their first two principal components.
# The fitted model is kept separately; `pca` holds the projected coordinates.
pca_model = PCA(n_components=2)
pca = pca_model.fit_transform(np.array(emb_df))

pdf_doi = list(emb_df.index)
pdf_class  = [trait_class_dict[ix] for ix in emb_df.index]
pdf_colors = [color_dict[trait_class_dict[ix]] for ix in emb_df.index]

# PC2 is mirrored so that the legend fits better into the plot
test = pd.DataFrame({"PC1": pca[:, 0], "PC2": -pca[:, 1]})
test["color"] = pdf_class

# Set font properties globally using rcParams
plt.rcParams.update({'font.weight': 'bold', 'axes.labelweight': 'bold', 'axes.titleweight': 'bold',
                     'font.size': 10, 'axes.labelsize': 10})
fig, ax = plt.subplots(figsize=(3, 3))

# One explicit colour per category, shared by the points and their ellipse,
# so the two cannot drift apart whatever the number of categories.
groups = list(test["color"].unique())
palette = dict(zip(groups, sns.color_palette(n_colors=len(groups))))

sns.scatterplot(data=test, x="PC1", y="PC2", hue="color",
                hue_order=groups, palette=palette, ax=ax) # sizes=100,

for group in groups:
    tmp = test[test["color"] == group]
    outline = _covariance_ellipse(tmp["PC1"], tmp["PC2"])
    if outline is None:
        # A category with a single abstract has no covariance: draw no ellipse
        continue
    ax.plot(*outline, color=palette[group])
ax.legend(loc='upper right', title="Category", prop={'size': 8})