# RAG Pipeline

### Importing Libraries

In [1]:
import os
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain_pinecone import PineconeVectorStore

from pinecone import Pinecone

from ragas import evaluate
from ragas.metrics import faithfulness, SemanticSimilarity

from datasets import Dataset

### Setting API Keys & Initialising Pinecone

In [2]:
# Setting API Keys
# Credentials are read from the environment instead of being written into the
# notebook, so real keys never end up in version control or shared copies.
# Export OPENAI_API_KEY and PINECONE_API_KEY before starting the kernel.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Fail fast with a clear message rather than at the first remote call
missing_keys = [
    key_name
    for key_name, key_value in (
        ('OPENAI_API_KEY', OPENAI_API_KEY),
        ('PINECONE_API_KEY', PINECONE_API_KEY))
    if not key_value]
if missing_keys:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(missing_keys)}")

# Initialising Pinecone
pc = Pinecone(api_key = PINECONE_API_KEY)

### Loading Ground Truth & Splitting into Train/Val/Test

In [3]:
# Loading ground truth dataset
ground_truth_df = pd.read_csv('ground_truth_dataset.csv')

# Getting unique paper names
unique_papers = ground_truth_df['filename'].unique()
print(f"Total ground truth papers: {len(unique_papers)}")

# Splitting at paper level (not row level) so every question about a given
# paper lands in exactly one split: half to train, the rest halved again.
train_papers, temp_papers = train_test_split(
    unique_papers, test_size=0.5, random_state=42)
val_papers, test_papers = train_test_split(
    temp_papers, test_size=0.5, random_state=42)

print(f"Train: {len(train_papers)} papers")
print(f"Val: {len(val_papers)} papers")
print(f"Test: {len(test_papers)} papers")

# Creating separate dataframes by selecting the rows of each split's papers
paper_of_row = ground_truth_df['filename']
train_df = ground_truth_df[paper_of_row.isin(train_papers)]
val_df = ground_truth_df[paper_of_row.isin(val_papers)]
test_df = ground_truth_df[paper_of_row.isin(test_papers)]

Total ground truth papers: 20
Train: 10 papers
Val: 5 papers
Test: 5 papers


### Preprocessing and Loading the PDFs

In [4]:
# Defining the preprocessing function
def preprocess_text(text):
    """Strip citation noise and layout clutter from extracted paper text.

    The substitutions below are applied one after another, in order, and the
    result is returned with leading/trailing whitespace removed.
    """
    # (pattern, replacement, flags) - order matters, later rules see the
    # output of earlier ones.
    substitutions = (
        # "References" + newline, then everything up to (not including) the
        # next line holding a single alphabetic word, or the end of the text
        (r'References\s*\n[\s\S]*?(?=\n[A-Z][a-z]+\s*\n|$)', '', re.IGNORECASE),
        # Numeric citations such as [3] or [1, 2, 5]
        (r'\[\d+(?:,\s*\d+)*\]', '', 0),
        # Author-year citations of the form (Name et al., 2019)
        (r'\(\w+\s+et\s+al\.,?\s*\d{4}\)', '', 0),
        # Author-year citations of the form (Name and Name, 2019)
        (r'\(\w+\s+and\s+\w+,?\s*\d{4}\)', '', 0),
        # Figure/table captions, up to the newline that ends the caption line
        (r'(Figure|Fig\.|Table)\s+\d+[:\.].*?(?=\n)', '', re.IGNORECASE),
        # Email addresses
        (r'\S+@\S+', '', 0),
        # http(s) URLs
        (r'https?://\S+', '', 0),
        # Runs of three or more newlines become one blank line
        (r'\n{3,}', '\n\n', 0),
        # Runs of spaces become a single space
        (r' {2,}', ' ', 0),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

# Defining the loading function
def load_docs(paper_list, folder='biochar_papers/'):
    """Load each listed PDF as a single cleaned Document.

    Args:
        paper_list: Iterable of PDF filenames, relative to `folder`.
        folder: Directory that holds the PDFs.

    Returns:
        A list with one Document per file that exists on disk. Its text is
        all pages joined with a space and passed through preprocess_text;
        its metadata is {'source': filename}. Filenames that do not exist
        under `folder` are skipped without any message.
    """
    docs = []
    for filename in paper_list:
        path = os.path.join(folder, filename)
        if not os.path.exists(path):
            continue
        page_texts = [page.page_content for page in PyPDFLoader(path).load()]
        cleaned_text = preprocess_text(' '.join(page_texts))
        docs.append(Document(page_content=cleaned_text, metadata={'source': filename}))
    return docs

### Chunking and Embedding Documents

In [5]:
# Splitting documents
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Initialising embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Creating a vector store with all biochar papers
all_papers = os.listdir('biochar_papers/')
all_papers = [f for f in all_papers if f.endswith('.pdf')]

batch_size = 100

all_docs = load_docs(all_papers)
all_chunks = splitter.split_documents(all_docs)

# Give every chunk a deterministic ID of the form "<source>-<n>", where n is
# the chunk's position within its own paper. Without explicit IDs each run
# stores the chunks under newly generated IDs, so re-running this cell
# duplicates every vector; with stable IDs a re-run overwrites in place.
chunks_seen_per_source = {}
all_chunk_ids = []
for chunk in all_chunks:
    source_name = chunk.metadata['source']
    position = chunks_seen_per_source.get(source_name, 0)
    chunks_seen_per_source[source_name] = position + 1
    # NOTE(review): Pinecone is understood to require ASCII IDs - confirm.
    # Non-ASCII filename characters are escaped rather than dropped.
    ascii_source = source_name.encode('ascii', 'backslashreplace').decode('ascii')
    all_chunk_ids.append(f"{ascii_source}-{position}")

index = pc.Index("biochar-final")
vector_store = PineconeVectorStore(index=index, embedding=embeddings)

# Uploading in batches of `batch_size` chunks
for i in range(0, len(all_chunks), batch_size):
    batch = all_chunks[i:i + batch_size]
    batch_ids = all_chunk_ids[i:i + batch_size]
    vector_store.add_documents(batch, ids=batch_ids)

  embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
Ignoring wrong pointing object 5 0 (offset 0)
Ignoring wrong pointing object 7 0 (offset 0)
Ignoring wrong pointing object 9 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 77 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 81 0 (offset 0)
Ignoring wrong pointing object 107 0 (offset 0)
Ignoring wrong pointing object 109 0 (offset 0)
Ignoring wrong pointing object 111 0 (offset 0)


### Defining Extraction Queries & Prompts 

In [6]:
# Defining extraction queries
# One question per co-benefit theme. The order is significant: the final
# extraction cell indexes this list by position (0-3) when it renames the
# result columns, so reordering or inserting entries changes that mapping.
extraction_queries = [
    "What crop yield and soil quality improvements did biochar provide in this study, and under what specific conditions (soil pH, texture, climate)?",
    "What water-related benefits (retention, drought resilience, irrigation needs) were observed, and in which soil types or climate conditions?",
    "What economic or social co-benefits were reported or discussed (income, poverty, food security), including indirect benefits from agricultural improvements?",
    "What environmental impacts (GHG emissions, carbon sequestration, nutrient leaching) were measured?"]

In [7]:
# Defining two prompts to test
# Both templates contain a literal "{query}" marker. extract_direct_answer fills
# it with str.replace (not str.format), so the marker must stay spelled exactly
# "{query}" and no other brace escaping is needed in the template text.

# Short variant: terse instructions only.
prompt_minimal = """Extract study findings about biochar co-benefits.

- Focus on specific co-benefits
- Include conditions and numbers if given
- Exclude biochar type or rate

Question: {query}

If not mentioned, say "not mentioned in this study"."""

# Long variant: adds a role, worked examples, and explicit guidance on review
# papers and implied socioeconomic outcomes.
prompt_detailed = """You are an environmental research assistant tasked with extracting findings from this 
document about biochar co-benefits.

Instructions:
- Focus on specific co-benefits (e.g. yield improvement, water retention, poverty reduction)
- Include both observed benefits from field trials and discussed potential benefits
- Review papers that synthesise benefits across multiple studies should report the synthesised findings
- Include conditions under which these benefits occurred (soil pH, texture, climate, farmer type)
- Combine benefits with their conditions (e.g. "yield improved in acidic soils")
- Include numbers when available (e.g. 30% yield increase, pH 5.2)
- For socioeconomic benefits, include implied outcomes (e.g. "yield increases expected to improve food security")
- DO NOT mention the type of biochar or application rate of the biochar

Question: {query}

If no specific evidence addresses this question, say "not mentioned in this study"."""

In [8]:
# Defining extraction function
def extract_direct_answer(vector_store, query, source_doc, k, prompt_template):
    """Answer one question about one paper using retrieval-augmented generation.

    Args:
        vector_store: Vector store exposing `as_retriever`.
        query: The question text substituted into the prompt.
        source_doc: Value matched against the chunks' 'source' metadata, so
            retrieval is limited to a single paper.
        k: Number of chunks to retrieve.
        prompt_template: Template containing a literal "{query}" marker.

    Returns:
        dict with 'answer' (the chain's result text) and 'contexts' (the
        page_content of each source document the chain returned).
    """
    llm = ChatOpenAI(temperature=0, model_name='gpt-4')
    retriever = vector_store.as_retriever(search_kwargs={"k": k, "filter": {"source": source_doc}})
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)
    # str.replace rather than str.format: only the "{query}" marker is touched
    full_prompt = prompt_template.replace("{query}", query)
    # NOTE(review): the whole filled-in prompt, not just `query`, is what the
    # chain receives as its "query" - presumably it is also what the retriever
    # searches with; confirm that this is intended.
    # `invoke` replaces the deprecated direct call `qa({...})`.
    response = qa.invoke({"query": full_prompt})
    return {
        "answer": response["result"],
        "contexts": [doc.page_content for doc in response["source_documents"]]}

### Evaluation Loop for Prompt Comparison

In [9]:
# Testing prompts on training data
def evaluate_prompt_on_split(split_df, prompt_template, prompt_name, k=5):
    """Run extraction over the answerable rows of a split and score it with ragas.

    Only rows whose 'found' value equals 'yes' (case-insensitive) are used.

    Args:
        split_df: DataFrame with 'found', 'query', 'filename' and
            'expected_answer' columns.
        prompt_template: Prompt text passed through to extract_direct_answer.
        prompt_name: Label used in the printed summary.
        k: Number of chunks to retrieve per question.

    Returns:
        dict with the mean 'faithfulness', the mean 'semantic_similarity'
        and 'n_samples' (number of rows evaluated).

    Raises:
        ValueError: if no row in split_df has found == 'yes'.
    """
    # astype(str) makes the comparison safe for missing values (NaN), which
    # would otherwise fail on `.lower()`.
    is_answerable = split_df['found'].astype(str).str.lower() == 'yes'
    answerable_rows = split_df[is_answerable]
    if answerable_rows.empty:
        raise ValueError(f"{prompt_name}: no rows with found == 'yes' to evaluate")

    eval_data = []

    for _, row in answerable_rows.iterrows():
        result = extract_direct_answer(vector_store, row['query'], row['filename'], k, prompt_template)

        eval_data.append({
            'question': row['query'],
            'answer': result['answer'],
            'contexts': result['contexts'],
            'reference': row['expected_answer']})

    dataset = Dataset.from_list(eval_data)
    results = evaluate(dataset, metrics=[faithfulness, SemanticSimilarity()])

    avg_results = {
        'faithfulness': np.mean(results['faithfulness']),
        'semantic_similarity': np.mean(results['semantic_similarity']),
        'n_samples': len(eval_data)}

    print(f"\n{prompt_name} Results:")
    print(f"  Faithfulness: {avg_results['faithfulness']:.3f}")
    print(f"  Semantic Similarity: {avg_results['semantic_similarity']:.3f}")
    print(f"  Samples: {avg_results['n_samples']}")

    return avg_results

# Testing both prompts on training data
# Both runs use the same split (train_df) and the same k=5, so the prompt
# template is the only thing that differs between the two result dicts.
train_results_minimal = evaluate_prompt_on_split(train_df, prompt_minimal, "Prompt One", k=5)
train_results_detailed = evaluate_prompt_on_split(train_df, prompt_detailed, "Prompt Two", k=5)

  response = qa({"query": full_prompt})


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]


Prompt One Results:
  Faithfulness: 0.641
  Semantic Similarity: 0.892
  Samples: 40


Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]


Prompt Two Results:
  Faithfulness: 0.797
  Semantic Similarity: 0.893
  Samples: 40


### Tuning K on Validation 

In [12]:
# Selecting prompt to use
best_prompt = prompt_detailed

k_values = [3, 5, 7, 10]
validation_results = {}

for k in k_values:
    # Build one evaluation record per answerable validation row at this k
    eval_data = []
    for _, row in val_df.iterrows():
        if row['found'].lower() != 'yes':
            continue
        result = extract_direct_answer(vector_store, row['query'], row['filename'], k, best_prompt)
        eval_data.append(dict(
            question=row['query'],
            answer=result['answer'],
            contexts=result['contexts'],
            reference=row['expected_answer']))

    # Score this k with the same two ragas metrics used for prompt selection
    dataset = Dataset.from_list(eval_data)
    results = evaluate(dataset, metrics=[faithfulness, SemanticSimilarity()])

    mean_faithfulness = np.mean(results['faithfulness'])
    mean_similarity = np.mean(results['semantic_similarity'])
    validation_results[k] = {
        'faithfulness': mean_faithfulness,
        'semantic_similarity': mean_similarity}

    print(f"k={k}: Faithfulness = {validation_results[k]['faithfulness']:.3f}, Semantic Similarity = {validation_results[k]['semantic_similarity']:.3f}")

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

k=3: Faithfulness = 0.679, Semantic Similarity = 0.874


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

k=5: Faithfulness = 0.751, Semantic Similarity = 0.862


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

k=7: Faithfulness = 0.804, Semantic Similarity = 0.866


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

k=10: Faithfulness = 0.806, Semantic Similarity = 0.863


### Testing with Best K and Best Prompt

In [13]:
# Selecting k value to use
best_k = 7

# One evaluation record per answerable test row, using the chosen k and prompt
test_eval_data = []
for _, row in test_df.iterrows():
    if row['found'].lower() != 'yes':
        continue
    result = extract_direct_answer(vector_store, row['query'], row['filename'], best_k, best_prompt)
    test_eval_data.append(dict(
        question=row['query'],
        answer=result['answer'],
        contexts=result['contexts'],
        reference=row['expected_answer']))

test_dataset = Dataset.from_list(test_eval_data)
test_results = evaluate(test_dataset, metrics=[faithfulness, SemanticSimilarity()])

# Reporting the per-metric means over the held-out test rows
test_faithfulness = np.mean(test_results['faithfulness'])
test_similarity = np.mean(test_results['semantic_similarity'])
print(f"  Faithfulness: {test_faithfulness:.3f}")
print(f"  Semantic Similarity: {test_similarity:.3f}")
print(f"  Samples: {len(test_eval_data)}")

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

  Faithfulness: 0.835
  Semantic Similarity: 0.885
  Samples: 20


In [15]:
# Comparing ground truth vs model extraction
# The 'found' flag is compared case-insensitively, matching the evaluation
# cells; a plain == 'yes' would silently drop rows recorded as 'Yes'.
test_found_mask = test_df['found'].astype(str).str.lower() == 'yes'
test_papers_list = test_df[test_found_mask]['filename'].unique()

# Seeded generator so the same papers are shown on every run
comparison_rng = np.random.default_rng(42)
random_papers = comparison_rng.choice(test_papers_list, size=min(2, len(test_papers_list)), replace=False)

for paper_num, paper in enumerate(random_papers, 1):
    print(f"PAPER {paper_num}: {paper}")

    # All ground-truth rows for this paper, including any not marked as found
    paper_rows = test_df[test_df['filename'] == paper]

    for _, row in paper_rows.iterrows():
        print(f"\nQuery: {row['query']}")
        print(f"\nGround Truth: {row['expected_answer']}")

        extracted = extract_direct_answer(vector_store, row['query'], row['filename'], best_k, best_prompt)
        print(f"\nModel Extracted: {extracted['answer']}")

PAPER 1: tisserant_2019_global.pdf

Query: What crop yield and soil quality improvements did biochar provide in this study, and under what specific conditions (soil pH, texture, climate)?

Ground Truth: Biochar increased crop yields by an average of 25% in tropical soils, especially those that are acidic, highly weathered, and low in CEC. Effects in temperate soils were mostly neutral or negative. 

Model Extracted: Biochar provided several improvements to crop yield and soil quality in this study. In highly weathered, acidic soils with low cation exchange capacity (CEC), typically found in tropical regions, biochar application resulted in a positive response in terms of yield. An average increase in yield of 25% was observed in these tropical soils. However, biochar had no or very little positive or even negative effects in temperate soil. 

Increased soil moisture, which can be a result of biochar application, can increase yield in temperate regions that have less weathered soils and


Model Extracted: This study does not provide specific information on crop yield improvements due to biochar. However, it does mention that biochar addition to soil can increase soil pH. In this particular study, liming treatment and biochar addition resulted in an increase of soil pH by 0.4 pH units on average. The soil pH in the control treatment was 6.3 on average. The study was conducted on a clay loam soil with a particle size distribution of 37% sand, 27% silt, and 36% clay, under a temperate climate with a mean annual temperature of 9.4 °C and mean annual precipitation of 1054 mm. The study also found that biochar amendment reduced N2O emissions by 52% compared to the control, suggesting an improvement in soil quality.

Query: What water-related benefits (retention, drought resilience, irrigation needs) were observed, and in which soil types or climate conditions?

Ground Truth: Biochar contributed to increased soil water retention and improved soil porosity, particularly in lig

In [17]:
# Extracting from all papers
# One row per paper: the filename plus one answer per extraction query
all_extractions = []

for filename in all_papers:
    paper_extractions = {'source_document': filename}
    paper_extractions.update({
        query: extract_direct_answer(vector_store, query, filename, best_k, best_prompt)['answer']
        for query in extraction_queries})
    all_extractions.append(paper_extractions)

# Creating dataframe
extractions_df = pd.DataFrame(all_extractions)

# Renaming columns: the answer columns are keyed by the full query text, so
# swap each one for a short name according to its position in the query list
short_column_names = {
    extraction_queries[0]: 'yield_conditions',
    extraction_queries[1]: 'water_benefits',
    extraction_queries[2]: 'socioeconomic_benefits',
    extraction_queries[3]: 'environmental_impacts'}
extractions_df = extractions_df.rename(columns=short_column_names)

# Saving final results
extractions_df.to_csv('biochar_cobenefit_extractions.csv', index=False)

# Previewing
print(extractions_df.head())

             source_document  \
0          wang_2019_usa.pdf   
1      yeboah_2020_ghana.pdf   
2    shoudho_2024_global.pdf   
3  juraszeck_2021_europe.pdf   
4    katterer_2019_kenya.pdf   

                                    yield_conditions  \
0  The study does not provide specific evidence o...   
1  The study found that biochar improved the grow...   
2  The study found that the utilization of biocha...   
3  The study found that biochar application incre...   
4  The study found that biochar significantly imp...   

                                      water_benefits  \
0  The study found that biochar can improve soil ...   
1  The study observed that biochar improves water...   
2  The document mentions that biochar enhances so...   
3  The study observed that biochar application si...   
4  The study observed that the application of bio...   

                              socioeconomic_benefits  \
0                       Not mentioned in this study.   
1                    