# Ollama Notebook 
 A notebook to extract relevant, predefined parameters from research articles and provide the results in a desired format. 

In [3]:
# if required, install PyMuPDF (it provides the `fitz` module imported below)
# %pip install pymupdf 

In [4]:
# importing relevant libraries 
import pandas as pd
import fitz
import json
import subprocess
import os  
from tqdm import tqdm

In [5]:
# Expose a single GPU, identified by UUID, through CUDA_VISIBLE_DEVICES for
# this kernel and for the subprocesses it launches.
# NOTE(review): the UUID is specific to one machine - confirm or replace it
# before running elsewhere.
os.environ.update(CUDA_VISIBLE_DEVICES='GPU-4ce5b5b6-0e10-58a3-b09a-2f4946277a5a')

The `extract_text` function reads in a pdf, and appends the entire content into a single string. It takes the path to a pdf as its argument.

In [6]:
def extract_text(pdf_path):
    """Return the plain text of every page of a PDF as a single string.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file; passed unchanged to `fitz.open`.

    Returns
    -------
    str
        The text of each page followed by a newline, concatenated in page
        order. An empty string when the document has no pages.
    """
    # Open the document in a `with` block so the underlying file is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text('text') for page in tqdm(document)]
    # Join once instead of growing a string with `+=` inside the loop.
    return ''.join(page_text + '\n' for page_text in page_texts)

Debugging cell

In [7]:
def save_text(text, ouput='text.txt'):
    """Write `text` to a UTF-8 encoded file, replacing any existing content.

    Parameters
    ----------
    text : str
        The content to write.
    ouput : str
        Destination file path; defaults to `text.txt` in the working
        directory. (The parameter name is kept as spelled so that existing
        keyword callers keep working.)
    """
    with open(ouput, mode='w', encoding='utf-8') as handle:
        handle.write(text)

In [8]:
# Debug aid: dump the extracted text of one article to the default output
# file (`text.txt`) so it can be inspected by hand.
save_text(extract_text('./articles/d4nr00473f.pdf'))

100%|██████████| 12/12 [00:00<00:00, 213.98it/s]


The `query_ollama` function calls the local Ollama LLM. The arguments for this function are:

* prompt in utf-8 string format 
* desired model - 'mistral' is the default argument

In [9]:
# Ollama install directory (a per-user Windows path), echoed as the cell output.
# NOTE(review): `query_ollama` does not read this module-level value -
# confirm whether this cell is still needed.
ollama_path = r"C:\Users\Faisal\AppData\Local\Programs\Ollama"
ollama_path

'C:\\Users\\Faisal\\AppData\\Local\\Programs\\Ollama'

In [10]:
def query_ollama(
        prompt,
        model='mistral',
        ollama_executable=None
        ):
    """Send `prompt` to a local Ollama model and return the text it prints.

    Runs `<executable> run <model>` as a subprocess and feeds the prompt to it
    on standard input.

    Parameters
    ----------
    prompt : str
        The prompt text; it is encoded as UTF-8 before being sent.
    model : str
        Name of the Ollama model to run. Defaults to 'mistral'.
    ollama_executable : str, optional
        Path to the Ollama executable. When omitted, the default per-user
        Windows install location is used if it exists, otherwise `ollama` is
        looked up on PATH.

    Returns
    -------
    str
        The process's standard output decoded as UTF-8 (bytes that cannot be
        decoded are replaced rather than raising).

    Raises
    ------
    RuntimeError
        If the Ollama process exits with a non-zero status; the message
        carries the exit code and whatever the process wrote to stderr.
    FileNotFoundError
        Raised by `subprocess.run` if the executable cannot be found.
    """
    import shutil  # local import: only needed to resolve the executable

    if ollama_executable is None:
        default_path = r"C:\Users\Faisal\AppData\Local\Programs\Ollama\ollama.exe"
        # Prefer the original install location when it exists, then PATH.
        # Falling back to the default path last keeps the original
        # FileNotFoundError when Ollama is not installed at all.
        ollama_executable = (
            shutil.which(default_path)
            or shutil.which("ollama")
            or default_path
        )

    result = subprocess.run(
        [ollama_executable, "run", model],
        input=prompt.encode("utf-8"),
        capture_output=True
    )

    # Surface failures (unknown model, server not reachable, ...) instead of
    # silently returning whatever partial stdout was produced.
    if result.returncode != 0:
        error_text = result.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"ollama exited with code {result.returncode} "
            f"for model {model!r}: {error_text}"
        )

    return result.stdout.decode("utf-8", errors="replace")


`extract_conditions` builds the prompt, queries the model, and handles replies that cannot be parsed as JSON. Arguments are:
 * text extracted from the pdf (only the first 2000 characters are sent to the model)
 * model - default is 'mistral'
 
 Output should be the relevant conditions in json format.

In [11]:
def _parse_conditions(response):
    """Turn the model's reply into a list of dicts, or None if no JSON is found."""
    candidates = [response]
    # Models often wrap the array in prose or a Markdown code fence, so also
    # try the outermost [...] span of the reply.
    start, end = response.find("["), response.rfind("]")
    if 0 <= start < end:
        candidates.append(response[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if isinstance(parsed, dict):
            # A single object instead of an array: treat it as one record.
            parsed = [parsed]
        if isinstance(parsed, list):
            # Drop entries that are not objects; callers add keys to each one.
            return [item for item in parsed if isinstance(item, dict)]
    return None


def extract_conditions(
        text,
        model="mistral",
        max_chars=2000
        ):
    """Ask the model for the experimental parameters mentioned in `text`.

    Parameters
    ----------
    text : str
        Text extracted from an article.
    model : str
        Model name passed on to `query_ollama`. Defaults to 'mistral'.
    max_chars : int
        Number of leading characters of `text` included in the prompt.
        Defaults to 2000.

    Returns
    -------
    list of dict
        The JSON objects found in the model's reply. An empty list when the
        reply contains no parseable JSON; in that case the first 300
        characters of the raw reply are printed for inspection.
    """
    # Slice outside the f-string: a `#` comment placed inside the template
    # is literal text and would be sent to the model as part of the prompt.
    excerpt = text[:max_chars]
    prompt = f"""
    Extract the following experimental parameters from the text:

    - concentration
    - hydrolysis ratio
    - rate of base addition
    - elemental cation ratio

    Return ONLY a JSON array of objects with keys:
    ["parameter", "value", "unit", "context"].

    Text:
    {excerpt}
    """
    response = query_ollama(prompt, model=model)

    conditions = _parse_conditions(response)
    if conditions is None:
        print("Could not parse JSON, raw output:", response[:300])
        return []

    return conditions


Finally, `process_pdfs` iterates through all the pdfs in a folder, extracts their text, prompts the LLM, and produces a `parsed_conditions.csv` file with the compiled conditions. Arguments are:
* path to a folder containing the pdfs 
* path/name of the output csv file - default is `parsed_conditions.csv`
* chosen model - default is 'mistral'

Output is the csv file. 

In [12]:
def process_pdfs(pdf_folder, output_csv="parsed_conditions.csv", model="mistral"):
    """Extract conditions from every PDF in a folder and write them to a CSV.

    For each PDF the text is extracted with `extract_text`, passed to
    `extract_conditions`, and every returned record is tagged with the name
    of the file it came from under the key "source".

    Parameters
    ----------
    pdf_folder : str
        Folder whose PDF files are processed (sub-folders are not searched).
    output_csv : str
        Path of the CSV file to write. Defaults to 'parsed_conditions.csv'.
    model : str
        Model name passed on to `extract_conditions`. Defaults to 'mistral'.

    Returns
    -------
    None
        The result is the CSV file; a one-line summary is printed.
    """
    # Filter first so the progress bar counts PDFs only. Sort for a
    # reproducible row order (os.listdir order is arbitrary) and match the
    # extension case-insensitively so `.PDF` files are not skipped.
    pdf_files = sorted(
        name for name in os.listdir(pdf_folder)
        if name.lower().endswith(".pdf")
    )

    all_data = []
    for file in tqdm(pdf_files):
        print(f"Processing {file}...")
        text = extract_text(os.path.join(pdf_folder, file))
        conditions = extract_conditions(text, model=model)

        # The model's reply is not guaranteed to be a list of objects:
        # accept a single object, and ignore anything that is not a dict.
        if isinstance(conditions, dict):
            conditions = [conditions]
        elif not isinstance(conditions, list):
            conditions = []

        for condition in conditions:
            if isinstance(condition, dict):
                # Tag each record with the file it was extracted from.
                all_data.append({**condition, "source": file})

    df = pd.DataFrame(all_data)
    df.to_csv(output_csv, index=False)
    print(f"Saved {len(df)} rows to {output_csv}")


Here we call the `process_pdfs` function to analyze all the pdfs. Don't forget to add a path to the pdf folder 

In [13]:
# Run the whole pipeline over the PDFs in ./articles; each file triggers one
# model query.
process_pdfs('./articles', output_csv='parsed_conditions.csv', model='mistral')

  0%|          | 0/1 [00:00<?, ?it/s]

Processing d4nr00473f.pdf...


100%|██████████| 12/12 [00:00<00:00, 344.60it/s]
100%|██████████| 4/4 [00:00<00:00, 84733.41it/s]
100%|██████████| 1/1 [02:50<00:00, 170.70s/it]

Saved 4 rows to parsed_conditions.csv



