In [1]:
# Library imports
import os
import requests
import subprocess
from collections import Counter
import shlex
import re
import markdown2
import genanki
import pymupdf
from huggingface_hub import login
from huggingface_hub import InferenceClient
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# PDF extraction & structuring

In [2]:
# Path of the PDF to process.
# NOTE(review): hard-coded absolute local path; edit before running on another machine.
pdf = "/home/diomir0/Documents/papers/Bedford, 2023 - LSD_Connectivity.pdf"

In [3]:
# Opening the PDF with PyMuPDF and printing its metadata dictionary
# (the 'title' entry is reused later as the Markdown title and file name)
doc = pymupdf.open(pdf)
print(doc.metadata)

{'format': 'PDF 1.4', 'title': 'The effect of lysergic acid diethylamide (LSD) on whole-brain functional and effective connectivity', 'author': 'Peter Bedford', 'subject': 'Neuropsychopharmacology, doi:10.1038/s41386-023-01574-8', 'keywords': '', 'creator': 'Springer', 'producer': '', 'creationDate': "D:20230605192020+05'30'", 'modDate': "D:20230605192216+05'30'", 'trapped': '', 'encryption': None}


In [4]:
# Get Table of Contents; each entry is indexed below as entry[0] (level) and entry[1] (title)
toc = doc.get_toc()

# Lower-cased title of every TOC entry
sec_names = [entry[1].lower() for entry in toc]

# Extracting the sections: lower-cased titles of the entries whose level equals 2
sections = [entry[1].lower() for entry in toc if entry[0] == 2]

In [5]:
# Defining page height from first page
pheight = doc[0].rect.height
# Defining frame height: blocks starting within `pframe` of the top or ending
# within `pframe` of the bottom are later treated as header/footer
pframe = 50

# Computing dominant text size throughout the document:
# collect the font size of every text span on every page
font_sizes = []
for page in doc:
    text_dict = page.get_text("dict")
    for block in text_dict["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                font_sizes.append(span["size"])
font_count = Counter(font_sizes)
# List of (size, count) pairs, most frequent first (at most two entries)
dominant_fonts = font_count.most_common(2)

if not dominant_fonts:
    raise ValueError("No text spans found in the document; cannot determine the main font size")

# Prefer the second most frequent size when it is larger than the most frequent
# one AND occurs more than half as often. The comparison is between the two
# *counts*; the previous version compared the second count with the first *size*.
if (len(dominant_fonts) > 1
        and dominant_fonts[1][0] > dominant_fonts[0][0]
        and dominant_fonts[1][1] > dominant_fonts[0][1] / 2):
    main_font = round(dominant_fonts[1][0])
else:
    main_font = round(dominant_fonts[0][0])

print("The main font size is {}".format(main_font))

The main font size is 9


In [6]:
# Extracting text blocks: one list of blocks per page
pblocks = []
for page in doc:
    text_dict = page.get_text("dict")
    pblocks.append(text_dict["blocks"])

# main_text maps a lower-cased section title to the cleaned body text of that section.
# `key` is the section currently being filled, `value` its accumulated text.
main_text = {}
key = ""
value = ""

# Loop invariants, computed once
toc_has_references = any(sec.lower() == "references" for sec in sections)
figure_caption = re.compile(r'Fig(ure)?\.(\s)?(\d+)?(\w+)?(\s+)?:')
table_caption = re.compile(r'Table(\s)?(\d+)?(\w+)?(\s+)?:')
# A value ending with one of these is glued to the next span without a space
join_endings = ("-", "ﬁ", "ﬂ")
ligatures = ("ﬁ", "ﬂ")

for blocks in pblocks:
    for block in blocks:
        # Removing header and footer blocks
        if block["bbox"][1] > pframe and block["bbox"][3] < pheight-pframe:
            for line in block.get("lines", []):
                spans = line.get("spans", [])
                # A line without spans has no text to classify or collect
                if not spans:
                    continue
                first_text = spans[0]["text"]
                heading = first_text.lower()

                # Get section titles as dict keys
                if heading in sections:
                    key = heading
                    value = ""
                    continue
                # If the references title is not included in the toc but still exists
                if not toc_has_references and heading == "references":
                    key = "references"
                    value = ""
                    continue
                # Excluding figure and table captions: skip the rest of this block
                if figure_caption.match(first_text) or table_caption.match(first_text):
                    break
                # Text found before the first section title is not collected
                if key == "":
                    continue

                # Get the block's text as dict value based on the font size
                for span in spans:
                    # Rounding gives a tolerance of about 0.5 for small size variations
                    size = round(span["size"])
                    if key == "materials and methods":
                        # Text in an article "Materials and Methods" section can have a smaller font size
                        is_body = main_font-1 <= size <= main_font
                    else:
                        is_body = size == main_font
                    if not is_body:
                        continue

                    text = span["text"]
                    # Repairing lines: no space after a trailing hyphen/ligature,
                    # nor before a span that is itself a lone ligature
                    glue = (len(value) > 1 and value[-1] in join_endings) or text in ligatures
                    if glue:
                        value = value + text
                    else:
                        value = value + ' ' + text

        # Cleaning runs after every block, so main_text[key] always holds the text collected so far
        if (key != "" and value != ""):
            # Removing bracketed numeric citations such as "[ 1 , 2 ]" and "[ 3 – 5 ]"
            value = re.sub(r'(;\s)?\[\s(\d(\s,\s+)?)+\s\]', '', value.strip())
            value = re.sub(r'(;\s)?\[\s\d+\s–\s\d+\s\]', '', value.strip())
            # Formatting spaces surrounding commas, dots, and parentheses
            value = re.sub(r'\s,\s', ', ', value.strip())
            value = re.sub(r'\s\.\s', '. ', value.strip())
            value = re.sub(r'\(\s', '(', value.strip())
            value = re.sub(r'\s\)', ')', value.strip())
            # Collapsing runs of whitespace into a single space
            value = re.sub(r'\s+', ' ', value.strip())
            # Replacing the 'ﬁ' and 'ﬂ' ligature characters with plain "fi" and "fl"
            value = re.sub(r'ﬁ', 'fi', value.strip())
            value = re.sub(r'ﬂ', 'fl', value.strip())

            main_text[key] = value


# LLM Summarizing & Question-Generation

In [7]:
# Function calling mistral/7B-Instruct from ollama and returning its output
def ollama_generate(prompt, model='mistral:7B-instruct', temperature=0.4, timeout=None):
    """Send a prompt to the local Ollama server and return the generated text.

    Parameters
    ----------
    prompt : str
        Prompt sent to the model.
    model : str
        Name of the Ollama model to query.
    temperature : float
        Sampling temperature. It is sent inside the request's "options" object,
        which is where the Ollama generate endpoint reads model parameters from;
        as a top-level field it was not applied.
    timeout : float or None
        Passed to ``requests.post``; ``None`` waits indefinitely.

    Returns
    -------
    str
        The "response" field of the JSON reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # Set to True if you want streaming responses
        "system": "You are a helpful assistant that has insight in academic, theoretical knowledge in science and humanities, and that is able to accurately summarize complex texts concisely yet precisely without skipping important details.",
        "options": {"temperature": temperature},
    }
    response = requests.post(url, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of a KeyError on the missing "response" field
    response.raise_for_status()
    return response.json()["response"]

In [8]:
# Download valhalla/t5-base-e2e-qg model from HF - DONE
#login()

#tokenizer = AutoTokenizer.from_pretrained("valhalla/t5-base-e2e-qg")
#model = AutoModelForSeq2SeqLM.from_pretrained("valhalla/t5-base-e2e-qg")

In [8]:
# Path to local HF files (a specific snapshot directory of valhalla/t5-base-e2e-qg in the HF cache)
# NOTE(review): hard-coded absolute path pinned to one snapshot hash; it only exists on the author's machine.
local_path = "/home/diomir0/.cache/huggingface/hub/models--valhalla--t5-base-e2e-qg/snapshots/c652651334cd5516f2bd0f0fb5303a01a678024e" 

# Loading the tokenizer and the seq2seq model from that directory;
# both are used as default arguments of qf_generate
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForSeq2SeqLM.from_pretrained(local_path)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [22]:
# Function calling valhalla/t5-base-e2e-qg
def qf_generate(text, num, tokenizer = tokenizer, model = model):
    """Generate questions about `text` with the question-generation model.

    `num` is used both as the number of beams and as the number of returned
    sequences. The input is truncated to 512 tokens. Every decoded sequence is
    split on '<sep>'; a candidate is kept only if it ends with '?', has more
    than three words and has not been seen before. Order of first appearance
    is preserved.

    Returns a list of question strings.
    """
    encoded = tokenizer(f"{text}", return_tensors="pt", truncation=True, max_length=512)
    generated = model.generate(
        encoded["input_ids"],
        max_new_tokens=128,
        num_return_sequences=num,
        num_beams=num,
        do_sample=False,
    )

    final_questions = []
    already_kept = set()
    for sequence in generated:
        decoded = tokenizer.decode(sequence, skip_special_tokens=True)
        for candidate in decoded.strip().split('<sep>'):
            candidate = candidate.strip()
            # Keep well-formed, sufficiently long, not-yet-seen questions only
            if not candidate.endswith("?"):
                continue
            if len(candidate.split()) <= 3:
                continue
            if candidate in already_kept:
                continue
            already_kept.add(candidate)
            final_questions.append(candidate)

    return final_questions

In [9]:
# Starting ollama from shell; check=True raises CalledProcessError on a non-zero exit code
ollama_cmd = shlex.split("ollama run mistral:7b-instruct")
subprocess.run(ollama_cmd, check=True)

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h[?25l[?25h

CompletedProcess(args=['ollama', 'run', 'mistral:7b-instruct'], returncode=0)

In [10]:
# (Re)Initialize the dictionaries
# sum_dict: section title -> LLM summary and key concepts
# q_dict:   section title -> LLM-generated questions and answers
# Re-running this cell discards cached results, so the generation loop below recomputes every section.
sum_dict = {}
q_dict = {}

In [11]:
# Prompt prefix asking for a precise yet concise summary followed by key concepts;
# the section text is appended to it
sum_instruct = "You are an expert science educator. Given the following text, do two things: 1. Summarize it clearly, precisely and as concisely as precison allows. 2. Then, extract the key concepts or facts as 3–8 concise bullet points. Text: "
# Prompt template asking for five 'Q: ... A:...' pairs; the section text fills {text}
questions_instruct = "You are an expert science and humanities educator. Given the following text, generate a set of five relevant questions and their answers, making sure to only output the questions and their answers in the form of 'Q: ... A:...'. Text: {text}"

for key, section_text in main_text.items():
    # Sections already present in a dictionary are skipped, so re-running the
    # cell only queries the model for what is still missing
    if key not in sum_dict:
        sum_dict[key] = ollama_generate(sum_instruct + section_text)
    if key not in q_dict:
        q_dict[key] = ollama_generate(questions_instruct.format(text=section_text))
    

In [16]:
# Inspecting the raw question/answer text generated for the "results" section
print(q_dict["results"])

 Q: What percentage of unique correlation coefficients in functional connectivity significantly differed between LSD and placebo conditions?
A: About 23% (1993/8646) of the unique correlation coefficients significantly differed between LSD and placebo conditions.

Q: Where was stronger functional connectivity primarily observed under LSD?
A: Stronger functional connectivity was primarily observed in connections involving the bilateral lingual gyrus, bilateral inferior frontal gurys (pars opercularis), right inferior frontal gyrus and right lingual gyrus, left cuneus and right middle frontal gyrus (MFG), and left temporo-occipital middle temporal gyrus (MTG) and left intracalcerine cortex.

Q: Where was weaker functional connectivity primarily found under LSD?
A: Weaker functional connectivity was primarily found between several occipital regions, including left fusiform gyrus and right inferior lateral occipital cortex (LOC), supracalcarine cortex, occipital pole, and between bilateral

In [None]:
# Extract questions and answers using regex
# The source text is taken explicitly from the generated Q/A string; previously
# `text` was only defined as a leftover of the extraction cell (last PDF span).
text = q_dict["results"]
# A question runs from "Q: " up to the line break before its "A:" (or the end of the text)
questions = re.findall(r"Q:\s.*?(?=\nA:|\Z)", text, re.DOTALL)
# An answer runs from "A: " up to the line break(s) before the next "Q:" (or the end of the text)
answers = re.findall(r"A:\s.*?(?=\n+Q:|\Z)", text, re.DOTALL)

# Reformat with Q1:/A1: (the [2:] slice drops the leading "Q:" / "A:")
numbered_questions = [f"Q{i+1}: {q.strip()[2:].strip()}" for i, q in enumerate(questions)]
numbered_answers = [f"A{i+1}: {a.strip()[2:].strip()}" for i, a in enumerate(answers)]

# Join into final string
final_output = "\n".join(numbered_questions) + "\n\nAnswers:\n" + "\n".join(numbered_answers)

# Writing the summaries

In [67]:
# Writing the summary and main concepts of the text (book chapter, article, etc) in a MD file for further use (e.g. upload to Joplin)
def write_markdown_file(doc, sum_dict, q_dict, output_path):
    """Write summaries, key concepts, questions and answers to a Markdown file.

    Parameters
    ----------
    doc : object
        Only ``doc.metadata['title']`` is read; it becomes the top-level heading.
    sum_dict : dict
        Section title -> summary text. Its key order defines the section order.
    q_dict : dict
        Section title -> text made of "Q: ..." / "A: ..." pairs. A section
        missing from this dict gets an empty question and answer list.
    output_path : str
        Destination file, overwritten if it exists (UTF-8).

    Layout: one numbered "##" section per key holding the summary, a
    "### Key concepts" list and a "### Questions" list; all answers are
    gathered in a final "## Answers" section, grouped by section.
    """
    numbered_answers = []
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f"# {doc.metadata['title']}\n\n\n")
        for i, key in enumerate(sum_dict.keys(), start=1):
            f.write(f"## {i}. {key.capitalize()}\n")
            # Dropping the "1. Summary:" label and normalizing the key-concepts heading
            summary = re.sub(r'(1\.[\s])?Summary:', '', sum_dict[key])
            summary = re.sub(r'(2\.[\s])?Key Concepts(/Facts)?(\:)?\n(\n)?', '### Key concepts\n', summary)
            # Turning numbered list items into bullets. Anchored at line start so
            # that a sentence ending with a number ("... rose to 23. Then ...")
            # is left untouched.
            summary = re.sub(r'^([ \t]*)\d+\.\s', r'\1- ', summary, flags=re.MULTILINE)
            f.write(summary.strip() + "\n\n")

            # Writing questions
            f.write("### Questions\n")
            qa_text = q_dict.get(key, "")
            # Extract questions and answers using regex
            questions = re.findall(r"Q:\s.*?(?=\nA:|\Z)", qa_text, re.DOTALL)
            answers = re.findall(r"A:\s.*?(?=\n+Q:|\Z)", qa_text, re.DOTALL)
            # Reformat with Q1:/A1: (the [2:] slice drops the leading "Q:" / "A:")
            numbered_questions = [f"- **Q{n}:** {q.strip()[2:].strip()}" for n, q in enumerate(questions, start=1)]
            # Answers are kept per section and written after all sections
            numbered_answers.append([f"- **A{n}:** {a.strip()[2:].strip()}" for n, a in enumerate(answers, start=1)])
            f.write("\n".join(numbered_questions))
            f.write("\n\n\n")

        # Writing answers
        f.write("## Answers\n")
        for i, key in enumerate(sum_dict.keys(), start=1):
            f.write(f"{i}. {key.capitalize()}\n")
            f.write("\n".join(numbered_answers[i-1]))
            f.write("\n\n")


In [68]:
# Building the output file name from the PDF title. The title is computed outside
# the f-string because reusing the same quote character inside an f-string is a
# syntax error before Python 3.12; '/' is replaced so the title cannot introduce
# extra path components.
md_name = doc.metadata['title'].replace(' ', '_').replace('/', '_')
write_markdown_file(doc, sum_dict, q_dict, f"/home/diomir0/Documents/{md_name}.md")

# Anki card generation 

In [None]:
for key in sum_dict.keys():
    