# PDF renamer with Llama 2 (Python)

This Jupyter Notebook is meant to be used in Google Colab, but can be easily run locally.  
The aim of this Jupyter Notebook is to rename PDFs based on the title contained inside its own text. Especially regarding scientific publications.

Based on a previous notebook:

## Setting up Llama 2

In [1]:
!pip install llama-cpp-python==0.1.78

Collecting llama-cpp-python==0.1.78
  Downloading llama_cpp_python-0.1.78.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python==0.1.78)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.1.78-cp310-cp310-linux_x86_64.whl size=296587 sha256=9f4925a8af27fb87c1aff3da0c77eef282bed4a188051074c222c80281411267
  Stored in directory: /root/.cache/pip/wheels/61/f9/20/9

If something goes wrong here or later, please check the most up-to-date version of llama-cpp-python at the following link:

https://pypi.org/project/llama-cpp-python/

In [2]:
from llama_cpp import Llama

from IPython.display import display, HTML
import json
import time
import pathlib

In [3]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=9f29170c0ab8614e04a23d853deea831afa1b16bccf6b28b85a24532f2991f6c
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [4]:
import wget

url = 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/resolve/main/llama-2-7b-chat.ggmlv3.q8_0.bin'
local_path = '/content/'

wget.download(url, local_path)

'/content//llama-2-7b-chat.ggmlv3.q8_0.bin'

In [5]:
MODEL_Q8_0 = Llama(
    model_path="/content/llama-2-7b-chat.ggmlv3.q8_0.bin",
    n_ctx=2048)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


## Creation of functions related to reading PDFs

In [6]:
import re

def extract_text_between_quotes(input_string):
    pattern1 = r'"(.*?)"'
    pattern2 = r"'(.*?)'"
    pattern3 = r'(.*?)\sB:'

    match = re.search(pattern1, input_string)
    if match:
        return match.group(1)
    else:
        match = re.search(pattern2, input_string)
        if match:
            return match.group(1)
        else:
            match = re.search(pattern3, input_string)
            if match:
                return match.group(1)
            else:
                return None

In [7]:
def query(model, question):
    model_name = pathlib.Path(model.model_path).name
    time_start = time.time()
    prompt = f"Q: {question} A:"
    output = model(prompt=prompt, max_tokens=0) # if max tokens is zero, depends on n_ctx
    response = output["choices"][0]["text"]
    time_elapsed = time.time() - time_start
    #display(HTML(f'<code>{model_name} response time: {time_elapsed:.02f} sec</code>'))
    #display(HTML(f'<strong>Question:</strong> {question}'))
    display(HTML(f'<strong>Answer:</strong> {response}'))
    #print(json.dumps(output, indent=2))

    response = extract_text_between_quotes(response)
    return response

In [8]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/232.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [9]:
import PyPDF2
from PyPDF2 import PdfReader

#text extraction
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        pdf_reader = PdfReader(file)
        full_text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            full_text += page.extract_text()
    return full_text

#title extraction from metadata
def get_pdf_title(pdf_path):
    title = None
    with open(pdf_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        if pdf_reader.metadata is not None:
            title = pdf_reader.metadata.title
    return title

## Testing with manually-labelled samples

In [10]:
import os

def get_pdf_files(directory):
    pdf_files = []
    for file in os.listdir(directory):
        if file.endswith('.pdf'):
            pdf_files.append(os.path.join(directory, file))
    return pdf_files

def process_pdfs_in_directory(directory):
    pdf_files = get_pdf_files(directory)
    for pdf_file in pdf_files:
        text = extract_text_from_pdf(pdf_file)
        text = text[:300]
        title = query(MODEL_Q8_0, "Based on the initial 300 characters of a research paper, determine the title of the paper. Follow the scheme: The title is '...'. Here's the text snippet: '" + text + "'")
        truetitle = get_pdf_title(pdf_file)
        print(f"PDF: {pdf_file} \nTitle: {title}\nTrue Title: {truetitle}\n")
        print('---------------------------------------------------')

In [11]:
# Manually-labelled PDFs ZIP download
main_path = '/content/'

url1 = "https://github.com/diegopastorbonet/PDF-Renamer-with-Llama2/blob/main/PDFs_Test_1.zip?raw=true"
url2 = "https://github.com/diegopastorbonet/PDF-Renamer-with-Llama2/blob/main/PDFs_Test_2.zip?raw=true"
url3 = "https://github.com/diegopastorbonet/PDF-Renamer-with-Llama2/blob/main/PDFs_Test_3.zip?raw=true"
url4 = "https://github.com/diegopastorbonet/PDF-Renamer-with-Llama2/blob/main/PDFs_Test_4.zip?raw=true"
url5 = "https://github.com/diegopastorbonet/PDF-Renamer-with-Llama2/blob/main/PDFs_Test_5.zip?raw=true"

wget.download(url1, main_path)
wget.download(url2, main_path)
wget.download(url3, main_path)
wget.download(url4, main_path)
wget.download(url5, main_path)

'/content//PDFs_Test_5.zip'

In [12]:
import zipfile
import os

# Creation of test PDFs directory
output_path = '/content/PDFs/'
if not os.path.exists(output_path):
    os.makedirs(output_path)

# ZIP files directory
zip_directory = main_path

# Listing every file in the directory
zip_files = [f for f in os.listdir(zip_directory) if f.endswith('.zip')]

# Iteration over ZIPs and PDF extraction
for zip_file in zip_files:
    with zipfile.ZipFile(os.path.join(zip_directory, zip_file), 'r') as zip_ref:
        zip_ref.extractall(output_path)
    print("Archivos PDF extraídos de", zip_file)

    # Erasing original ZIP file
    os.remove(os.path.join(zip_directory, zip_file))
    print("Archivo ZIP original eliminado:", zip_file)

print("Archivos PDF extraídos en:", output_path)

Archivos PDF extraídos de PDFs_Test_1.zip
Archivo ZIP original eliminado: PDFs_Test_1.zip
Archivos PDF extraídos de PDFs_Test_4.zip
Archivo ZIP original eliminado: PDFs_Test_4.zip
Archivos PDF extraídos de PDFs_Test_3.zip
Archivo ZIP original eliminado: PDFs_Test_3.zip
Archivos PDF extraídos de PDFs_Test_5.zip
Archivo ZIP original eliminado: PDFs_Test_5.zip
Archivos PDF extraídos de PDFs_Test_2.zip
Archivo ZIP original eliminado: PDFs_Test_2.zip
Archivos PDF extraídos en: /content/PDFs/


In [13]:
pdf_directory = '/content/PDFs/'

process_pdfs_in_directory(pdf_directory)

PDF: /content/PDFs/iannaccone1984.pdf 
Title: Reviewing the Reviews on Literacy and Reasoning.
True Title: Reviewing the Reviews on Literacy and Reasoning: Some Selected Themes and References

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/abadal2020.pdf 
Title: Programmable Metamaterials for Software-Defined Electromagnetic Control
True Title: Guest Editorial: Programmable Metamaterials for Software-Defined Electromagnetic Control

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/abdelrahman2020.pdf 
Title: Commentary on 'Voided volume for postoperative voiding assessment following prolapse and urinary incontinence surgery.'
True Title: Commentary on “Voided volume for postoperative voiding assessment following prolapse and urinary incontinence surgery”

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/ullah2021.pdf 
Title: Predictors and 3-year outcomes of compromised left circumflex coronary artery after left main crossover stenting
True Title: Predictors and 3‐year outcomes of compromised left circumflex coronary artery after left main crossover stenting

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/marei2021.pdf 
Title: None
True Title: Current progress in chimeric antigen receptor T cell therapy for glioblastoma multiforme

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/riggs2013.pdf 
Title: Chromosomal microarray impacts clinical management
True Title: Chromosomal microarray impacts clinical management

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1016@j.coldregions.2019.04.004.pdf 
Title: None
True Title: Probability-based modeling and wind tunnel test of snow distribution on a stepped flat roof

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/penna2020.pdf 
Title: Impact of Resection Volume/Stapler Firings-Ratio on Perioperative Complications and Weight Loss After Laparoscopic Sleeve Gastrectomy
True Title: Impact of Resection Volume/Stapler Firings-Ratio on Perioperative Complications and Weight Loss After Laparoscopic Sleeve Gastrectomy

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/aursand2019.pdf 
Title: Comparison of Kinetic Theory Evaporation Models for Liquid Thin-Films
True Title: Comparison of kinetic theory evaporation models for liquid thin-films

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/haase2020.pdf 
Title: Development of Pharmacy Practice Faculty and Preceptor Skills
True Title: Pharmacy practice faculty and preceptor development

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/manuilov2019.pdf 
Title: Douglas Factorization Theorem Revisited
True Title: Douglas factorization theorem revisited

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/judge2018.pdf 
Title: The Effectiveness of
True Title: A Suture-Button Technique for Stabilization of the Plantar Plate and Lesser Metatarsophalangeal Joint

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1016@j.matlet.2019.127063.pdf 
Title: Manipulating the particle distribution of in situ TiB2p/Al composites via acoustic vibration and cooling rate
True Title: Manipulating the particle distribution of in situ TiB2p/Al composites via acoustic vibration and cooling rate

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/hosseinpoor2018.pdf 
Title: None
True Title: Monitoring health inequality in Indonesia

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/lei2019.pdf 
Title: Effect of bulk nanobubbles on the entrainment of kaolinite particles in flotation
True Title: Effect of bulk nanobubbles on the entrainment of kaolinite particles in flotation

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/chen2020.pdf 
Title: Association between Pro12Ala polymorphism and albuminuria in type 2 diabetic nephropathy
True Title: Association between Pro12Ala polymorphism and albuminuria in type 2 diabetic nephropathy

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/huang2014.pdf 
Title: Cardiac-speciﬁc Traf2 overexpression enhances cardiac hypertrophy through activating AKT/GSK3β signaling
True Title: Cardiac-specific Traf2 overexpression enhances cardiac hypertrophy through activating AKT/GSK3β signaling

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1038@s41563-020-0608-9.pdf 
Title:  Investigating the potential of graphene-based composites for energy storage and conversion applications
True Title: Quantum dynamics of a single molecule magnet on superconducting Pb(111)

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/lou2020.pdf 
Title: The Effects of ppb-level Chloride on Stress Corrosion Cracking of Low Alloy Steels in High-Temperature Water
True Title: Mechanical and metallurgical considerations on the effects of ppb-level chloride on stress corrosion cracking of low alloy steels in high-temperature water

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/haygood2012.pdf 
Title: Evaluating the Safety and Efficacy of Intravenous Selenium Infusions in Critically Ill Patients
True Title: Review of Intravenous Selenium Infusions for the Critically III Patient

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/garson1905.pdf 
Title: FINGER-PRINT EVIDENCE
True Title: Finger-Print Evidence

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/ismar2020.pdf 
Title: Thermal stabilization of poly(acrylonitrile-co-itaconic acid) nano fibers as carbon nano fiber precursor
True Title: Thermal stabilization of poly(acrylonitrile-co-itaconic acid) nanofibers as carbon nanofiber precursor

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1007@s00277-020-03916-8.pdf 
Title: None
True Title: M2-like polarization of THP-1 monocyte-derived macrophages under chronic iron overload

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/vainio2019.pdf 
Title: The Effects of Climate Change on the Distribution and Abundance of...
True Title: Stuff Matters, Especially When You Risk “Everything” for It

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/xu2020.pdf 
Title: Melatonin ameliorates pressure overload-induced cardiac hypertrophy
True Title: Melatonin ameliorates pressure overload-induced cardiac hypertrophy by attenuating Atg5-dependent autophagy and activating the Akt/mTOR pathway

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/yanke2002.pdf 
Title: Phalanges
True Title: Vascularity of a Tissue-Engineered Model of Human Phalanges

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/mcgrath2015.pdf 
Title: Patients' Desire to Access Metropolitan Hospital Expertise in Haematology.
True Title: Haematology patients&rsquo; desire to access metropolitan hospital expertise

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1080@02626667.2020.1791336.pdf 
Title: Assessing the Impacts of Land Cover and Climate on Runoff
True Title: Assessing the impacts of land cover and climate on runoff and sediment yield of a river basin

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1080@07399330050130287.pdf 
Title: Health Care for Women International: An Introduction
True Title: TRANSFORMING THE EXHAUSTING TO ENERGIZING PROCESS OF BEING A GOOD PARENT IN THE FACE OF CANCER

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/newton2020.pdf 
Title: Risk and Protective Factors and Processes for Behavioral Sleep Problems among Preschool and Early School-Aged Children: A Systematic Review
True Title: Risk and protective factors and processes for behavioral sleep problems among preschool and early school-aged children: A systematic review

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/kargar2019.pdf 
Title: Water flow modeling through a graphene-based nanochannel: theory and simulation
True Title: Water flow modeling through a graphene-based nanochannel: theory and simulation

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/creinin2019.pdf 
Title: None
True Title: Mifepristone antagonization requires real studies to evaluate safety and efficacy

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1016@j.mssp.2020.105523.pdf 
Title: Cobalt Ferrite Nanoparticles and Nanostructures for Semiconductor Applications
True Title: Cobalt ferrite nanoparticles and nanocomposites: Photocatalytic, antimicrobial activity and toxicity in water treatment

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/okita2016.pdf 
Title: MHC class I chain-related molecule A and B expression is upregulated by cisplatin and associated with good prognosis in patients with non-small cell lung cancer
True Title: MHC class I chain-related molecule A and B expression is upregulated by cisplatin and associated with good prognosis in patients with non-small cell lung cancer

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/hayashidani2005.pdf 
Title: None
True Title: Differences in Heat Resistance among Pathogenic Yersinia enterocolitica Depended on Growth Temperature and Serotype

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/panarese2020.pdf 
Title: A Multifactorial Model of Youth Risk-taking in the Italian Context.
True Title: Youth Risk-taking and Leisure: A Multifactorial Model in the Italian Context

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/nie2020.pdf 
Title: Exosomal long non-coding RNAs: Emerging players in cancer metastasis and potential diagnostic biomarkers for personalized oncology
True Title: Exosomal long non-coding RNAs: Emerging players in cancer metastasis and potential diagnostic biomarkers for personalized oncology

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1007@s10570-019-02516-8.pdf 
Title: Microwave-assisted formic acid extraction for high-purity cellulose production.
True Title: Microwave-assisted formic acid extraction for high-purity cellulose production

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1016@j.yofte.2019.04.004.pdf 
Title: Multilayered evanescent wave absorption-based fluoride fibersensor with 2D materials and amorphous silicon layers for enhanced sensitivity and resolution in near-infrared
True Title: Multilayered evanescent wave absorption based fluoride fiber sensor with 2D material and amorphous silicon layers for enhanced sensitivity and resolution in near infrared

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1002@ange.202003093.pdf 
Title: Acid-Base Interaction Enhancing Oxygen Tolerance in Electrocatalytic Carbon Dioxide Reduction
True Title: Acid‐Base Interaction Enhancing Oxygen Tolerance in Electrocatalytic Carbon Dioxide Reduction

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/mehta2020.pdf 
Title: Prevention of acute graft-versus-host-disease by Withaferin a via suppression of AKT/mTOR pathway
True Title: Prevention of acute graft-versus-host-disease by Withaferin a via suppression of AKT/mTOR pathway

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/10.1038@s41567-019-0478-8.pdf 
Title: Qubit State Measurements in Quantum Computation.
True Title: Stern–Gerlach detection of neutral-atom qubits in a state-dependent optical lattice

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/wang2020.pdf 
Title: Anisotropic nanogenerator for anticounterfeiting and information encrypted transmission
True Title: Anisotropic nanogenerator for anticounterfeiting and information encrypted transmission

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/wang2021.pdf 
Title: A Bayesian inverse approach to measure the anisotropic plasticity properties of materials using spherical indentation experiment
True Title: A Bayesian inverse approach to measure the anisotropic plasticity properties of materials using spherical indentation experiment

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/goldberg2000.pdf 
Title: A: The impact of Z on W'
True Title: Laser Treatment Of Vascular Lesions

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/ahmedjuma2018.pdf 
Title: The Importance of Digital Libraries and Requirements - Case Study: The National Center for Research.
True Title: The Importance of Digital Libraries and Requirements - Case Study: The National Center for Research, Initiative to Establish the Sudanese Digital Library

---------------------------------------------------


Llama.generate: prefix-match hit


PDF: /content/PDFs/gelhausen2020.pdf 
Title: Forecasting Future Air Traffic Development Up to 2040 and Assessing Mitigation Strategies
True Title: Part III. Forecasting future air traffic development up to 2040 and assessing mitigation strategies

---------------------------------------------------


## Use on own PDF

In [None]:
from google.colab import files

print("Please upload your PDF:")
uploaded_file = files.upload()

In [None]:
def process_own_pdf(pdf_file):
      text = extract_text_from_pdf(pdf_file)
      text = text[:300]
      title = query(MODEL_Q8_0, "Based on the initial 300 characters of a research paper, determine the title of the paper. Follow the scheme: The title is '...'. Here's the text snippet: '" + text + "'")
      print("Title: " + title)
      print('---------------------------------------------------')

In [None]:
process_own_pdf("your PDF rute")