In [6]:
# Extracting text from images

from PIL import Image
import pytesseract

pytesseract.pytesseract.tesseract_cmd = (
    r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

img = Image.open('documents/sample.jpg')
img = img.convert('L')

text = pytesseract.image_to_string(img)

print(text.replace("\x0c", "").strip())

A Simple PDF File This is (inde:
a small demonstration .pdf file -

for use in the

just
Virtual Mechanics

tutorials. More text. And more text.

And more text.

And more text. And more

text. And more text. And more text.

And more text.
text. And more
more text. And
text. And more
more text. And
text. And more
And more text.
text. And more

And more text.
nace 2

And more text. And more
text. Boring, zzzzz. And
more text. And more
text. And more text. And
more text. And more
text. And more text.
And more text. And more
text. And more text.
Even more. Continued on


In [None]:
# Image Captioning using BLIP

from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

image = Image.open("documents/sample.jpg")
image = image.convert('L')

inputs = processor(images=image, return_tensors="pt") #type: ignore (pylance error; code works fine)

outputs = model.generate(**inputs) #type: ignore (pylance error; code works fine)
caption = processor.decode(outputs[0], skip_special_tokens=True) #type: ignore (pylance error; code works fine)
print("Generated Caption:", caption)

Generated Caption: a simple text editor


In [15]:
# Combined Text Exraction and Image Captioning (Tesseract + BLIP)
from PIL import Image
import pytesseract
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image


pytesseract.pytesseract.tesseract_cmd = ( r"C:\Program Files\Tesseract-OCR\tesseract.exe" )
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

img = Image.open('documents/sample.jpg')
img = img.convert('L')
inputs = processor(images=img, return_tensors="pt") #type: ignore (pylance error; code works fine)

text = pytesseract.image_to_string(img)
outputs = model.generate(**inputs) #type: ignore (pylance error; code works fine)
caption = processor.decode(outputs[0], skip_special_tokens=True) #type: ignore (pylance error; code works fine)


# Empty dictionary to store the final output
response = {}

response["Text"] = text.replace("\x0c", "").strip()
response["Caption"] = caption

print(response)

{'Text': 'A Simple PDF File This is (inde:\na small demonstration .pdf file -\n\nfor use in the\n\njust\nVirtual Mechanics\n\ntutorials. More text. And more text.\n\nAnd more text.\n\nAnd more text. And more\n\ntext. And more text. And more text.\n\nAnd more text.\ntext. And more\nmore text. And\ntext. And more\nmore text. And\ntext. And more\nAnd more text.\ntext. And more\n\nAnd more text.\nnace 2\n\nAnd more text. And more\ntext. Boring, zzzzz. And\nmore text. And more\ntext. And more text. And\nmore text. And more\ntext. And more text.\nAnd more text. And more\ntext. And more text.\nEven more. Continued on', 'Caption': 'a simple text editor'}


In [None]:
# Extracting texts from PDFs (PyMuPDF)

import fitz

pdf_text_response = []

doc = fitz.open('documents/sample.pdf')
for page in doc:
    text = page.get_text()
    pdf_text_response.append(text)

print(pdf_text_response)

['A Simple PDF File \nThis is a small demonstration .pdf file \njust for use in the Virtual Mechanics tutorials. More text. And more  \ntext. And more text. And more text. And more text. \nAnd more text. And more text. And more text. And more text. And more.  \ntext. And more text. Boring, zzzzz. And more text. And more text. And \nmore text. And more text. And more text. And more text. And more text.  \nAnd more text. And more text. \nAnd more text. And more text. And more text. And more text. And more  \ntext. And more text. And more text. Even more. Continued on page 2 ... \nSimple PDF File 2 \n...continued from page 1. Yet more text. And more text. And more text.  \nAnd more text. And more text. And more text. And more text. And more  \ntext. Oh, how boring typing this stuff. But not as boring as watching  \npaint dry. And more text. And more text. And more text. And more text.  \nBoring. More, a little more text. The end, and just as well. \n']


In [25]:
# Extracting tables from PDFs (PyMuPDF, using Fitz)

import fitz

pdf_table_response = []

doc = fitz.open('documents/sample-report.pdf')
for page in doc:
    tabs = page.find_tables()
        
    for t in tabs:
        tables = t.extract()
        clean_rows = ["\t".join(map(str, row)) for row in tables]
        clean_table = "\n".join(clean_rows)
        pdf_table_response.append(clean_table)


print(pdf_table_response)


['ID\tMetric\tValue\tRemarks\n1\tMetric 1\t70\tValid Data\n2\tMetric 2\t431\tValid Data\n3\tMetric 3\t186\tValid Data\n4\tMetric 4\t489\tValid Data\n5\tMetric 5\t180\tValid Data', 'Marketing Strategy\nTristique non tempus vitae, ornare sed mauris. Etiam\nblandit tempor metus, at vehicula nisi. Maecenas\nsuscipit vulputate varius.\n1st Strategy\nInteger et justo velus.\nUt in ipsum ac risus.\nMaecenas iaculis.\nUt nec mauris vel.\nTellus accumsan.\n2nd Strategy\nInteger et justo velus.\nUt in ipsum ac risus.\nMaecenas iaculis.\nUt nec mauris vel.\nTellus accumsan.\nThis sample PDF file is provided by Sample-Files.com. Visit us for more sample files and resource.\tNone\tg Strategy\nmpus vitae, ornare sed mauris. Etiam\nmetus, at vehicula nisi. Maecenas\nte varius.\n1st Strategy\nNone\t\t\nNone\tNone\t', 'Sales Projections\nTristique non tempus vitae, ornare sed mauris. Etiam\nblandit tempor metus, at vehicula nisi. Maecenas\nsuscipit vulputate varius.\nSeries 1 Series 2\n20\n15\n10\n5\n0

In [31]:
import fitz
import os
import uuid

pdf_imgs_response = []

doc = fitz.open('documents/sample-report.pdf')
img_folder = 'extracted-images'
os.makedirs(img_folder, exist_ok=True)

for page in doc:
    for img in page.get_images(full=True):
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)

        uid = uuid.uuid4().hex
        img_name = f"img_{page.number}_{xref}_{uid}.png"
        img_path = os.path.join(img_folder, img_name)

        if pix.n < 5:
            pix.save(img_path)
        else:
            pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(img_path)

        del pix

        pdf_imgs_response.append({
            "page": page.number,
            "xref": xref,
            "path": img_path
        })

print(pdf_imgs_response)


[{'page': 3, 'xref': 32, 'path': 'extracted-images\\img_3_32_7c8792835c20480c80fff5cba049da2b.png'}, {'page': 3, 'xref': 33, 'path': 'extracted-images\\img_3_33_aa2d831aa8a243bd9c3f644969790707.png'}, {'page': 4, 'xref': 38, 'path': 'extracted-images\\img_4_38_ec9dac347d07460e918dc80680cb534a.png'}, {'page': 5, 'xref': 43, 'path': 'extracted-images\\img_5_43_e898d80981b04ca19fb73496fe1bc381.png'}, {'page': 5, 'xref': 44, 'path': 'extracted-images\\img_5_44_5cf85e2b519a401e82fb20a6017ea991.png'}]


In [None]:
# Combining the Text Extraction, Table Extraction, and Image Extraction from PDF

import fitz
import os
import uuid

pdf_text_response = []
pdf_table_response = []
pdf_imgs_response = []

doc = fitz.open('documents/sample-report.pdf')
for page in doc:

    # Getting the text
    text = page.get_text()
    pdf_text_response.append(text)

    # Getting the tables
    tabs = page.find_tables() #type:ignore
    for t in tabs:
        tables = t.extract()
        clean_rows = ["\t".join(map(str, row)) for row in tables]
        clean_table = "\n".join(clean_rows)
        pdf_table_response.append(clean_table)

    # Getting the images
    for page in doc:
        for img in page.get_images(full=True):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)

            uid = uuid.uuid4().hex
            img_name = f"img_{page.number}_{xref}_{uid}.png"
            img_path = os.path.join(img_folder, img_name)

            if pix.n < 5:
                pix.save(img_path)
            else:
                pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(img_path)

            del pix

            pdf_imgs_response.append({
                "page": page.number,
                "xref": xref,
                "path": img_path
            })

pdf_response = {
    'text' : pdf_text_response,
    'tables' : pdf_table_response,
    'images' : pdf_imgs_response
}

print(pdf_response)

{'text': ['Sample Team\nPrepared By\nsample-files.com\nMulti-Page\nReport\n“A comprehensive and content-heavy report that\nincludes text, images, and tables for thorough\ntesting of pagination and complex layouts.”\n', 'Table of Contents\n1. Introduction\n2. Market Analysis\n3. Data Analysis\n4. Product Overview\n5. Results & Discussion\n6. Marketing Strategy\n7. Sales Projections\n8. Launch Timeline\nThis sample PDF file is provided by Sample-Files.com. Visit us for more sample files and resource.\n', 'Introduction\nThis section introduces the report and highlights the\nkey objectives. The purpose of this report is to analyze\ndata, evaluate outcomes, and provide insights for\nfuture decisions. This analysis is based on various data\nsources that include quantitative and qualitative\ninputs. Note: Add an image here illustrating the\nconcept of data analysis or research methodology. \nID\nMetric\nValue\nRemarks\n1\nMetric 1\n70\nValid Data\n2\nMetric 2\n431\nValid Data\n3\nMetric 3\n18

In [None]:
# Extracting texts from Audio (MP3, WAV, etc. files, using SpeechRecognition and Pydub (for extracting large audio files))

import speech_recognition as sr

filename = 'documents/sample.wav'

r = sr.Recognizer()

with sr.AudioFile(filename) as source:
    audio_data = r.record(source)
    text = r.recognize_google(audio_data) #type: ignore (pylance error; code works fine)
    print(text)

I believe you are just talking nonsense


In [10]:
# Continued code for audio extraction, but for larger audio files, taken from the documentation (https://thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python)

# importing libraries 
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence

r = sr.Recognizer()

def transcribe_audio(path):
    with sr.AudioFile(path) as source:
        audio_listened = r.record(source)
        text = r.recognize_google(audio_listened) #type: ignore (pylance error; code works fine)
    return text


def get_large_audio_transcription_on_silence(path):
    """Splitting the large audio file into chunks
    and apply speech recognition on each of these chunks"""
    sound = AudioSegment.from_file(path)  
    # split audio sound where silence is 500 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
        # experiment with this value for your target audio file
        min_silence_len = 700,
        # adjust this per requirement
        silence_thresh = sound.dBFS-14,
        # keep the silence for 1 second, adjustable as well
        keep_silence=700,
    )
    folder_name = "audio-chunks"
    # create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    # process each chunk 
    for i, audio_chunk in enumerate(chunks, start=1):
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        try:
            text = transcribe_audio(chunk_filename)
        except sr.UnknownValueError as e:
            print("Error:", str(e))
        else:
            text = f"{text.capitalize()}. "
            print(chunk_filename, ":", text)
            whole_text += text
    return whole_text


get_large_audio_transcription_on_silence('documents/sample.wav')

audio-chunks\chunk1.wav : I believe you are just talking nonsense. 


'I believe you are just talking nonsense. '