In [None]:
# SYSTEM PACKAGES (for OCR and PDF image conversion)
# Environment setup cell: run once per fresh runtime before any other cell.
!apt-get install -y tesseract-ocr libtesseract-dev poppler-utils

# PYTHON PACKAGES
# NOTE(review): versions are unpinned, so a re-run at a later date may install
# different releases than the ones this notebook was developed against.
!pip install pdfminer.six python-docx pytesseract pdf2image keybert spacy sentence-transformers transformers
# English spaCy pipeline that a later cell loads with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

# NOTE(review): no visible cell uses nltk or its punkt tokenizer; presumably a
# leftover download - confirm before removing.
import nltk
nltk.download('punkt')

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev poppler-utils
0 upgraded, 4 newly installed, 0 to remove and 35 not upgraded.
Need to get 3,929 kB of archives.
After this operation, 16.7 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.4 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1,600 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 3,929 kB in 2s (2,339 kB/

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import os
import json
import pytesseract
import spacy
import tempfile
from pdfminer.high_level import extract_text as pdf_extract_text
from pdf2image import convert_from_path
from docx import Document
from keybert import KeyBERT
from transformers import pipeline
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

In [None]:
# --- File Extraction Functions ---

def extract_text_from_pdf(file_path):
    """Return the text of a PDF, falling back to OCR when it has no text layer.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text reported by pdfminer when it contains any non-whitespace
        character; otherwise the OCR output of every rendered page,
        concatenated in page order with no separator.
    """
    embedded_text = pdf_extract_text(file_path)
    if not embedded_text.strip():
        # Nothing usable from direct extraction: render each page to an image
        # and run Tesseract over it.
        page_images = convert_from_path(file_path)
        return "".join(pytesseract.image_to_string(page) for page in page_images)
    return embedded_text

def extract_text_from_docx(file_path):
    """Return the paragraph texts of a DOCX file joined by newlines.

    Args:
        file_path: Path of the .docx file to read.

    Returns:
        One string with each entry of the document's ``paragraphs`` on its
        own line.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def extract_text_from_txt(file_path):
    """Read a plain-text file as UTF-8, silently dropping undecodable bytes.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content as a single string.
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents

def extract_text(file_path, file_ext):
    """Dispatch to the extractor that matches a file extension.

    Args:
        file_path: Path of the file to read.
        file_ext: Extension including the leading dot; compared exactly
            against ".pdf", ".docx" and ".txt".

    Returns:
        The extracted text, or an empty string for any other extension.
    """
    readers = {
        ".pdf": extract_text_from_pdf,
        ".docx": extract_text_from_docx,
        ".txt": extract_text_from_txt,
    }
    reader = readers.get(file_ext)
    return reader(file_path) if reader is not None else ""

In [None]:
# Load models
# Each model is created once at module level and shared by the helper
# functions defined below in this cell.
# spaCy pipeline used by extract_entities.
nlp = spacy.load("en_core_web_sm")
# KeyBERT keyword extractor used by extract_keywords.
kw_model = KeyBERT('all-MiniLM-L6-v2')
# Hugging Face summarization pipeline used by summarize_text.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def extract_keywords(text, top_n=5):
    """Return the keyword phrases KeyBERT extracts from ``text``.

    Args:
        text: Document text to analyse.
        top_n: Number of keywords requested from the model.

    Returns:
        The keyword strings in the order the model returned them, with the
        scores discarded.
    """
    scored = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    phrases = []
    for phrase, _score in scored:
        phrases.append(phrase)
    return phrases

def extract_entities(text):
    """Pick the first person, organisation and date spaCy finds in ``text``.

    Args:
        text: Document text to run through the spaCy pipeline.

    Returns:
        A dict with keys "author", "organization" and "publication_date".
        Each holds the text of the first PERSON, ORG and DATE entity
        respectively, or None when no such entity was found.
    """
    label_to_field = {
        "PERSON": "author",
        "ORG": "organization",
        "DATE": "publication_date",
    }
    found = {"author": None, "organization": None, "publication_date": None}
    for entity in nlp(text).ents:
        field = label_to_field.get(entity.label_)
        # Only the first hit per category is kept.
        if field is not None and not found[field]:
            found[field] = entity.text
    return found

def summarize_text(text, max_length=130):
    """Summarise the beginning of ``text`` with the shared summarization pipeline.

    Args:
        text: Document text; only its first 2048 characters are passed on.
        max_length: Upper length bound forwarded to the pipeline.

    Returns:
        The ``summary_text`` field of the first result the pipeline returns.
    """
    # Cap the input size before handing it to the model.
    char_budget = 1024 * 2
    clipped = text[:char_budget]
    results = summarizer(clipped, max_length=max_length, min_length=30, do_sample=False)
    first_result = results[0]
    return first_result['summary_text']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# File upload widget
# Single-file picker restricted to the three extensions extract_text handles.
# handle_upload (defined below) reads the upload from this module-level name.
uploader = widgets.FileUpload(
    accept='.pdf,.docx,.txt',
    multiple=False
)
display(uploader)

def handle_upload(change):
    """Extract metadata from the uploaded file and offer it as a JSON download.

    Intended as an observer of the upload widget's ``value`` trait. For each
    uploaded file it writes the bytes to a temporary file, extracts the text,
    runs keyword / entity / summary extraction, prints the resulting metadata
    as JSON and displays a button that downloads it as ``metadata.json``.

    Args:
        change: Trait-change notification. Unused: the upload is read from
            the module-level ``uploader`` widget.
    """
    clear_output(wait=True)
    display(uploader)
    if not uploader.value:
        return

    for fname, fileinfo in uploader.value.items():
        ext = os.path.splitext(fname)[-1].lower()

        # The extractors need a real path, so the upload is spooled to disk.
        # delete=False keeps the file after it is closed; it is removed
        # explicitly once the text has been read so uploads do not pile up.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
        try:
            with tmp:
                tmp.write(fileinfo['content'])
            text = extract_text(tmp.name, ext)
        finally:
            os.remove(tmp.name)

        # NLP processing
        keywords = extract_keywords(text)
        entities = extract_entities(text)
        summary = summarize_text(text)

        # Prepare metadata; the uploaded file name doubles as the title.
        metadata = {
            "title": fname,
            "keywords": keywords,
            "author": entities["author"],
            "organization": entities["organization"],
            "publication_date": entities["publication_date"],
            "summary": summary
        }
        metadata_str = json.dumps(metadata, indent=2)

        # Display result
        print("Extracted Metadata:")
        print(metadata_str)

        # Download button
        download_widget = widgets.Button(description="Download Metadata JSON")
        out = widgets.Output()

        # Default arguments bind this iteration's payload and output area, so
        # each button keeps its own data even if the loop runs more than once.
        def on_download_clicked(b, payload=metadata_str, out=out):
            with out:
                clear_output()
                # Imported lazily: only available inside Google Colab.
                from google.colab import files
                with open("metadata.json", "w", encoding="utf-8") as f:
                    f.write(payload)
                files.download("metadata.json")

        download_widget.on_click(on_download_clicked)
        display(download_widget, out)

# Run handle_upload whenever the widget's `value` trait changes.
# NOTE(review): a later cell rebinds the module-level names `uploader` and
# `handle_upload`. The handler registered here looks `uploader` up at call
# time, so after that cell runs this widget's callback reads the newer
# widget's value - confirm whether this first UI is still meant to be used.
uploader.observe(handle_upload, names='value')

FileUpload(value={'MARS OPEN PROJECTS 2025.pdf': {'metadata': {'name': 'MARS OPEN PROJECTS 2025.pdf', 'type': …

In [None]:
import time
from ipywidgets import IntProgress, HTML, VBox, HBox, Button, Output
from IPython.display import display, clear_output

# Enhanced File Upload Widget with Visual Feedback
# Rebinds the module-level `uploader` to a new single-file picker.
uploader = widgets.FileUpload(
    accept='.pdf,.docx,.txt',
    multiple=False,
    description='Upload Document'
)

# Status widgets
# status_html shows the current step; progress_bar tracks it on a 0-100 scale.
status_html = HTML(value="<b>📁 Ready to upload a document (PDF, DOCX, or TXT)</b>")
progress_bar = IntProgress(
    value=0,
    min=0,
    max=100,
    description='Progress:',
    bar_style='info',
    style={'bar_color': '#4CAF50'},
    layout={'width': '400px'}
)
# NOTE(review): intended to hide the bar until processing starts, but in
# ipywidgets `display` is a Layout property rather than a style trait, so this
# assignment likely has no visual effect - confirm, and consider
# `progress_bar.layout.display` together with the code that re-shows the bar.
progress_bar.style.display = 'none'

# Processing time display
time_html = HTML(value="")

# Results output area
# Printed results are captured here so they can be cleared between uploads.
results_output = Output()

# Download button (initially hidden)
download_btn = Button(
    description="📥 Download JSON",
    button_style='success',
    layout={'width': '150px'}
)
# NOTE(review): same concern as for the progress bar - this probably does not
# hide the button, so it may be clickable before any metadata exists.
download_btn.style.display = 'none'

# Layout
# Children are stacked vertically in the order listed.
upload_box = VBox([
    HTML("<h3>🔍 Document Metadata Extractor</h3>"),
    uploader,
    status_html,
    progress_bar,
    time_html,
    download_btn,
    results_output
])

display(upload_box)

def update_progress(value, status_text, bar_style='info'):
    """Update the progress bar and the status line.

    Args:
        value: New progress value; the bar is made visible when it is > 0.
        status_text: Text placed, wrapped in ``<b>``, into ``status_html``.
        bar_style: ipywidgets bar style applied to the progress bar.
    """
    progress_bar.value = value
    progress_bar.bar_style = bar_style
    status_html.value = f"<b>{status_text}</b>"
    if value > 0:
        # Widget visibility is a Layout property; assigning to
        # `style.display` is not synced to the front end. Clearing the
        # override (rather than forcing 'block') restores the widget's
        # default rendering if something hid it via `layout.display = 'none'`.
        progress_bar.layout.display = None

def format_time(seconds):
    """Format a duration in seconds as a short human-readable string.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        ``"<s.s> seconds"`` for durations that round to less than a minute,
        otherwise ``"<m>m <s.s>s"``.
    """
    # Round to the displayed precision first. Otherwise a value such as 59.96
    # passes the "< 60" check and is then printed as "60.0 seconds", and
    # 119.96 is printed as "1m 60.0s".
    seconds = round(seconds, 1)
    if seconds < 60:
        return f"{seconds:.1f} seconds"
    minutes, secs = divmod(seconds, 60)
    return f"{int(minutes)}m {secs:.1f}s"

def handle_upload(change):
    """Run the extraction pipeline on the uploaded document and render results.

    Intended as an observer of the upload widget's ``value`` trait. It writes
    the upload to a temporary file, extracts text, keywords, entities and a
    summary, reports each step through ``update_progress``, prints the
    resulting metadata into ``results_output`` and stores it in the
    module-level ``current_metadata`` for the download button.

    Any exception is caught and reported in the status line and the results
    area instead of propagating.

    Args:
        change: Trait-change notification. Unused: the upload is read from
            the module-level ``uploader`` widget.
    """
    global current_metadata

    if not uploader.value:
        return

    start_time = time.time()

    # Clear previous results
    with results_output:
        clear_output()

    # Visibility is a Layout property (assigning `style.display` is not synced
    # to the front end). Hide the download button while processing and make
    # sure the progress bar is visible again after an earlier failure hid it.
    download_btn.layout.display = 'none'
    progress_bar.layout.display = None

    tmp_path = None
    try:
        # Get uploaded file info (single-file widget: take the first entry)
        fname = list(uploader.value.keys())[0]
        fileinfo = uploader.value[fname]
        file_size = len(fileinfo['content'])

        # Show file upload confirmation
        update_progress(10, f"✅ File uploaded: {fname} ({file_size:,} bytes)")
        time.sleep(0.5)

        # Save file to temp location; the extractors need a real path.
        update_progress(20, "💾 Saving file...")
        ext = os.path.splitext(fname)[-1].lower()
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            # Record the path before writing so a failed write is cleaned up too.
            tmp_path = tmp.name
            tmp.write(fileinfo['content'])
        time.sleep(0.3)

        # Extract text
        update_progress(30, "📄 Extracting text from document...")
        text = extract_text(tmp_path, ext)
        if not text.strip():
            raise ValueError("No text could be extracted from the document")

        elapsed = time.time() - start_time
        time_html.value = f"<i>⏱️ Elapsed time: {format_time(elapsed)}</i>"
        time.sleep(0.5)

        # Keyword extraction
        update_progress(50, "🔍 Extracting keywords...")
        keywords = extract_keywords(text)
        time.sleep(0.5)

        # Named entity recognition
        update_progress(70, "👤 Identifying entities (author, organization, dates)...")
        entities = extract_entities(text)
        time.sleep(0.5)

        # Text summarization
        update_progress(85, "📝 Generating summary...")
        summary = summarize_text(text)
        time.sleep(0.5)

        # Prepare metadata
        update_progress(95, "📋 Preparing metadata...")
        word_count = len(text.split())
        character_count = len(text)
        metadata = {
            "title": fname,
            "file_size_bytes": file_size,
            "word_count": word_count,
            "character_count": character_count,
            "keywords": keywords,
            "author": entities["author"],
            "organization": entities["organization"],
            "publication_date": entities["publication_date"],
            "summary": summary,
            "processed_at": datetime.now().isoformat()
        }

        # Complete
        total_time = time.time() - start_time
        update_progress(100, f"✨ Processing complete! ({format_time(total_time)})", 'success')
        time_html.value = f"<i>⏱️ Total processing time: {format_time(total_time)}</i>"

        # Display results
        with results_output:
            print("🎉 EXTRACTION RESULTS")
            print("=" * 50)
            print(f"📄 Document: {fname}")
            print(f"📊 Size: {file_size:,} bytes | Words: {word_count:,} | Characters: {character_count:,}")
            print(f"⏱️ Processing time: {format_time(total_time)}")
            print("\n📋 METADATA (JSON):")
            print("-" * 30)
            print(json.dumps(metadata, indent=2, ensure_ascii=False))

            # Show preview of extracted text
            print(f"\n📖 TEXT PREVIEW (first 300 characters):")
            print("-" * 40)
            print(f"{text[:300]}{'...' if len(text) > 300 else ''}")

        # Store the metadata before revealing the button so a click can never
        # observe the previous document's data.
        current_metadata = metadata
        download_btn.layout.display = None

    except Exception as e:
        update_progress(0, f"❌ Error: {str(e)}", 'danger')
        time_html.value = f"<i>⏱️ Failed after: {format_time(time.time() - start_time)}</i>"
        with results_output:
            print(f"❌ ERROR: {str(e)}")
        progress_bar.layout.display = 'none'

    finally:
        # delete=False leaves the temp file on disk; remove it on every path.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

def download_metadata(b):
    """Write the most recent metadata to a JSON file and trigger a download.

    The file is named ``metadata_<title>.json`` with spaces in the title
    replaced by underscores, written to the current working directory and
    then handed to Colab's ``files.download``. The outcome is reported in
    ``status_html``; failures are caught rather than raised.

    Args:
        b: The button that was clicked (unused).
    """
    # Nothing has been processed yet (or the holder was reset): say so
    # explicitly instead of surfacing a KeyError as "Download failed: 'title'".
    if not current_metadata:
        status_html.value = "<b>⚠️ Nothing to download yet - upload a document first</b>"
        return

    try:
        # Serialize before opening the file so a failure leaves no partial file.
        metadata_str = json.dumps(current_metadata, indent=2, ensure_ascii=False)
        filename = f"metadata_{current_metadata['title'].replace(' ', '_')}.json"

        with open(filename, "w", encoding="utf-8") as f:
            f.write(metadata_str)

        # Imported lazily: only available inside Google Colab.
        from google.colab import files
        files.download(filename)

        # Update status
        status_html.value = f"<b>📥 Downloaded: {filename}</b>"

    except Exception as e:
        status_html.value = f"<b>❌ Download failed: {str(e)}</b>"

# Connect event handlers
# handle_upload runs whenever the upload widget's `value` trait changes;
# download_metadata runs when the download button is clicked.
uploader.observe(handle_upload, names='value')
download_btn.on_click(download_metadata)

# Initialize
# Module-level holder that handle_upload overwrites (via `global`) and
# download_metadata reads. Re-running this cell resets it to empty.
current_metadata = {}

VBox(children=(HTML(value='<h3>🔍 Document Metadata Extractor</h3>'), FileUpload(value={}, accept='.pdf,.docx,.…