In [1]:
import pdfplumber
from PyPDF2 import PdfReader
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from collections import Counter

In [2]:
def process_pdf(file_path):
    content = []
    try:
        with pdfplumber.open(file_path) as pdf:
            for page_num, page in enumerate(pdf.pages):
                rotation_angle = page.rotation
                if rotation_angle in [90, 270]:
                    print(f"Page {page_num + 1} is rotated by {rotation_angle} degrees")

                text = page.extract_text()
                content.append({"page": page_num + 1, "text": text or "No text found"})
    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        content = [{"error": str(e)}]
    
    return file_path, content

In [3]:
def process_all_pdfs(folder_path, num_threads=8):
    results = {}
    pdf_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_file = {executor.submit(process_pdf, file): file for file in pdf_files}

        for future in as_completed(future_to_file):
            file_path, content = future.result()
            results[os.path.basename(file_path)] = content

    return results

In [4]:
folder_path = (r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Publication")

In [5]:
all_pdf_results = process_all_pdfs(folder_path)

❌ Error processing C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Publication\2017.pdf: No /Root object! - Is this really a PDF?
❌ Error processing C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Publication\2019 in Review.pdf: No /Root object! - Is this really a PDF?
Page 58 is rotated by 90 degrees
Page 72 is rotated by 90 degrees
Page 73 is rotated by 90 degrees
Page 74 is rotated by 90 degrees
Page 75 is rotated by 90 degrees
❌ Error processing C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Publication\A Review of Current Priorities and Emerging Issues in European Waste Policy.pdf: No /Root object! - Is this really a PDF?
Page 110 is rotated by 90 degrees
Page 111 is rotated by 90 degrees
Page 112 is rotated by 90 degrees
Page 119 is rotated by 90 degrees
Page 120 is rotated by 90 degrees
Page 121 is rotated by 90 degrees
Page 123 is rotated by 90 degrees
Page 1

In [6]:
print(len(all_pdf_results))

518


In [7]:
data = []

for doc_name, pages in all_pdf_results.items():
    for page in pages:
        if 'text' in page and isinstance(page['text'], str) and page['text'].strip():  # Skip empty or non-text pages
            # Add row to the data list with document name, page number, and text
            data.append({
                'Document Name': doc_name,
                'Page Number': page['page'],
                'Text': page['text'].replace('\n', ' ')  # Replace newlines with spaces
            })

df = pd.DataFrame(data)

In [8]:
print(df.head())
print(len(df))

                                      Document Name  Page Number  \
0        2023 Government of Ireland Village map.pdf            1   
1  2021-2023 Thematic Research Areas Assessment.pdf            1   
2  2021-2023 Thematic Research Areas Assessment.pdf            2   
3  2021-2023 Thematic Research Areas Assessment.pdf            3   
4  2021-2023 Thematic Research Areas Assessment.pdf            4   

                                                Text  
0  Block 4 National Ploughing Championships 2023 ...  
1  EPA Research 2030 2021-2023 Thematic Research ...  
2  Contents Context ................................  
3  Context Ireland is faced with new opportunitie...  
4  EPA Research 2030 EPA Research 2030 is the ten...  
44161


In [9]:
client = MongoClient('mongodb://localhost:27017/')
db = client['publications']
collection = db['term_matrix_complete'] 

data = df.to_dict(orient='records')
collection.insert_many(data)

InsertManyResult([ObjectId('679cb0e0f166a14143d82246'), ObjectId('679cb0e0f166a14143d82247'), ObjectId('679cb0e0f166a14143d82248'), ObjectId('679cb0e0f166a14143d82249'), ObjectId('679cb0e0f166a14143d8224a'), ObjectId('679cb0e0f166a14143d8224b'), ObjectId('679cb0e0f166a14143d8224c'), ObjectId('679cb0e0f166a14143d8224d'), ObjectId('679cb0e0f166a14143d8224e'), ObjectId('679cb0e0f166a14143d8224f'), ObjectId('679cb0e0f166a14143d82250'), ObjectId('679cb0e0f166a14143d82251'), ObjectId('679cb0e0f166a14143d82252'), ObjectId('679cb0e0f166a14143d82253'), ObjectId('679cb0e0f166a14143d82254'), ObjectId('679cb0e0f166a14143d82255'), ObjectId('679cb0e0f166a14143d82256'), ObjectId('679cb0e0f166a14143d82257'), ObjectId('679cb0e0f166a14143d82258'), ObjectId('679cb0e0f166a14143d82259'), ObjectId('679cb0e0f166a14143d8225a'), ObjectId('679cb0e0f166a14143d8225b'), ObjectId('679cb0e0f166a14143d8225c'), ObjectId('679cb0e0f166a14143d8225d'), ObjectId('679cb0e0f166a14143d8225e'), ObjectId('679cb0e0f166a14143d822

In [19]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from pymongo import MongoClient
import pandas as pd

client = MongoClient('mongodb://localhost:27017/')
db = client['publications']
collection = db['term_matrix_complete']

def debug_text_and_vectorizer():
    docs = list(collection.find({}, {"_id": 1, "Text": 1}))
    if not docs:
        print("❌ No documents found.")
        return

    df = pd.DataFrame(docs)
    df["Text"] = df["Text"].str.lower().str.replace(r"[^a-z\s]", "", regex=True)
    
    print("\n🔍 Sample Text:")
    for text in df["Text"].head(3):
        print(f"📄 {text[:300]}...")

    all_text = " ".join(df["Text"])
    word_counts = Counter(all_text.split())
    filtered_terms = [word.lower() for word, count in word_counts.items() if count > 0 and word.isalpha()]
    print("Sample Vocabulary:", list(filtered_terms)[:20])
    if not filtered_terms:
        print("❌ No frequent terms found.")
        return

    print(f"\n✅ Vocabulary Size: {len(filtered_terms)}")
    print(f"📝 Sample Terms: {filtered_terms[:20]}")
    
    df["Text"] = df["Text"].str.lower()
    vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True, max_df=0.7, vocabulary=filtered_terms, lowercase=False)
    
    try:
        X_test = vectorizer.fit_transform(df["Text"].head(3))
        print("\n📊 Sample TDM Output:")
        print(X_test.toarray())

        if (X_test.toarray() == 0).all():
            print("⚠️ Matrix contains only zeros!")
    except Exception as e:
        print(f"❌ Vectorization error: {e}")

    print("\n✅ Debugging complete.")


In [20]:
debug_text_and_vectorizer()


🔍 Sample Text:
📄 block  national ploughing championships   government of ireland village irish coast guard block  block   civil     defence scan here to find global culture were taking gsi row   national out more about the ireland archives creativi ty climate action drill rig innovation government of exhibition and ...
📄 epa research   thematic research areas assessment putting science and innovation at the centre of environmental protection in ireland through the development and proactive transfer of knowledge march  the epa research programme is a government of ireland initiative funded by the department of the en...
📄 contents context    research areas for the research hub on addressing the climate change evidence needs    research areas for the research hub on facilitating a green and circular economy    research areas for the research hub on delivering a healthy environment    research areas for the research hu...
Sample Vocabulary: ['block', 'national', 'ploughing', 'championshi

# TFIDF Vectorizer - Term Document Matrix

In [30]:
client = MongoClient('mongodb://localhost:27017/')
db = client['publications']
collection = db['term_matrix_complete']

def create_tdm_for_document(doc_id, text_data, vectorizer, vocabulary):
    """Create Term-Document Matrix for a single document and update MongoDB (only if not all zeros)."""
    try:
        X = vectorizer.transform([text_data]).toarray().flatten()
        term_scores = [(vocabulary[i], score) for i, score in enumerate(X) if score > 0]

        if not term_scores:
            print(f"⚠️ Skipping document {doc_id} (TDM contains only zeros)")
            return doc_id, "⚠️ Skipped (All Zeros)"

        collection.update_one({"_id": doc_id}, {"$set": {"tdm": term_scores}})
        return doc_id, "✅ Updated"

    except Exception as e:
        print(f"❌ Error processing document {doc_id}: {e}")
        return doc_id, "❌ Failed"

def process_and_update_tdm(num_threads=6):
    """Fetches data from MongoDB, creates TDM, and updates MongoDB."""
    
    docs = list(collection.find({}, {"_id": 1, "Text": 1}))
    if not docs:
        print("❌ No documents found in MongoDB.")
        return

    df = pd.DataFrame(docs)
    if "Text" not in df.columns or df["Text"].isna().all():
        print("❌ No valid 'Text' data found in MongoDB.")
        return

    df["Text"] = df["Text"].str.lower().str.replace(r"[^a-z\s]", "", regex=True)
    vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True, max_df=0.7)
    X = vectorizer.fit_transform(df["Text"])
    vocabulary = vectorizer.get_feature_names_out()

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {
            executor.submit(create_tdm_for_document, row["_id"], row["Text"], vectorizer, vocabulary): row["_id"]
            for _, row in df.iterrows()
        }

        for future in as_completed(futures):
            doc_id, status = future.result()
            print(f"Document {doc_id}: {status}")

    print("✅ Term-Document Matrix updated in MongoDB.")

In [None]:
process_and_update_tdm()

Document 679cb0e0f166a14143d82296: ✅ Updated
Document 679cb0e0f166a14143d82255: ✅ Updated
Document 679cb0e0f166a14143d82256: ✅ Updated
Document 679cb0e0f166a14143d8228f: ✅ Updated
Document 679cb0e0f166a14143d82261: ✅ Updated
Document 679cb0e0f166a14143d8229a: ✅ Updated
Document 679cb0e0f166a14143d8227c: ✅ Updated
Document 679cb0e0f166a14143d822a9: ✅ Updated
Document 679cb0e0f166a14143d822a3: ✅ Updated
Document 679cb0e0f166a14143d822aa: ✅ Updated
Document 679cb0e0f166a14143d8227b: ✅ Updated
Document 679cb0e0f166a14143d82246: ✅ Updated
Document 679cb0e0f166a14143d82285: ✅ Updated
Document 679cb0e0f166a14143d8224f: ✅ Updated
Document 679cb0e0f166a14143d8224d: ✅ Updated
Document 679cb0e0f166a14143d82270: ✅ Updated
Document 679cb0e0f166a14143d82279: ✅ Updated
Document 679cb0e0f166a14143d822a6: ✅ Updated
Document 679cb0e0f166a14143d822a5: ✅ Updated
Document 679cb0e0f166a14143d8226b: ✅ Updated
Document 679cb0e0f166a14143d82265: ✅ Updated
Document 679cb0e0f166a14143d8225c: ✅ Updated
Document 6

In [32]:
client = MongoClient("mongodb://localhost:27017/")
db = client["publications"]
collection = db["term_matrix_complete"]

records = collection.find().limit(1)

for record in records:
    print(record)

{'_id': ObjectId('679cb0e0f166a14143d82246'), 'Document Name': '2023 Government of Ireland Village map.pdf', 'Page Number': 1, 'Text': 'Block 4 National Ploughing Championships 2023 396 Government of Ireland Village Irish Coast Guard Block 2 Block 3 378 Civil 359 360 361 361 Defence Scan here to find Global Culture, We’re Taking GSI ROW 2 3 National out more about the Ireland Archives Creativi ty, Climate Action Drill Rig Innovation Government of Exhibition and Sport Ireland Village 362 An Garda Creative Wall Síochána 343 - 347 348 349 350 Our Rural Future: Irish Mobile 351 Health and Supporting People Defence Library Justice Sector ROW 22 Forces Safety Authority and Communities Supporting Victims of Crime Climbing Wall 335 336 337 338 339 Department of Agriculture, Putting Bord Bia Public Jobs, Irish Food Apprenticeships, You First ROW 21 Teagasc Food and the Marine Board Literacy and Supported Training by Citizens Information Board 339 Nature and Heritage Laois National 339 Directora

In [37]:
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')
db = client['publications']
collection = db['term_matrix_complete']

def get_top_documents_for_term(term, top_n=5):
    """Retrieve top N documents where the given term has the highest TF-IDF score."""
    
    docs = collection.find({"tdm": {"$exists": True}}, {"_id": 1, "tdm": 1})
    
    doc_term_frequencies = []
    for doc in docs:
        if not isinstance(doc.get("tdm"), list):
            continue  # Skip documents where TDM is not a list
        
        # Extract term score safely
        term_score = None
        for entry in doc["tdm"]:
            if isinstance(entry, list) and len(entry) == 2:  # Ensure valid (word, score) structure
                word, score = entry
                if word == term:
                    term_score = score
                    break  # No need to check further once we find the term

        if term_score is not None:
            doc_term_frequencies.append((doc["_id"], term_score))
    
    return sorted(doc_term_frequencies, key=lambda x: x[1], reverse=True)[:top_n]

# Example usage
top_docs = get_top_documents_for_term("dairy", top_n=5)
print(top_docs)


[(ObjectId('679cb1c3f166a14143d8b7c3'), 0.4863064196225603), (ObjectId('679cb0e0f166a14143d82337'), 0.35161768342009897), (ObjectId('679cb0e0f166a14143d893b6'), 0.3056441907886277), (ObjectId('679cb1c3f166a14143d8c575'), 0.3011029609788694), (ObjectId('679cb0e0f166a14143d86084'), 0.29931108770787884)]


# TF Vectorizer - Term Document Matrix

In [2]:
from pymongo import MongoClient
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

client = MongoClient('mongodb://localhost:27017/')
db = client['publications']
collection = db['term_matrix_complete']

def create_tf_matrix(doc_id, text_data, tf_vectorizer):
    X_tf = tf_vectorizer.transform([text_data])
    tf_matrix = [(term, freq) for term, freq in zip(tf_vectorizer.get_feature_names_out(), X_tf.toarray().tolist()[0]) if freq > 1]
    if tf_matrix:
        collection.update_one({"_id": doc_id}, {"$set": {"tf_matrix": tf_matrix}})

def process_and_update_tf(num_threads=6):
    docs = list(collection.find({}, {"_id": 1, "Text": 1}))
    if not docs:
        return
    df = pd.DataFrame(docs)
    df["Text"] = df["Text"].str.lower().str.replace(r"[^a-z\s]", "", regex=True)
    tf_vectorizer = CountVectorizer(stop_words=None)
    tf_vectorizer.fit(df["Text"])
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        for _, row in df.iterrows():
            executor.submit(create_tf_matrix, row["_id"], row["Text"], tf_vectorizer)


In [3]:
process_and_update_tf()

In [4]:
client = MongoClient("mongodb://localhost:27017/")
db = client["publications"]
collection = db["term_matrix_complete"]

records = collection.find().limit(5)

for record in records:
    print(record)

{'_id': ObjectId('679cb0e0f166a14143d82246'), 'Document Name': '2023 Government of Ireland Village map.pdf', 'Page Number': 1, 'Text': 'Block 4 National Ploughing Championships 2023 396 Government of Ireland Village Irish Coast Guard Block 2 Block 3 378 Civil 359 360 361 361 Defence Scan here to find Global Culture, We’re Taking GSI ROW 2 3 National out more about the Ireland Archives Creativi ty, Climate Action Drill Rig Innovation Government of Exhibition and Sport Ireland Village 362 An Garda Creative Wall Síochána 343 - 347 348 349 350 Our Rural Future: Irish Mobile 351 Health and Supporting People Defence Library Justice Sector ROW 22 Forces Safety Authority and Communities Supporting Victims of Crime Climbing Wall 335 336 337 338 339 Department of Agriculture, Putting Bord Bia Public Jobs, Irish Food Apprenticeships, You First ROW 21 Teagasc Food and the Marine Board Literacy and Supported Training by Citizens Information Board 339 Nature and Heritage Laois National 339 Directora

In [6]:
def get_document_with_highest_tf():
    client = MongoClient('mongodb://localhost:27017/')
    db = client['publications']
    collection = db['term_matrix_complete']
    
    docs = collection.find({"tf_matrix": {"$exists": True}})
    
    max_total_freq = 0
    max_doc = None
    for doc in docs:
        tf_matrix = doc.get('tf_matrix', [])
        total_frequency = sum(freq for _, freq in tf_matrix)
        
        if total_frequency > max_total_freq:
            max_total_freq = total_frequency
            max_doc = doc
    
    return max_doc, max_total_freq

document, total_freq = get_document_with_highest_tf()
if document:
    print(f"Document ID: {document['_id']}")
    print(f"Total Frequency: {total_freq}")
    print(f"TF Matrix: {document['tf_matrix']}")
else:
    print("No documents found.")


Document ID: 679cb1c3f166a14143d8a07b
Total Frequency: 1255
TF Matrix: [['actively', 2], ['addition', 2], ['additional', 5], ['affected', 2], ['agreed', 2], ['all', 5], ['an', 7], ['analysis', 4], ['and', 37], ['annual', 4], ['any', 2], ['application', 6], ['approach', 3], ['are', 9], ['areas', 3], ['as', 9], ['at', 4], ['available', 2], ['be', 26], ['been', 3], ['begin', 3], ['belgium', 2], ['benefit', 7], ['benefits', 4], ['by', 14], ['carried', 2], ['case', 6], ['certain', 4], ['cfp', 20], ['changes', 3], ['come', 2], ['commissions', 2], ['completed', 3], ['consider', 3], ['considered', 2], ['consultation', 7], ['context', 4], ['could', 4], ['council', 2], ['countries', 2], ['critical', 2], ['demersal', 7], ['departure', 2], ['do', 2], ['down', 2], ['each', 5], ['eg', 2], ['end', 6], ['essential', 5], ['eu', 6], ['existing', 3], ['expected', 4], ['fact', 2], ['first', 2], ['fish', 2], ['fishers', 2], ['following', 4], ['for', 28], ['forum', 4], ['france', 2], ['from', 9], ['full', 5