# Create Term - Document Matrix

In [4]:
from pymongo import MongoClient, errors
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from concurrent.futures import ThreadPoolExecutor
import threading
import logging
import re
from bson import ObjectId

### Checking the MongoDB Connection.

#### Source Connection will be the collection containing the extracted and cleaned texts from the transcript documents.
#### Term Document Collection will be the collection used to store the Term Document Frequencies.

In [5]:
# Create a MongoDB client for the server on localhost, port 27017.
client = MongoClient("localhost", 27017)

In [6]:
# Select the 'transcripts' database on the client created above.
db = client['transcripts']
# NOTE: despite its name, this is the collection object returned by db[...],
# not a string holding the collection's name.
collection_name = db['complete_documents']

### Creating Term Document Frequencies

The term-document frequencies are evaluated for the extracted and cleaned text of the transcript documents. This text is stored in the 'complete_documents' collection. The function below evaluates the term frequencies from the 'Text' field of each document in this collection; department names and keyword phrases are first merged into single underscore-joined tokens so that each multi-word phrase is counted as one term. The evaluated frequencies are then filtered: only terms that occur at least `min_term_count` times are kept (the call below uses `min_term_count=2`, which eliminates terms that occur only once). The filtered terms and their frequencies are stored back on the same document under the 'Term Document Matrix' field.

In [7]:
logging.basicConfig(level=logging.INFO)


def _compile_phrase_pattern(phrases):
    """Compile a regex matching any of *phrases* as a whole-word phrase.

    Phrases are lower-cased and stripped of surrounding whitespace; values
    that are not strings, or that are blank after stripping, are ignored.
    Returns None when no usable phrase remains.
    """
    # Stripping matters: a phrase stored with a leading/trailing space would
    # otherwise match that space too and turn it into a stray '_' on the token.
    usable = {p.strip().lower() for p in phrases if isinstance(p, str)}
    # A blank alternative matches the empty string at every word boundary and
    # would stop every alternative listed after it from ever being tried.
    usable.discard('')
    if not usable:
        return None
    # Regex alternation takes the first alternative that matches, so longer
    # phrases must come first or a shorter phrase that is a prefix of a longer
    # one would always win. The secondary sort key keeps the order stable.
    ordered = sorted(usable, key=lambda p: (-len(p), p))
    return re.compile(r'\b(' + '|'.join(re.escape(p) for p in ordered) + r')\b', re.IGNORECASE)


def _phrase_to_token(match):
    """Turn a matched phrase into one token: spaces -> '_', parentheses removed."""
    return match.group(0).replace(' ', '_').replace('(', '').replace(')', '')


def update_tdm_in_mongodb(
    db_name,
    collection_name,
    dept_db_name,
    dept_collection_name,
    keywords_db_name,
    keywords_collection_name,
    min_term_count=1,
    uri="mongodb://localhost:27017/",
):
    """Compute per-document term frequencies and store them on each document.

    For every document in ``db_name.collection_name`` that has a non-empty
    "Text" field, the text is tokenized, term frequencies are counted, and the
    terms whose frequency is at least ``min_term_count`` are written back to
    the same document under the "Term Document Matrix" field.

    Department names (field 'dept_name' of ``dept_db_name.dept_collection_name``)
    and keyword phrases (field 'Relevant Topics' of
    ``keywords_db_name.keywords_collection_name``) are treated as single terms:
    each occurrence in the text is collapsed into one underscore-joined token
    before counting.

    Args:
        db_name: Database holding the documents to process.
        collection_name: Collection holding the documents to process.
        dept_db_name: Database holding the department names.
        dept_collection_name: Collection holding the department names.
        keywords_db_name: Database holding the keyword phrases.
        keywords_collection_name: Collection holding the keyword phrases.
        min_term_count: Minimum frequency (inclusive) for a term to be stored.
        uri: MongoDB connection URI.

    Returns:
        None. Errors are logged rather than raised, as before.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(uri)
        collection = client[db_name][collection_name]

        # Fetch department names and keyword phrases; entries lacking the
        # field yield None and are dropped when the pattern is compiled.
        dept_collection = client[dept_db_name][dept_collection_name]
        department_names = [dept.get('dept_name') for dept in dept_collection.find()]

        keywords_collection = client[keywords_db_name][keywords_collection_name]
        keyword_phrases = [kw.get('Relevant Topics') for kw in keywords_collection.find()]

        # One pattern for both lists (None when there is nothing to match)
        combined_pattern = _compile_phrase_pattern(department_names + keyword_phrases)

        # Custom tokenizer to handle combined terms
        def custom_tokenizer(text):
            text = text.lower()  # Convert text to lowercase
            if combined_pattern is not None:
                # Replace combined terms with a single token
                text = combined_pattern.sub(_phrase_to_token, text)
            # Tokens may contain word characters, hyphens, apostrophes and parentheses
            return re.findall(r'\b[\w\-_\'\(\)]+\b', text)

        # token_pattern=None: the default pattern is unused once a tokenizer is
        # supplied, and leaving it set makes scikit-learn warn on every fit.
        vectorizer = CountVectorizer(tokenizer=custom_tokenizer, token_pattern=None)

        # Only _id and Text are needed. The result is materialized up front so
        # the updates below never run against a cursor that is still open.
        documents = list(collection.find({}, {"Text": 1}))

        updated_ids = []

        for doc in documents:
            text = doc.get("Text", "")
            if not text:
                continue

            # Create the term-document matrix for this single document
            try:
                X = vectorizer.fit_transform([text])
            except ValueError as e:
                # Raised when the text produces no tokens at all (empty
                # vocabulary); skip this document instead of aborting the run.
                logging.warning("Skipping document %s: %s", doc["_id"], e)
                continue
            terms = vectorizer.get_feature_names_out()
            frequencies = X.toarray().ravel()  # the first (and only) row

            logging.info("Terms: %s", terms)
            logging.info("Frequencies: %s", frequencies)

            # Keep the terms whose frequency is at least min_term_count
            term_doc_matrix = {
                str(term): int(count)
                for term, count in zip(terms, frequencies)
                if count >= min_term_count
            }

            # Use the _id exactly as fetched: round-tripping it through
            # ObjectId(str(...)) fails for any _id that is not an ObjectId.
            doc_id = doc["_id"]

            # Update the document with the term-document matrix
            collection.update_one({"_id": doc_id}, {"$set": {"Term Document Matrix": term_doc_matrix}})
            updated_ids.append(doc_id)

        logging.info("Term-document matrices have been added to the collection.")
        logging.info("Updated document IDs:")
        for doc_id in updated_ids:
            logging.info(doc_id)

    except errors.PyMongoError as e:
        logging.error(f"MongoDB error: {e}")
    except Exception as e:
        logging.error(f"An error occurred: {e}")
    finally:
        # Release the connection pool opened by this call
        if client is not None:
            client.close()


In [8]:
# Transcript documents to process
db_name, collection_name = 'transcripts', 'complete_documents'
# Sources of the multi-word phrases (department names and keywords)
dept_db_name, dept_collection_name = 'foodsystems', 'departments_and_agencies'
keywords_db_name, keywords_collection_name = 'foodsystems', 'keywords'

# Store only the terms that occur at least twice in a document
update_tdm_in_mongodb(
    db_name=db_name,
    collection_name=collection_name,
    dept_db_name=dept_db_name,
    dept_collection_name=dept_collection_name,
    keywords_db_name=keywords_db_name,
    keywords_collection_name=keywords_collection_name,
    min_term_count=2,
)

INFO:root:Terms: ['000' '01' '02' ... 'zolvix' 'zoom' 'éireann']
INFO:root:Frequencies: [6 1 1 ... 1 1 1]
INFO:root:Terms: ['000' '04' '1' ... 'éagmais' 'éireann' 'ó']
INFO:root:Frequencies: [ 7  1 16 ...  1  1  3]
INFO:root:Terms: ['0' '000' '1' ... 'zoom' 'éagmais' 'éireann']
INFO:root:Frequencies: [ 1  3 23 ...  1  1  1]
INFO:root:Terms: ['000' '04' '05' ... 'you' 'zealand' 'éireann']
INFO:root:Frequencies: [22  1  1 ...  2  1  1]
INFO:root:Terms: ['000' '10' '100' ... 'your' 'éagmais' 'éireann']
INFO:root:Frequencies: [7 8 4 ... 2 2 1]
INFO:root:Terms: ['000' '015' '08' ... 'you' 'zolvix' 'éireann']
INFO:root:Frequencies: [1 1 1 ... 8 1 1]
INFO:root:Terms: ['000' '1' '100' ... 'your' 'zero' 'éireann']
INFO:root:Frequencies: [3 4 3 ... 1 1 1]
INFO:root:Terms: ['0' '000' '074' ... 'you' 'young' 'éireann']
INFO:root:Frequencies: [1 7 1 ... 1 8 1]
INFO:root:Terms: ['000' '05' '06' ... 'your' 'zealand' 'éireann']
INFO:root:Frequencies: [1 1 1 ... 1 1 1]
INFO:root:Terms: ['000' '1' '10' 

In [9]:
def clean_term(term):
    """Return *term* without any underscores at its start or end.

    Underscores inside the term are left untouched.
    """
    underscore = '_'
    return term.lstrip(underscore).rstrip(underscore)

In [10]:
# Sample terms: one with a trailing underscore, one with a leading underscore
k1 = 'animal_health_levies_'
k2 = '_regulations_2019'
# Print each cleaned term on its own line
print(clean_term(k1), clean_term(k2), sep='\n')

animal_health_levies
regulations_2019


In [11]:
def clean_term_document_matrix(term_document_matrix):
    """Return a new term -> frequency dict whose keys went through clean_term.

    When several raw terms clean to the same key (for example 'levies_' and
    'levies'), their frequencies are added together so that no count is lost.

    Args:
        term_document_matrix: Mapping of raw term to its frequency.

    Returns:
        A new dict mapping each cleaned term to its (summed) frequency. The
        input mapping is not modified.
    """
    cleaned_matrix = {}
    for term, frequency in term_document_matrix.items():
        cleaned_term = clean_term(term)
        # Accumulate instead of assigning: plain assignment would let the last
        # colliding term overwrite the frequencies of the earlier ones.
        cleaned_matrix[cleaned_term] = cleaned_matrix.get(cleaned_term, 0) + frequency
    return cleaned_matrix

def update_term_document_matrix(collection):
    """Clean the stored "Term Document Matrix" of every document in *collection*.

    Each document that has a "Term Document Matrix" field gets that field
    replaced by the result of clean_term_document_matrix. Documents whose
    matrix is already clean are left untouched.

    Args:
        collection: The MongoDB collection whose documents are updated in place.
    """
    field = "Term Document Matrix"

    # Ask the server only for documents that have the field, and only for that
    # field (plus _id); the rest of each document is not needed here.
    documents = collection.find({field: {"$exists": True}}, {field: 1})

    for document in documents:
        term_document_matrix = document.get(field)
        if not term_document_matrix:
            # Null or empty matrix: nothing to clean
            continue

        cleaned_matrix = clean_term_document_matrix(term_document_matrix)
        if cleaned_matrix == term_document_matrix:
            # Already clean; skip the redundant write
            continue

        collection.update_one(
            {"_id": document["_id"]},
            {"$set": {field: cleaned_matrix}}
        )

In [12]:
# Connect to the local MongoDB server and select the collection of transcript documents.
client = MongoClient("mongodb://localhost:27017/")
db = client["transcripts"]
collection = db["complete_documents"]

# Replace each stored "Term Document Matrix" with its underscore-trimmed version.
update_term_document_matrix(collection)