# Query Term Document Matrix with a set of keywords

In [1]:
from pymongo import MongoClient

### Function to query the term document matrix

This function will query the term document matrix. It will query the matrix for a set of keywords, and it will return the top 5 documents (Number can vary) with the frequecies of each individual keyword in each of the document.

The frequencies are determined by aggregating the frequency of each keywords through the collection. Steps followed are:

1. Connect to MongoDB.
2. Query the TDM collection.
3. Aggregate the frequencies of the specified keywords.
   Aggregation Pipeline: The pipeline consists of three stages:
   
    $project: Projects the File Name and calculates the TotalFrequency of the specified keywords.
    
    $sort: Sorts the documents by TotalFrequency in descending order.
    
    $limit: Limits the results to the top n documents.
    
    $ifNull: Ensures that if a keyword is not present in a document, it is treated as zero frequency.
5. Sort and return the top documents.

In [11]:
from pymongo import MongoClient

def find_top_documents_with_keyword_frequencies_or(db_name, tdm_collection_name, keywords, top_n=5, mongo_uri="mongodb://localhost:27017/"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    tdm_collection = db[tdm_collection_name]
    keywords = [keyword.replace(' ', '_') for keyword in keywords]
    
    pipeline = [
        {
            '$project': {
                'File Name': 1,
                **{keyword: {'$ifNull': [f'$Term Document Matrix.{keyword}', 0]} for keyword in keywords}
            }
        },
        {
            '$addFields': {
                'TotalFrequency': {
                    '$sum': [f'${keyword}' for keyword in keywords]
                }
            }
        },
        {
            '$match': {
                'TotalFrequency': {'$gt': 0}
            }
        },
        {'$sort': {'TotalFrequency': -1}},
        {'$limit': top_n},
        {
            '$project': {
                'File Name': 1,
                **{keyword: 1 for keyword in keywords}
            }
        }
    ]
    
    print("Pipeline:", pipeline)
    results = list(tdm_collection.aggregate(pipeline))
    print("Results:", results)
    
    for result in results:
        print(f"File Name: {result['File Name']}")
        for keyword in keywords:
            print(f"  {keyword}: {result[keyword]}")
    
    return results

In [13]:
keywords = ['dude']
find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', keywords)

Pipeline: [{'$project': {'File Name': 1, 'dude': {'$ifNull': ['$Term Document Matrix.dude', 0]}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$dude']}}}, {'$match': {'TotalFrequency': {'$gt': 0}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'dude': 1}}]
Results: []


[]

In [14]:
from pymongo import MongoClient

def find_top_documents_with_keyword_frequencies_and(db_name, tdm_collection_name, keywords, top_n=5, mongo_uri="mongodb://localhost:27017/"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    tdm_collection = db[tdm_collection_name]
    keywords = [keyword.replace(' ', '_') for keyword in keywords]
    
    pipeline = [
        {
            '$project': {
                'File Name': 1,
                **{keyword: {'$ifNull': [f'$Term Document Matrix.{keyword}', 0]} for keyword in keywords}
            }
        },
        {
            '$match': {
                **{keyword: {'$gt': 0} for keyword in keywords}
            }
        },
        {
            '$addFields': {
                'TotalFrequency': {
                    '$sum': [f'${keyword}' for keyword in keywords]
                }
            }
        },
        {'$sort': {'TotalFrequency': -1}},
        {'$limit': top_n},
        {
            '$project': {
                'File Name': 1,
                **{keyword: 1 for keyword in keywords}
            }
        }
    ]
    
    print("Pipeline:", pipeline)
    results = list(tdm_collection.aggregate(pipeline))
    print("Results:", results)
    
    for result in results:
        print(f"File Name: {result['File Name']}")
        for keyword in keywords:
            print(f"  {keyword}: {result[keyword]}")
    
    return results

In [15]:
keywords = ['dude']
find_top_documents_with_keyword_frequencies_and('transcripts', 'complete_documents', keywords)

Pipeline: [{'$project': {'File Name': 1, 'dude': {'$ifNull': ['$Term Document Matrix.dude', 0]}}}, {'$match': {'dude': {'$gt': 0}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$dude']}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'dude': 1}}]
Results: []


[]

### List of Keywords we want to query

In [4]:
def function_call(keywords):
    keyword_list = []
    
    if 'OR' in keywords:
        print('OR')
        keyword_list = [keyword.strip() for keyword in keywords.split('OR')]
        top_documents = find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', keyword_list, top_n=5)
    elif 'AND' in keywords:
        print('AND')
        keyword_list = [keyword.strip() for keyword in keywords.split('AND')]
        top_documents = find_top_documents_with_keyword_frequencies_and('transcripts', 'complete_documents', keyword_list, top_n=5)
    else:
        keyword_list.insert(0,str(AND_keywords))
        top_documents = find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', keyword_list, top_n=5)
    
    return top_documents

In [5]:
OR_keywords = 'dairy_sector OR milk OR emission'
AND_keywords = 'dairy_sector AND milk AND emission'

print('OR', function_call(OR_keywords))
print('AND', function_call(AND_keywords))

OR
Pipeline: [{'$project': {'File Name': 1, 'dairy_sector': {'$ifNull': ['$Term Document Matrix.dairy_sector', 0]}, 'milk': {'$ifNull': ['$Term Document Matrix.milk', 0]}, 'emission': {'$ifNull': ['$Term Document Matrix.emission', 0]}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$dairy_sector', '$milk', '$emission']}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'dairy_sector': 1, 'milk': 1, 'emission': 1}}]
Results: [{'_id': ObjectId('670521f275d9daeff229cdf7'), 'File Name': 'Joint Committee on Agriculture, Food and the Marine debate - Wednesday, 13 Apr 2022.pdf', 'dairy_sector': 2, 'milk': 123, 'emission': 0}, {'_id': ObjectId('670521f275d9daeff229ce22'), 'File Name': 'Joint Committee on Agriculture, Food and the Marine debate - Wednesday, 8 Mar 2023.pdf', 'dairy_sector': 0, 'milk': 111, 'emission': 0}, {'_id': ObjectId('670521f275d9daeff229cf8d'), 'File Name': 'Dáil Éireann debate - Wednesday, 4 Mar 2015.pdf', 'dairy_sector': 25, 'milk': 74, '

In [6]:
OR_keywords = 'sea-fisheries'
print("OR", find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', OR_keywords.split(' OR '), top_n=5))

Pipeline: [{'$project': {'File Name': 1, 'sea-fisheries': {'$ifNull': ['$Term Document Matrix.sea-fisheries', 0]}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$sea-fisheries']}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'sea-fisheries': 1}}]
Results: [{'_id': ObjectId('670521f275d9daeff229cded'), 'File Name': 'Joint Committee on Agriculture, Food and the Marine debate - Tuesday, 8 Nov 2016.pdf', 'sea-fisheries': 21}, {'_id': ObjectId('670521f275d9daeff229ce2e'), 'File Name': 'Select Committee on Agriculture, Food and the Marine debate - Tuesday, 23 Nov 2021.pdf', 'sea-fisheries': 12}, {'_id': ObjectId('670521f275d9daeff229ced7'), 'File Name': 'Dáil Éireann debate - Thursday, 9 Mar 2017.pdf', 'sea-fisheries': 10}, {'_id': ObjectId('670521f275d9daeff229cdd5'), 'File Name': 'Joint Committee on Agriculture, Food and the Marine debate - Tuesday, 27 Jun 2017.pdf', 'sea-fisheries': 10}, {'_id': ObjectId('670521f275d9daeff229ce92'), 'File Name': 'Dáil

In [7]:
keyword_list = []
AND_keywords = 'animal health levies '
keyword_list.insert(0,str(AND_keywords))
print(keyword_list)
top_documents = find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', keyword_list, top_n=5)


['animal health levies ']
Pipeline: [{'$project': {'File Name': 1, 'animal_health_levies_': {'$ifNull': ['$Term Document Matrix.animal_health_levies_', 0]}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$animal_health_levies_']}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'animal_health_levies_': 1}}]
Results: [{'_id': ObjectId('670521f275d9daeff229cd9b'), 'File Name': 'Joint Committee on Agriculture and the Marine debate - Tuesday, 15 Dec 2020.pdf', 'animal_health_levies_': 0}, {'_id': ObjectId('670521f275d9daeff229cd9c'), 'File Name': 'Joint Committee on Agriculture and the Marine debate - Tuesday, 20 Apr 2021.pdf', 'animal_health_levies_': 0}, {'_id': ObjectId('670521f275d9daeff229cd99'), 'File Name': 'Joint Committee on Agriculture and the Marine debate - Thursday, 17 Dec 2020.pdf', 'animal_health_levies_': 0}, {'_id': ObjectId('670521f275d9daeff229cd98'), 'File Name': 'Joint Committee on Agriculture and the Marine debate - Thursday, 12 Nov 2

In [8]:
k1 = 'renewable energy project'
k2 = ''

In [10]:
OR_keywords = 'agriculture appeal OR we'
print("OR", find_top_documents_with_keyword_frequencies_or('transcripts', 'complete_documents', OR_keywords.split(' OR '), top_n=5))

Pipeline: [{'$project': {'File Name': 1, 'agriculture_appeal': {'$ifNull': ['$Term Document Matrix.agriculture_appeal', 0]}, 'we': {'$ifNull': ['$Term Document Matrix.we', 0]}}}, {'$addFields': {'TotalFrequency': {'$sum': ['$agriculture_appeal', '$we']}}}, {'$sort': {'TotalFrequency': -1}}, {'$limit': 5}, {'$project': {'File Name': 1, 'agriculture_appeal': 1, 'we': 1}}]
Results: [{'_id': ObjectId('670521f275d9daeff229cf94'), 'File Name': 'Dáil Éireann debate - Wednesday, 6 Apr 2022.pdf', 'agriculture_appeal': 0, 'we': 1635}, {'_id': ObjectId('670521f275d9daeff229cec4'), 'File Name': 'Dáil Éireann debate - Thursday, 30 Sep 2021.pdf', 'agriculture_appeal': 0, 'we': 1608}, {'_id': ObjectId('670521f275d9daeff229ce90'), 'File Name': 'Dáil Éireann debate - Thursday, 17 Dec 2020.pdf', 'agriculture_appeal': 0, 'we': 1553}, {'_id': ObjectId('670521f275d9daeff229cf9b'), 'File Name': 'Dáil Éireann debate - Wednesday, 8 Dec 2021.pdf', 'agriculture_appeal': 0, 'we': 1519}, {'_id': ObjectId('670521f

# Query for multi-worded terms with _

In [9]:
from pymongo import MongoClient

# Replace with your MongoDB connection string
client = MongoClient('mongodb://localhost:27017/')
db = client['transcripts']
collection = db['complete_documents']

# Retrieve a sample document to inspect the structure
sample_document = collection.find_one()

# Print the sample document
print(sample_document)


{'_id': ObjectId('670521f275d9daeff229cd98'), 'File Name': 'Joint Committee on Agriculture and the Marine debate - Thursday, 12 Nov 2020.pdf', 'Category': 'Agri, Food and Marine', 'Text': 'AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOINT COMMITTEE ON AGRICULTURE AND THE MARINE\nDéardaoin, 12 Samhain 2020\nThursday, 12 November 2020\nTháinig an Comhchoiste le chéile ag 4 p.m.\nThe Joint Committee met at 4 p.m.\nComhaltaí a bhí i láthair / Members present:\nTeachtaí Dála / Deputies Seanadóirí / Senators\nMartin Browne, Victor Boyhan,\nMatt Carthy, Paul Daly,\nMichael Collins, Tim Lombard,\nMichael Fitzmaurice, Denis O’Donovan.\nPaul Kehoe,\nBrian Leddin,\nMichael Ring.\nI láthair / In attendance: Deputy Carol Nolan.\nTeachta / Deputy Jackie Cahill sa Chathaoir / in the Chair.DÁIL ÉIREANN\n12JAM\nBusiness of Joint Committee\nChairman : Before we begin, I remind members to turn off their mobile phones.  Members \nare requested to ensure that for the duration of the meeting their mobile phon

In [24]:
from pymongo import MongoClient

# Replace with your MongoDB connection string
client = MongoClient('mongodb://localhost:27017/')
db = client['transcripts']
collection = db['complete_documents']

# Define the aggregation pipeline
pipeline = [
    {
        "$project": {
            "term_keys": { "$objectToArray": "$Term Document Matrix" }  # Convert the 'Term Document Matrix' object into an array of key-value pairs
        }
    },
    {
        "$unwind": "$term_keys"  # Unwind the array so each key-value pair is a separate document
    },
    {
        "$match": {
            "term_keys.k": { "$regex": ".*\\d.*" }  # Match terms containing at least one digit
        }
    },
    {
        "$project": {
            "term_with_digits": "$term_keys.k",  # Extract term name
            "term_count": "$term_keys.v"  # Extract term count
        }
    }
]

# Run the aggregation query
results = collection.aggregate(pipeline)

# Print the results
for result in results:
    print(f"Term: {result['term_with_digits']}, Count: {result['term_count']}")

Term: 000, Count: 6
Term: 100, Count: 3
Term: 105(4, Count: 13
Term: 12, Count: 15
Term: 12jam, Count: 2
Term: 15, Count: 3
Term: 17, Count: 3
Term: 2, Count: 2
Term: 20, Count: 2
Term: 2007, Count: 3
Term: 2018, Count: 2
Term: 2019, Count: 21
Term: 2020, Count: 17
Term: 2022, Count: 5
Term: 27, Count: 2
Term: 3, Count: 8
Term: 37, Count: 2
Term: 4, Count: 6
Term: 5, Count: 3
Term: 6, Count: 11
Term: 60, Count: 3
Term: 633, Count: 4
Term: 81, Count: 2
Term: 000, Count: 7
Term: 1, Count: 16
Term: 10, Count: 7
Term: 100, Count: 6
Term: 11, Count: 2
Term: 1217, Count: 2
Term: 15, Count: 6
Term: 17, Count: 2
Term: 18, Count: 8
Term: 1920s, Count: 2
Term: 1970s, Count: 2
Term: 1976, Count: 3
Term: 1980s, Count: 2
Term: 1983, Count: 3
Term: 2, Count: 2
Term: 20, Count: 3
Term: 2006, Count: 2
Term: 2018, Count: 2
Term: 2019, Count: 2
Term: 2020, Count: 14
Term: 2021, Count: 2
Term: 22, Count: 2
Term: 265, Count: 2
Term: 30, Count: 7
Term: 300, Count: 2
Term: 387, Count: 2
Term: 40, Count: 4
T

In [25]:
from pymongo import MongoClient

# Replace with your MongoDB connection string
client = MongoClient('mongodb://localhost:27017/')
db = client['transcripts']
collection = db['P1_speaker_speech']

# Define the aggregation pipeline
pipeline = [
    {
        "$project": {
            "term_keys": { "$objectToArray": "$Term Document Matrix" }  # Convert the 'Term Document Matrix' object into an array of key-value pairs
        }
    },
    {
        "$unwind": "$term_keys"  # Unwind the array so each key-value pair is a separate document
    },
    {
        "$match": {
            "term_keys.k": { "$regex": ".*\\d.*" }  # Match terms containing at least one digit
        }
    },
    {
        "$project": {
            "term_with_digits": "$term_keys.k",  # Extract term name
            "term_count": "$term_keys.v"  # Extract term count
        }
    }
]

# Run the aggregation query
results = collection.aggregate(pipeline)

# Print the results
for result in results:
    print(f"Term: {result['term_with_digits']}, Count: {result['term_count']}")