In [1]:
import pandas as pd
from pymongo import MongoClient
import re
from collections import defaultdict

In [2]:
def query_tdm_or_condition(db_name, collection_name, keywords, num_results=5, mongo_uri="mongodb://localhost:27017/"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    regex_patterns = []
    for keyword in keywords:
        if '*' in keyword:
            pattern = re.compile(f".*{keyword.replace('*', '.*')}.*", re.IGNORECASE)
        else:
            pattern = re.compile(f"^{keyword}$", re.IGNORECASE)
        regex_patterns.append(pattern)

    pipeline = [
        {"$group": {
            "_id": "$Document Name", 
            "tf_matrix": {"$push": "$tf_matrix"}, 
        }},
        {"$project": {
            "Document Name": "$_id",
            "tf_matrix": {"$reduce": {
                "input": "$tf_matrix",
                "initialValue": [],
                "in": {"$concatArrays": ["$$value", "$$this"]}, 
            }}
        }},
    ]
    
    aggregated_docs = list(collection.aggregate(pipeline))
    results = []

    for doc in aggregated_docs:
        tf_matrix = doc.get("tf_matrix", [])
        total_freq = 0
        term_frequencies = defaultdict(int)
        matched_terms = set()

        for term_freqs in tf_matrix:
            if isinstance(term_freqs, list) and len(term_freqs) == 2:
                term, freq = term_freqs
                term_frequencies[term] += freq
            else:
                print(f"Unexpected structure in tf_matrix: {term_freqs}")
                continue

        for keyword in keywords:
            if keyword in term_frequencies:
                total_freq += term_frequencies[keyword]

        matched_keywords = set()
        for keyword in keywords:
            keyword_matched = False
            for term in term_frequencies:
                for pattern in regex_patterns:
                    if pattern.search(term):
                        matched_keywords.add(keyword)
                        keyword_matched = True
                        break
            if not keyword_matched:
                break

        if len(matched_keywords) == len(keywords):
            results.append({
                "Document Name": doc.get("Document Name", "Unknown"),
                "Total Frequency": total_freq,
                "Term Frequencies": {term: term_frequencies[term] for term in keywords if term in term_frequencies}
            })
    
    results.sort(key=lambda x: x["Total Frequency"], reverse=True)
    return results[:num_results]

In [3]:
top_docs = query_tdm_or_condition("publications", "term_matrix_complete", ["dairy", "milk", "inheritance"])

for result in top_docs:
    print(f"Document Name: {result['Document Name']}")
    print(f"Total Frequency: {result['Total Frequency']}")
    relevant_terms = {term: freq for term, freq in result['Term Frequencies'].items() if term in keywords}
    for term, freq in relevant_terms.items():
        print(f"  {term}: {freq}")
    print("-" * 40)

Document Name: Volatility and Risk in Irish Agriculture - REAG-0511-6268.pdf
Total Frequency: 592


NameError: name 'keywords' is not defined

In [None]:
def query_tdm_and_condition(db_name, collection_name, keywords, num_results=5, mongo_uri="mongodb://localhost:27017/"):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    regex_patterns = []
    for keyword in keywords:
        if '*' in keyword:
            pattern = re.compile(f".*{keyword.replace('*', '.*')}.*", re.IGNORECASE)
        else:
            pattern = re.compile(f"^{keyword}$", re.IGNORECASE)
        regex_patterns.append(pattern)

    pipeline = [
        {"$group": {
            "_id": "$Document Name", 
            "tf_matrix": {"$push": "$tf_matrix"}, 
        }},
        {"$project": {
            "Document Name": "$_id",
            "tf_matrix": {"$reduce": {
                "input": "$tf_matrix",
                "initialValue": [],
                "in": {"$concatArrays": ["$$value", "$$this"]}, 
            }}
        }},
    ]
    
    aggregated_docs = list(collection.aggregate(pipeline))
    results = []

    for doc in aggregated_docs:
        tf_matrix = doc.get("tf_matrix", [])
        total_freq = 0
        term_frequencies = defaultdict(int)

        for term_freqs in tf_matrix:
            if isinstance(term_freqs, list) and len(term_freqs) == 2:
                term, freq = term_freqs
                term_frequencies[term] += freq
            else:
                print(f"Unexpected structure in tf_matrix: {term_freqs}")
                continue

        all_keywords_found = all(keyword in term_frequencies for keyword in keywords)

        if all_keywords_found:
            for keyword in keywords:
                total_freq += term_frequencies[keyword]

            results.append({
                "Document Name": doc.get("Document Name", "Unknown"),
                "Total Frequency": total_freq,
                "Term Frequencies": {term: term_frequencies[term] for term in keywords if term in term_frequencies}
            })
    
    results.sort(key=lambda x: x["Total Frequency"], reverse=True)
    return results[:num_results]

In [None]:
keywords = ["dairy", "milk", "inheritance"]
result_docs = query_tdm_and_condition("publications", "term_matrix_complete", keywords)

for result in result_docs:
    print(f"Document Name: {result['Document Name']}")
    print(f"Total Frequency: {result['Total Frequency']}")
    relevant_terms = {term: freq for term, freq in result['Term Frequencies'].items() if term in keywords}
    for term, freq in relevant_terms.items():
        print(f"  {term}: {freq}")
    print("-" * 40)

In [None]:
def function_call_inc(keywords):
    keyword_list = []
    
    if 'OR' in keywords:
        print('OR')
        keyword_list = [keyword.strip().lower() for keyword in keywords.split('OR')]
        top_documents = query_tdm_or_condition('publications', 'term_matrix_complete', keyword_list, num_results=1000)
    elif 'AND' in keywords:
        print('AND')
        keyword_list = [keyword.strip().lower() for keyword in keywords.split('AND')]
        print(keyword_list)
        top_documents = query_tdm_and_condition('publications', 'term_matrix_complete', keyword_list, num_results=1000)
    else:
        keyword_list.insert(0,str(keywords))
        top_documents = query_tdm_or_condition('publications', 'term_matrix_complete', keyword_list, num_results=1000)
    
    return top_documents

# Dairy and Carbon

In [None]:
kw = 'dairy OR dairying OR dairyman OR dairymen OR dairymaid OR dairymaids OR dairyland OR milk OR cattle'
dairy_doc = function_call_inc(kw)
print(dairy_doc)

In [None]:
kw_carbon = 'carbon OR neutrality OR emissions OR abatement'
carbon_doc = function_call_inc(kw_carbon)
print(carbon_doc)

In [None]:
def find_matching_files(doc_list_other, doc_list_dairy):
    file_names_set_dairy = {doc['Document Name'] for doc in doc_list_dairy}
    matching_records = [other_doc for other_doc in doc_list_other if other_doc['Document Name'] in file_names_set_dairy]
    return matching_records

In [None]:
matching_records = find_matching_files(carbon_doc, dairy_doc)

print("Matching Records:", len(matching_records))
print(len(carbon_doc))
print(len(dairy_doc))

In [None]:
dairy_carbon_doc_with_frequencies = pd.DataFrame(matching_records)
dairy_carbon_doc_with_frequencies['Query Category'] = "Dairy and Carbon"
dairy_carbon_doc_with_frequencies = dairy_carbon_doc_with_frequencies.loc[:, (dairy_carbon_doc_with_frequencies != 0).any()]
print(dairy_carbon_doc_with_frequencies)

In [None]:
dairy_carbon_doc_with_frequencies["Total Freq"] = dairy_carbon_doc_with_frequencies.select_dtypes(include=['number']).sum(axis=1)
dairy_carbon_doc_with_frequencies['Year'] = dairy_carbon_doc_with_frequencies['Document Name'].str.extract(r'(\d{4})')
print(dairy_carbon_doc_with_frequencies)

# Dairy and Water

In [None]:
kw_water = 'water OR waterways OR rivers OR run-off OR nitrogen OR nitrification OR nitrous OR nitrates OR nitrifying OR algae OR algal OR alginate OR derogation OR eutrophication OR bloom OR effluent OR discharge'
water_doc = function_call_inc(kw_water)
print(water_doc)

In [None]:
matching_records_water = find_matching_files(water_doc, dairy_doc)

print("Matching Records:", len(matching_records_water))
print(len(water_doc))
print(len(dairy_doc))

In [None]:
dairy_water_doc_with_frequencies = pd.DataFrame(matching_records_water)
dairy_water_doc_with_frequencies['Query Category'] = "Dairy and Water"
dairy_water_doc_with_frequencies = dairy_water_doc_with_frequencies.loc[:, (dairy_water_doc_with_frequencies != 0).any()]
print(dairy_water_doc_with_frequencies)

In [None]:
dairy_water_doc_with_frequencies["Total Freq"] = dairy_water_doc_with_frequencies.select_dtypes(include=['number']).sum(axis=1)
dairy_water_doc_with_frequencies['Year'] = dairy_water_doc_with_frequencies['Document Name'].str.extract(r'(\d{4})')
print(dairy_water_doc_with_frequencies)

# Dairy and Livelihood

In [None]:
kw_livelihood = 'inherit OR heritage OR inheritable OR inheritance OR inherited OR inheritor OR heritable OR succession OR rural development OR community OR social sustainability OR society OR economic viability'
livelihood_doc = function_call_inc(kw_livelihood)
print(livelihood_doc)

In [None]:
matching_records_livelihood = find_matching_files(livelihood_doc, dairy_doc)

print("Matching Records:", len(matching_records_livelihood))
print(len(livelihood_doc))
print(len(dairy_doc))

In [None]:
dairy_livelihood_doc_with_frequencies = pd.DataFrame(matching_records_livelihood)
dairy_livelihood_doc_with_frequencies['Query Category'] = "Dairy and Livelihood"
dairy_livelihood_doc_with_frequencies = dairy_livelihood_doc_with_frequencies.loc[:, (dairy_livelihood_doc_with_frequencies != 0).any()]
print(dairy_livelihood_doc_with_frequencies)

In [None]:
dairy_livelihood_doc_with_frequencies["Total Freq"] = dairy_livelihood_doc_with_frequencies.select_dtypes(include=['number']).sum(axis=1)
dairy_livelihood_doc_with_frequencies['Year'] = dairy_livelihood_doc_with_frequencies['Document Name'].str.extract(r'(\d{4})')
print(dairy_livelihood_doc_with_frequencies)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))

plt.plot(dairy_carbon_doc_with_frequencies['Total Freq'], label='Carbon')
plt.plot(dairy_water_doc_with_frequencies['Total Freq'], label='Water')
plt.plot(dairy_livelihood_doc_with_frequencies['Total Freq'], label='Livelihood')

plt.xlabel("Index")
plt.ylabel("Total Frequency")
plt.title("Comparison of Total Frequencies")

plt.legend()

plt.grid(True, linestyle="--", alpha=0.5)

plt.show()

In [None]:
output_file = "publicationstf.xlsx"

with pd.ExcelWriter(output_file) as writer:
    dairy_carbon_doc_with_frequencies.to_excel(writer, sheet_name="Carbon", index=False)
    dairy_water_doc_with_frequencies.to_excel(writer, sheet_name="Water", index=False)
    dairy_livelihood_doc_with_frequencies.to_excel(writer, sheet_name="Livelihood", index=False)

print(f"DataFrames exported to {output_file}")