In [5]:
from pymongo import MongoClient

# Open a connection to the MongoDB server on this machine (27017 is the port used here)
client = MongoClient(host="localhost", port=27017)

In [6]:
# Handle on the project database
db = client["NonRelProj"]

# Handles on the two collections used throughout the notebook:
# "Reviews" and "MetaData"
reviews = db["Reviews"]
meta = db["MetaData"]

# Data Cleaning

Even though we are only interested in restaurants, let's take a look at the categories of the businesses in the dataset.

In [7]:
# 'category' may hold several values per document (e.g., ['Pharmacy', 'Health']),
# so the pipeline unwinds the field, groups on each individual value to
# de-duplicate, and sorts the resulting values in ascending order.
category_pipeline = [
    {"$unwind": {"path": "$category"}},
    {"$group": {"_id": "$category"}},
    {"$sort": {"_id": 1}},
]
category_cursor = meta.aggregate(category_pipeline)

# Materialise the cursor: each result document carries one category in "_id"
categories = []
for doc in category_cursor:
    categories.append(doc["_id"])
print(categories)

['ATM', 'ATV dealer', 'ATV rental service', 'ATV repair shop', 'Aboriginal and Torres Strait Islander organisation', 'Aboriginal art gallery', 'Abortion clinic', 'Abrasives supplier', 'Accountant', 'Accounting firm', 'Acupuncture clinic', 'Acupuncturist', 'Acura dealer', 'Addiction treatment center', 'Adoption agency', 'Adult DVD store', 'Adult day care center', 'Adult education school', 'Adult entertainment club', 'Adult entertainment store', 'Adventure sports', 'Adventure sports center', 'Advertising agency', 'Advertising service', 'Aerial photographer', 'Aerial sports center', 'Aerospace company', 'After school program', 'Aged care', 'Aggregate supplier', 'Agricultural cooperative', 'Agricultural organization', 'Agricultural product wholesaler', 'Agricultural production', 'Agricultural service', 'Aikido club', 'Air compressor repair service', 'Air compressor supplier', 'Air conditioning contractor', 'Air conditioning repair service', 'Air conditioning store', 'Air conditioning syste

Let's delete all the businesses that do not have 'restaurant' in the 'category' field, as well as those whose 'state' is 'Permanently closed'.

In [8]:
# Filter describing the documents we want to KEEP: at least one element of
# the 'category' array contains "restaurant" (case-insensitive) and the
# business is not flagged as "Permanently closed".
restaurant_filter = {
    "category": {"$elemMatch": {"$regex": "restaurant",
                                "$options": "i"}},
    "state": {"$ne": "Permanently closed"},
}

# Step 1: Get the IDs of matching documents
matching_ids = [
    doc["_id"] for doc in meta.find(
        restaurant_filter,
        {"_id": 1}  # Fetch only the _id field for efficiency
    )
]

# Step 2: Count total documents before deletion
total_docs_before = meta.count_documents({})

# Step 3: Delete every document that does NOT satisfy the keep-filter.
# "$nor" with a single clause selects exactly the documents that fail the
# filter, so the complement is evaluated on the server. This avoids sending
# the whole _id list back inside a "$nin" (the command would grow with the
# number of kept documents and could hit the 16 MB BSON document limit), and
# documents inserted between the find above and this delete are judged by
# the filter itself rather than by a stale ID snapshot.
# The guard is kept so that an empty match never wipes the whole collection.
if matching_ids:
    result = meta.delete_many({"$nor": [restaurant_filter]})
    # Get count of deleted documents
    total_deleted = result.deleted_count
    # Count remaining documents
    total_docs_after = meta.count_documents({})

    print(f"Deleted {total_deleted} documents that did not match the query.")
    print(f"Total documents before: {total_docs_before}, after: {total_docs_after}")
else:
    print("No matching documents found.")


Deleted 17782 documents that did not match the query.
Total documents before: 21507, after: 3725


Let's remove the attributes that are not relevant for our purposes: 'relative_results' and 'url'.

In [9]:
# Fields to strip from every document still in the collection
# (the empty filter {} below selects all documents).
fields_to_drop = {"relative_results": "", "url": ""}
meta.update_many({}, {"$unset": fields_to_drop})

UpdateResult({'n': 3725, 'nModified': 3725, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)