In [1]:
import os
import json
from pymongo import MongoClient
from bson import ObjectId, json_util, Timestamp
import logging
from pymongo import errors
import time
import re
from datetime import datetime

In [2]:
pip install pymongo


Defaulting to user installation because normal site-packages is not writeable
Collecting pymongo
  Downloading pymongo-4.9.1-cp312-cp312-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.9.1-cp312-cp312-win_amd64.whl (923 kB)
   ---------------------------------------- 0.0/923.4 kB ? eta -:--:--
   --- ------------------------------------ 71.7/923.4 kB 1.3 MB/s eta 0:00:01
   ----------- ---------------------------- 256.0/923.4 kB 2.6 MB/s eta 0:00:01
   ------------------------- -------------- 583.7/923.4 kB 4.1 MB/s eta 0:00:01
   ---------------------------------------  921.6/923.4 kB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 923.4/923.4 kB 4.5 MB/s eta 0:00:00
Downloading dnspython-2.6.1-py3-none-any.whl (307 kB)
   ---------------------------------------- 0.0/307.7 kB ? eta -:--:--
   -------------------------------- ------ 256.0/307.7 kB 16


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# The database can be created manually, or from Python as shown below:
import pymongo

def create_database(db_name):
    """Report whether the MongoDB database ``db_name`` already holds collections.

    MongoDB creates a database lazily, on the first write. Referencing
    ``client[db_name]`` therefore creates nothing, and a database without
    collections is reported as "not existing yet" rather than "created".

    Args:
        db_name: Name of the database to look up on the local server.

    Returns:
        None. The outcome is printed.
    """
    client = pymongo.MongoClient("mongodb://localhost:27017/")  # Replace with your connection string
    try:
        db = client[db_name]
        if db.list_collection_names():
            print(f"Database '{db_name}' already exists.")
        else:
            # Nothing has been created at this point: say so instead of claiming success.
            print(f"Database '{db_name}' has no collections yet; MongoDB will create it on the first insert.")
    finally:
        # Always release the connection pool, even if the server is unreachable.
        client.close()
# Report whether the 'Dajan' database already contains any collections.
create_database("Dajan")

In [4]:
# Insert each JSON file into its own collection (named after the file), regardless of the data format inside.
# Configure logging so every progress message carries a timestamp and a severity level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def parse_json(data):
    """Convert MongoDB Extended JSON wrappers inside ``data`` to native values.

    Recognised wrappers:
      * ``{"$oid": "..."}``        -> ``ObjectId``
      * ``{"$date": ...}``         -> ``datetime`` (ISO-8601 string, epoch
        milliseconds, or ``{"$numberLong": "<ms>"}``)
      * ``{"$numberInt": "..."}``  -> ``int``
      * ``{"$numberLong": "..."}`` -> ``int``
    A string ``_id`` of the form ``"b'...'"`` is converted to ``ObjectId``.

    Args:
        data: An already-decoded JSON value (normally a dict).

    Returns:
        The converted structure. If conversion fails, returns
        ``{"raw_content": str(data)}`` so the caller can still store the record.
    """
    # Local import: the file-level import only provides `datetime` itself.
    from datetime import timedelta, timezone

    epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)

    def parse_date(value):
        # Canonical form {"$date": {"$numberLong": "<ms>"}}. json applies the
        # hook to the innermost object first, so the inner wrapper has usually
        # already become an int by the time we get here; both shapes are handled.
        if isinstance(value, dict) and '$numberLong' in value:
            value = int(value['$numberLong'])
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            # BSON dates are UTC milliseconds since the epoch. Timedelta
            # arithmetic also works for pre-1970 values, which
            # datetime.fromtimestamp rejects on some platforms.
            return epoch + timedelta(milliseconds=value)
        if isinstance(value, str) and value.endswith(('Z', 'z')):
            # fromisoformat only accepts a trailing "Z" on Python >= 3.11.
            value = value[:-1] + '+00:00'
        return datetime.fromisoformat(value)

    def object_hook(dct):
        for key, value in dct.items():
            if key == '$oid':
                return ObjectId(value)
            elif key == '$date':
                return parse_date(value)
            elif key == '$numberInt':
                return int(value)
            elif key == '$numberLong':
                return int(value)
        if '_id' in dct and isinstance(dct['_id'], str) and dct['_id'].startswith("b'") and dct['_id'].endswith("'"):
            dct['_id'] = ObjectId(dct['_id'][2:-1])
        return dct

    try:
        # Round-trip through JSON text so object_hook visits every nested object.
        return json.loads(json.dumps(data), object_hook=object_hook)
    except Exception as exc:
        # Best effort: keep the record as raw text, but no longer swallow
        # KeyboardInterrupt/SystemExit or hide why the conversion failed.
        logging.warning(f"Could not convert extended JSON, storing raw content: {exc}")
        return {"raw_content": str(data)}

def _decode_json_documents(content):
    """Decode every top-level JSON value in ``content`` into a list of dict documents."""
    decoder = json.JSONDecoder()
    skip_whitespace = re.compile(r'\s*')
    documents = []
    position = skip_whitespace.match(content, 0).end()

    while position < len(content):
        try:
            value, end = decoder.raw_decode(content, position)
        except json.JSONDecodeError:
            # Best effort: keep whatever could not be decoded as raw text.
            documents.append({"raw_content": content[position:]})
            break

        # A top-level array contributes one document per element.
        for item in (value if isinstance(value, list) else [value]):
            parsed = parse_json(item) if isinstance(item, dict) else None
            if not isinstance(parsed, dict):
                # Scalars (or wrappers that collapse to a scalar) cannot be
                # inserted as documents; store their JSON text instead.
                parsed = {"raw_content": json.dumps(item)}
            documents.append(parsed)

        position = skip_whitespace.match(content, end).end()

    return documents


def load_json_data(file_path, db, collection_name):
    """Load one JSON file into ``db[collection_name]``.

    The file may hold a single object, several concatenated or
    newline-delimited objects, or a top-level array of objects. Values are
    decoded with a real JSON decoder rather than by splitting the text on
    ``}{``, so trailing newlines, arrays and braces inside strings are
    handled. Text that cannot be decoded is stored as
    ``{"raw_content": <text>}``. An empty file inserts nothing.

    Args:
        file_path: Path of the JSON file to read (UTF-8, optional BOM).
        db: Database-like object; ``db[collection_name]`` must provide
            ``insert_many``.
        collection_name: Name of the target collection.

    Returns:
        Tuple ``(successful_inserts, failed_inserts)``. Errors while reading
        or inserting are logged, not raised.
    """
    collection = db[collection_name]
    successful_inserts = 0
    failed_inserts = 0

    try:
        # utf-8-sig strips a leading BOM, which would otherwise break decoding.
        with open(file_path, 'r', encoding='utf-8-sig') as json_file:
            content = json_file.read()

        data = _decode_json_documents(content)

        # Insert data in batches
        batch_size = 1000
        for i in range(0, len(data), batch_size):
            batch = data[i:i+batch_size]
            try:
                # ordered=False lets the server continue past individual failures.
                result = collection.insert_many(batch, ordered=False)
                successful_inserts += len(result.inserted_ids)
            except errors.BulkWriteError as e:
                successful_inserts += e.details.get('nInserted', 0)
                failed_inserts += len(e.details.get('writeErrors', []))

    except Exception as e:
        logging.error(f"Error processing {file_path}: {e}")

    logging.info(f"File {file_path}: Inserted {successful_inserts} documents, Failed {failed_inserts} documents")
    return successful_inserts, failed_inserts

def import_json_files_to_mongodb(directory_path, db_name):
    """Import every ``.json`` file in ``directory_path`` into its own collection.

    Each file is loaded with ``load_json_data`` into a collection named after
    the file (without its extension) in the database ``db_name`` on the local
    MongoDB server. Files are processed in sorted order so that runs are
    deterministic and the "file i/N" progress counter matches the number of
    JSON files.

    Args:
        directory_path: Directory containing the JSON files.
        db_name: Name of the target database.

    Returns:
        None. Progress and totals are written to the log.
    """
    # Validate the input before opening a connection that would otherwise leak.
    if not os.path.isdir(directory_path):
        logging.error(f"Directory {directory_path} does not exist.")
        return

    # List the directory once; only regular files with a .json extension count.
    json_files = sorted(
        file_name for file_name in os.listdir(directory_path)
        if file_name.lower().endswith('.json')
        and os.path.isfile(os.path.join(directory_path, file_name))
    )
    total_files = len(json_files)
    total_successful = 0
    total_failed = 0

    client = MongoClient('mongodb://localhost:27017/')
    try:
        db = client[db_name]

        # Enumerate the filtered list so the index never exceeds total_files.
        for index, file_name in enumerate(json_files, 1):
            file_path = os.path.join(directory_path, file_name)
            collection_name = os.path.splitext(file_name)[0]
            logging.info(f"Processing file {index}/{total_files}: {file_name}")

            # perf_counter is monotonic, so the duration cannot go negative.
            start_time = time.perf_counter()
            successful, failed = load_json_data(file_path, db, collection_name)
            end_time = time.perf_counter()

            total_successful += successful
            total_failed += failed

            logging.info(f"Completed {file_name} in {end_time - start_time:.2f} seconds")
    finally:
        # Release the connection pool even if an import step raises.
        client.close()

    logging.info(f"Import process completed. Total files: {total_files}")
    logging.info(f"Total successful inserts: {total_successful}")
    logging.info(f"Total failed inserts: {total_failed}")

# Example run: import every JSON file from the local data directory into the 'Dajan' database.
# NOTE(review): the absolute Windows path below is machine-specific; adjust it before running elsewhere.
directory_path = r'C:\Users\admin\mongodb\Json files'
db_name = 'Dajan'
import_json_files_to_mongodb(directory_path, db_name)

2024-09-22 17:23:02,414 - INFO - Processing file 2/463: 2023q1dailydemandjson.json
2024-09-22 17:23:02,515 - INFO - File C:\Users\admin\mongodb\Json files\2023q1dailydemandjson.json: Inserted 1 documents, Failed 0 documents
2024-09-22 17:23:02,517 - INFO - Completed 2023q1dailydemandjson.json in 0.10 seconds
2024-09-22 17:23:02,518 - INFO - Processing file 3/463: 3-20-DELFSHAVEN.city.json
2024-09-22 17:23:04,270 - INFO - File C:\Users\admin\mongodb\Json files\3-20-DELFSHAVEN.city.json: Inserted 1 documents, Failed 0 documents
2024-09-22 17:23:04,302 - INFO - Completed 3-20-DELFSHAVEN.city.json in 1.78 seconds
2024-09-22 17:23:04,304 - INFO - Processing file 4/463: academias.json
2024-09-22 17:23:04,366 - INFO - File C:\Users\admin\mongodb\Json files\academias.json: Inserted 1 documents, Failed 0 documents
2024-09-22 17:23:04,368 - INFO - Completed academias.json in 0.06 seconds
2024-09-22 17:23:04,370 - INFO - Processing file 5/463: accounts.json
2024-09-22 17:23:04,518 - INFO - File C

In [3]:
print(os.getcwd()) # Show the current working directory; used to work out the path of the data directory.

C:\Users\admin\mongodb
