In [1]:
import json
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [2]:
# === Configuración ===
# json_file_path = "arxiv-metadata-oai-snapshot.json"
batch_size = 2000
max_workers = 4  # Ajusta según tu CPU y MongoDB
archivos = ["arxiv-part-aa", "arxiv-part-ab", "arxiv-part-ac", "arxiv-part-ad", "arxiv-part-ae", "arxiv-part-af"]

In [3]:
# === Conectar a MongoDB ===
client = MongoClient("mongodb://localhost:27017/")
db = client["arxiv_db"]
collection = db["articles"]

In [4]:
# === Función para insertar lote ===
def insert_batch(batch):
    try:
        collection.insert_many(batch, ordered=False)
    except Exception as e:
        print("Error al insertar batch:", e)

In [5]:
# === Contar líneas para tqdm ===
def count_lines(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)
total_lines = 0
for i in archivos:
    total_lines += count_lines(i)


In [None]:
# === Lectura + carga paralela ===
for archivo in archivos:
    print(f"📂 Cargando {archivo}...")
    with open(archivo, 'r', encoding='utf-8') as f:
        batch = []
        futures = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for line in tqdm(f, desc=f"Cargando {archivo}"):
                try:
                    record = json.loads(line)
                    record["pdf_source"] = f"https://arxiv.org/pdf/{record['id']}"
                    batch.append(record)
                    if len(batch) >= batch_size:
                        # Ejecutar carga en paralelo
                        future = executor.submit(insert_batch, batch)
                        futures.append(future)
                        batch = []
                except json.JSONDecodeError:
                    continue

            # Cargar el último batch
            if batch:
                futures.append(executor.submit(insert_batch, batch))

        # Esperar que terminen todos los batches de este archivo
        for future in futures:
            future.result()
    print(f"✅ {archivo} cargado completamente.\n")

print("🎉 Todos los archivos fueron cargados.")

📂 Cargando arxiv-part-aa...


Cargando arxiv-part-aa: 500000it [00:15, 31866.48it/s]


Error al insertar batch: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6847b37fae4431c716338c19, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>
Error al insertar batch: localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 6847b37fae4431c716338c19, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>


In [13]:
'''
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018/")
db = client["arxiv_db"]
collection = db["articles"]

# ⚠️ Eliminar todos los documentos
collection.delete_many({})
print("🗑️ Todos los documentos eliminados.")
'''

🗑️ Todos los documentos eliminados.
