In [54]:
import os
import re
from datetime import datetime
import json
from dotenv import load_dotenv

import pandas as pd
import numpy as np
import worldnewsapi
from worldnewsapi.rest import ApiException
from bson import ObjectId

from functions.misc import NewsAPI_fetch, extract_author_names, extract_website_name, extract_categories

In [71]:
import torch

# Probe CUDA once and reuse the answer for both the report and the device choice.
cuda_available = torch.cuda.is_available()

# Report the PyTorch build and whether CUDA can be used.
print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_available)

# GPU details are only queried when CUDA is present.
if cuda_available:
    print("CUDA version:", torch.version.cuda)
    print("Number of GPUs:", torch.cuda.device_count())
    print("GPU Name:", torch.cuda.get_device_name(0))

# Device handle used later when encoding texts.
device = torch.device("cuda") if cuda_available else torch.device("cpu")

PyTorch version: 2.0.1+cu118
CUDA available: True
CUDA version: 11.8
Number of GPUs: 1
GPU Name: NVIDIA GeForce GTX 1080 Ti


## Fetch new Records

In [49]:
import time
from uuid import uuid1
from dotenv import load_dotenv
from config.database import MongoDatabaseConnector
from pinecone import Pinecone
# to test asyn requests in upsert
# from pinecone.grpc import PineconeGRPC as Pinecone
# logging
from tqdm.auto import tqdm
from functions.logger import log_item_id, log_record_id_checker
# models
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone_text.sparse import BM25Encoder

# Read settings from the process environment after load_dotenv() has run.
load_dotenv()
MONGO_URI, MONGO_DB_NAME, PINECONE_API_KEY = (
    os.getenv(var_name)
    for var_name in ('MONGO_URI', 'MONGO_DB_NAME', 'PINECONE_API_KEY')
)

### Connections
## Pinecone: client and handle to the 'newsrag' index
index_name = 'newsrag'
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)
## MongoDB: connector -> database -> 'articles' collection
connection = MongoDatabaseConnector(MONGO_URI)
db = connection[MONGO_DB_NAME]
collection = db['articles']

### Models
# Tokenizer used only to measure chunk lengths in tokens (see tokenizer_len).
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("DeepMount00/Llama-3-8b-Ita")
# create the length function
def tokenizer_len(text):
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``."""
    return len(tokenizer.encode(text))
# Splitter whose chunk size and overlap are measured with tokenizer_len.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tokenizer_len,
    chunk_overlap=15,
    chunk_size=300,
)
# Dense sentence-embedding model.
embedder = SentenceTransformer('nickprock/sentence-bert-base-italian-xxl-uncased')
# Sparse BM25 encoder with the library's default parameters.
# TODO: optionally load parameters fitted on this corpus instead.
sparse_embedder = BM25Encoder().default()


Connection enstablished


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Execution time: 21.10453224182129
Data uploaded successfully!


In [5]:
from functions.misc import NewsAPI_fetch

# NEWSAPI_API_KEY is not assigned anywhere in the visible notebook, so on a fresh
# kernel this cell would fail with NameError. Read it from the environment like
# the other credentials. load_dotenv() is repeated here so the cell does not
# depend on the configuration cell having run first.
# NOTE(review): the environment variable name 'NEWSAPI_API_KEY' is assumed - confirm against .env.
load_dotenv()
NEWSAPI_API_KEY = os.getenv('NEWSAPI_API_KEY')
if not NEWSAPI_API_KEY:
    raise RuntimeError("NEWSAPI_API_KEY is not set; add it to the environment or the .env file")

# Fetch articles for the given date window; the result feeds pd.DataFrame(...) below.
filtered_news_dicts = NewsAPI_fetch(max_results=2,
                                    NEWSAPI_API_KEY=NEWSAPI_API_KEY,
                                    min_date="2023-01-20", max_date="2023-01-20")

Retrieved 0 articles. Offset: 0/2. Total available: 0.
Retrieved 0 articles. Offset: 100/2. Total available: 0.
Retrieved 0 articles. Offset: 200/2. Total available: 0.
Retrieved 0 articles. Offset: 300/2. Total available: 0.
Retrieved 0 articles. Offset: 400/2. Total available: 0.
Retrieved 0 articles. Offset: 500/2. Total available: 0.
Retrieved 0 articles. Offset: 600/2. Total available: 0.
Retrieved 0 articles. Offset: 700/2. Total available: 0.
Retrieved 0 articles. Offset: 800/2. Total available: 0.
Retrieved 0 articles. Offset: 900/2. Total available: 0.
Retrieved 0 articles. Offset: 1000/2. Total available: 0.
Retrieved 0 articles. Offset: 1100/2. Total available: 0.
Retrieved 0 articles. Offset: 1200/2. Total available: 0.
Retrieved 0 articles. Offset: 1300/2. Total available: 0.
Retrieved 0 articles. Offset: 1400/2. Total available: 0.
Retrieved 0 articles. Offset: 1500/2. Total available: 0.
Retrieved 0 articles. Offset: 1600/2. Total available: 0.
Retrieved 0 articles. Offs

## Elaborazione JSON ricevuto

In [8]:
df = pd.DataFrame(filtered_news_dicts)
if df.empty:
    # An empty fetch yields a frame without columns, so df['url'] / df['authors']
    # would raise KeyError. Skip the enrichment instead of crashing.
    print("No articles fetched; skipping column enrichment.")
else:
    # Apply extract_categories to each URL and store the result in 'category'
    df['category'] = df['url'].apply(extract_categories)
    # Publisher: apply extract_website_name to each URL
    df['publisher'] = df['url'].apply(extract_website_name)
    # Authors: replace the raw 'authors' value with extract_author_names' output
    df['authors'] = df['authors'].apply(extract_author_names)

In [47]:
# One dict per DataFrame row, ready for insertion into the collection.
data = df.to_dict("records")

## update MongoDB

In [None]:
### DATA STORAGE
# Insert the prepared records into the collection and report the elapsed time.
# insert_many rejects an empty document list, so an empty `data` is skipped
# instead of raising (and instead of falsely reporting a successful upload).
if data:
    start_time = time.time()
    collection.insert_many(data)  # if csv: data.to_dict(orient="records")
    ## delete all records : collection.delete_many(filter={})
    end_time = time.time()

    execution_time = end_time - start_time
    print(f"Execution time: {execution_time}")
    print("Data uploaded successfully!")
else:
    print("No records to upload; skipping insert.")


## update vectorDB

In [72]:
def upsert_vectorDB(collection, index, text_splitter, embedder, sparse_embedder, batch_article_limit=3, log_processed_path='logs/ids_updated_vecDB.log'):
    """Chunk, embed and upsert the articles that are not yet in the processed-ids log.

    Articles whose ``_id`` is not returned by ``log_record_id_checker`` are
    read from ``collection`` in batches. Each article's ``text`` is split into
    chunks, every chunk gets a dense and a sparse embedding, the vectors are
    upserted into ``index``, and one log entry per chunk is written with
    ``log_item_id``.

    Args:
        collection: object supporting ``find(query).limit(n)`` that yields
            dict-like documents with an ``_id`` and a ``text`` field.
        index: object exposing ``upsert(vectors)``.
        text_splitter: object exposing ``split_text(text)``.
        embedder: object exposing ``encode(texts, device=...)``.
        sparse_embedder: object exposing ``encode_documents(texts)``.
        batch_article_limit: maximum number of articles fetched per batch;
            must be at least 1.
        log_processed_path: path of the processed-ids log that is read at the
            start and appended to after every upsert.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: if ``batch_article_limit`` is smaller than 1.

    Note:
        The encoding device is taken from the module-level ``device`` variable.
    """
    if batch_article_limit < 1:
        raise ValueError("batch_article_limit must be a positive integer")

    # Ids logged by earlier runs. The log is read once; ids handled during this
    # run are tracked in memory instead of re-reading the file on every batch.
    # (presumably the checker returns ObjectId strings - same usage as before)
    excluded_ids = {ObjectId(oid) for oid in log_record_id_checker(log_processed_path)}
    total_processed = 0

    while True:
        query = {'_id': {'$nin': list(excluded_ids)}}
        data = list(collection.find(query).limit(batch_article_limit))
        if not data:
            break

        # Exclude every fetched article from later batches, including those that
        # yield no chunks. Such articles are never written to the log, so without
        # this they would be re-fetched on every batch and could starve the loop.
        excluded_ids.update(record['_id'] for record in data)

        texts = []
        metadatas = []
        for record in tqdm(data):
            doc_id = str(record['_id'])
            # A missing or None 'text' is treated as empty rather than crashing the splitter.
            record_texts = text_splitter.split_text(record.get('text') or '')
            texts.extend(record_texts)
            metadatas.extend(
                {"chunk_no": chunk_no, "context": chunk, 'doc_id': doc_id}
                for chunk_no, chunk in enumerate(record_texts)
            )

        if texts:
            ids = [str(uuid1()) for _ in texts]
            embeds = embedder.encode(texts, device=device)
            sparse_embeds = sparse_embedder.encode_documents(texts)
            vectors = [
                {
                    'id': chunk_id,
                    'sparse_values': sparse,
                    'values': dense,
                    'metadata': chunk_metadata,
                }
                for chunk_id, sparse, dense, chunk_metadata
                in zip(ids, sparse_embeds, embeds, metadatas)
            ]
            # Store the whole batch in the vector index.
            # NOTE(review): async_req=True was considered here; the original author
            # believed it would require upserting one vector at a time - unverified.
            index.upsert(vectors)

            # Log only after a successful upsert so a failed batch is retried next run.
            for vec in vectors:
                log_item_id(item_id=str(vec['metadata']["doc_id"]),
                            chunk_no=str(vec['metadata']["chunk_no"]),
                            chunk_id=str(vec['id']),
                            log_file_path=log_processed_path)

        total_processed += len(data)
        print(f"Total number of articles processed in this batch: {len(data)}")

    # Sum of the per-batch counts (the previous formula over-/under-reported).
    print(f"Total number of articles processed: {total_processed}")
    print("Update complete")

In [73]:
# Run the vector-index update with 100 articles per batch.
upsert_vectorDB(
    collection=collection,
    index=index,
    text_splitter=text_splitter,
    embedder=embedder,
    sparse_embedder=sparse_embedder,
    batch_article_limit=100,
    log_processed_path='logs/ids_updated_vecDB.log',
)

100%|██████████| 100/100 [00:02<00:00, 38.51it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  3.76it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.19it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.74it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 46.95it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  3.70it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.11it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.57it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.79it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.48it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.56it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.24it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.50it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.45it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.78it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.23it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.87it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.12it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.95it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.72it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 71.79it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.71it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 44.69it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.66it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 76.81it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  3.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.62it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.33it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.82it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.94it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 36.36it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.45it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.79it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 38.94it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.39it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 33.92it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 42.65it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.54it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 64.04it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.39it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 39.33it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.97it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 44.97it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.38it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.17it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.92it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 40.11it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.58it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.99it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.65it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.48it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.97it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 81.71it/s]
Batches: 100%|██████████| 6/6 [00:01<00:00,  3.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.01it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.87it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.82it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.94it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.61it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 40.81it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.14it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.07it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 37.41it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.42it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 34.48it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.74it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 66.70it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.38it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.63it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.52it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 63.08it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.59it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.86it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.67it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.88it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 64.30it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.38it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.73it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.28it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  3.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 45.82it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.92it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 35.57it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.53it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.42it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 40.16it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.76it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 45.21it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.27it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.48it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.22it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.33it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.00it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.58it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.19it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 39.16it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.05it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.12it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.02it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.95it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.47it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.16it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.45it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.66it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 38.40it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.83it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 35.80it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.68it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.13it/s]
Batches: 100%|██████████| 8/8 [00:02<00:00,  3.80it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.25it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.99it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.18it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  3.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.71it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  3.67it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 69.18it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.07it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 46.26it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  3.66it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 40.64it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.62it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 68.52it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.97it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.35it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.00it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 64.77it/s]
Batches: 100%|██████████| 7/7 [00:02<00:00,  3.45it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 41.13it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.04it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.62it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.90it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 72.26it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.86it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.58it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.10it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.15it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.79it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.41it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.24it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.24it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.46it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.04it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.05it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.55it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.03it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.08it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.51it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.22it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.77it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.20it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.43it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 71.71it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.57it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 68.04it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.71it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.90it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.49it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 65.44it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.20it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.95it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.80it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.27it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.51it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.59it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.15it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.69it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 66.55it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.93it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 71.13it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.40it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.03it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.15it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.32it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.05it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.47it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.21it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.01it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.25it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 46.01it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.26it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 76.56it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.40it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.66it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.98it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.86it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.07it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.70it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.03it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.50it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.30it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.34it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.18it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.35it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.99it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.49it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.76it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.47it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.70it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.50it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.26it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.84it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.04it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.81it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.57it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.21it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.66it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.81it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.69it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.55it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.04it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.53it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.67it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.55it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.18it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.51it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.99it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.54it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.94it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.10it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.59it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.18it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.35it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.45it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.83it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.02it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.37it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.22it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 63.19it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.25it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.65it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.90it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.76it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.71it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.29it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.05it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  5.17it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.00it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.09it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 66.54it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.58it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.46it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.28it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.78it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.10it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.94it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.96it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 65.94it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.45it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.42it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.00it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.25it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.45it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.72it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.46it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 68.58it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.82it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 44.04it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.79it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.71it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.88it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.34it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.08it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.73it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.81it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.90it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.82it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.01it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.05it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.83it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.11it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.20it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.94it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.33it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.81it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.59it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.44it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.13it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.99it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.94it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.52it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.16it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.21it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.28it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.36it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.04it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.56it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.86it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.33it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 65.94it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.76it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 47.82it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.72it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.71it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.78it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.12it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.04it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.30it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.79it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.64it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.09it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.58it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.02it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 40.34it/s]
Batches: 100%|██████████| 11/11 [00:02<00:00,  4.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.01it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.79it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.51it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.40it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 63.56it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.17it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.80it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.22it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 43.77it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.92it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.08it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.65it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 46.23it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.58it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.73it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.88it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.11it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.98it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.31it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.53it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.84it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 71.06it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.92it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.67it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.74it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.61it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.94it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.46it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.01it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.97it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.74it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.35it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.85it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.20it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.24it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.92it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.84it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.89it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.06it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.95it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.74it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 44.21it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.87it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.71it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.96it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.96it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.69it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.16it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.20it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.29it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.28it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.62it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.67it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.78it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.07it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.73it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.32it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.20it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.45it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.81it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.37it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.19it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.51it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.92it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.85it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.70it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.47it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.85it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.17it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.24it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 47.97it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.10it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 47.93it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.76it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.69it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.82it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.10it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 39.49it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.14it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.27it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.06it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 42.12it/s]
Batches: 100%|██████████| 10/10 [00:02<00:00,  4.35it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.51it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.49it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.13it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.32it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.51it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.15it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.03it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.70it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.31it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.37it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.94it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 50.42it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.07it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 64.28it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.02it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.30it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.41it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.21it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.41it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.11it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 68.81it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.70it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.92it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.71it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.41it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.19it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.38it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.61it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 46.37it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.52it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 67.72it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  3.97it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.18it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.85it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.20it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.18it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.42it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.10it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.81it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.83it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.75it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.35it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.00it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.13it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.65it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.36it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 57.24it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.77it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.26it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.39it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 53.14it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.62it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.82it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.27it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.60it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 56.95it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.63it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 37.51it/s]
Batches: 100%|██████████| 11/11 [00:02<00:00,  4.63it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 35.58it/s]
Batches: 100%|██████████| 12/12 [00:02<00:00,  4.39it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.65it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.39it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.74it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.16it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 64.65it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.88it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.85it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.98it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.26it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.03it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 47.83it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 55.91it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.62it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.04it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.25it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.21it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 60.98it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.33it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 58.81it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.02it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.27it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.83it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.18it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.96it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 45.92it/s]
Batches: 100%|██████████| 9/9 [00:02<00:00,  4.41it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.94it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.18it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 59.93it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.83it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 63.40it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.67it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 54.48it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.78it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 65.04it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.68it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 51.74it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.05it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 52.22it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  5.37it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 48.94it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.75it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:02<00:00, 49.18it/s]
Batches: 100%|██████████| 9/9 [00:01<00:00,  4.96it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 62.74it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.98it/s]


Total number of articles processed in this batch: 100


100%|██████████| 100/100 [00:01<00:00, 61.06it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  5.22it/s]


Total number of articles processed in this batch: 100


100%|██████████| 1/1 [00:00<00:00, 99.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 86.89it/s]


Total number of articles processed in this batch: 1
Total number of articles processed: 26201
Update complete


In [68]:
# Display the Pinecone index statistics (dimension, fullness, per-namespace vector counts).
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1}

# Remove records

In [37]:
to_delete_items = ["665f1222756e8549ca815604", "665f1222756e8549ca815601"]
# MongoDB delete operation: wrap each hex string in an ObjectId before querying on `_id`.
Obj_tbd = [ObjectId(x) for x in to_delete_items]
# `$in` matches every document whose `_id` is one of the listed ids. Passing the bare
# list would compare `_id` against the whole array and therefore match nothing.
query = {'_id': {'$in': Obj_tbd}}
# `Collection.remove` was deprecated in PyMongo 3 and removed in PyMongo 4;
# `delete_many` is the supported replacement.
delete_result = collection.delete_many(query)
# Show how many documents were actually removed.
delete_result.deleted_count

In [None]:
# Pinecone delete synchronization
# Parse the upsert log to collect the chunk (vector) ids that belong to the
# deleted articles, delete those vectors from Pinecone, then drop the matching
# entries from the log. `json` is imported in the notebook's first cell.
filepath='logs/ids_updated_vectorDB.log'
# Chunk ids of the vectors to be removed from the index
record_ids_tbr = []
# Log lines that must survive the rewrite
kept_lines = []

# Step 1: read and parse the whole log BEFORE touching it on disk, so a parsing
# error cannot leave a truncated log file behind.
with open(filepath, 'r') as fr:
    for line in fr:
        if "INFO:root" in line:
            # Locate the JSON payload embedded in the log line
            start_pos = line.find("{")
            end_pos = line.rfind("}") + 1
            log_data = json.loads(line[start_pos:end_pos])
            if log_data['record_ID'] in to_delete_items:
                record_ids_tbr.append(log_data['chunk_id'])
                continue
        # Entries for other records, and lines that are not INFO entries,
        # are preserved unchanged.
        kept_lines.append(line)

if record_ids_tbr:
    # Step 2: delete the vectors first; if this call fails the log still holds
    # the chunk ids, so the cell can simply be re-run.
    index.delete(ids=record_ids_tbr)
    # Step 3: only now rewrite the log without the removed entries.
    with open(filepath, 'w') as fw:
        fw.writelines(kept_lines)

In [56]:
# Display the Pinecone index statistics to inspect the current vector counts.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}