In [1]:
import datetime
import os
from pathlib import Path

from dotenv import load_dotenv
from pydantic import (
    AnyHttpUrl,
    BaseModel,
    ConfigDict,
    field_serializer,
)
from S3_helper.helper import S3

from rag_data_uploader import OpensearchUploader
from rag_data_uploader.utils.mappings import OSBaseMapping
from rag_data_uploader.utils.parsing_models import RegenXBaseParsingModel, metadata_field

In [2]:
# Read the MinIO credentials from the environment after loading dotenv settings.
# Either value is None when its variable is not set.
_ = load_dotenv()
access_key, secret_key = (
    os.getenv(variable) for variable in ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY")
)

In [3]:
# Temporary parsing model for documents that lack the chapter_title metadata field.
class RegenXParsingModelCPI(RegenXBaseParsingModel):
    """RegenX CPI parsing model without the ``chapter_title`` field.

    Every attribute is bound to the metadata key of the same name through
    ``metadata_field``. The three URL attributes are typed as ``AnyHttpUrl``
    and are converted back to plain strings on serialization.
    """

    class ContentFromLibraries(BaseModel):
        """A single entry of the ``content_from_libraries`` list."""

        identity: str
        title: str
        date: datetime.date

        @field_serializer("date")
        def serialize_date(self, value: datetime.date) -> str:
            """Serialize the date as its string form."""
            return str(value)

    source: str = metadata_field("source")
    document_number: str = metadata_field("document_number")
    identity: str = metadata_field("identity")
    # chapter_title (together with its validator, which unwrapped single-element
    # lists and rejected longer ones) is intentionally left out of this model.
    revision: str = metadata_field("revision")
    title: str = metadata_field("title")
    document_type: str | None = metadata_field("document_type")
    category_tree: str = metadata_field("category_tree")
    document_url: AnyHttpUrl = metadata_field("document_url")
    library_url: AnyHttpUrl = metadata_field("library_url")
    external_document_url: AnyHttpUrl = metadata_field("external_document_url")
    content_from_libraries: list[ContentFromLibraries] = metadata_field("content_from_libraries")
    eridoc_document_number: str = metadata_field("eridoc_document_number")

    @field_serializer("document_url", "library_url", "external_document_url")
    def serialize_url(self, value: AnyHttpUrl) -> str:
        """Serialize a URL field as a plain string."""
        return str(value)

In [4]:
# Special mapping class which excludes chapter_title
class OSRegenXMappingCPI(OSBaseMapping):
    """Index mapping that mirrors the fields of the CPI parsing model, minus ``chapter_title``.

    Each attribute's default value describes how that field is mapped; every
    metadata field here uses ``"keyword"``.
    """

    # validate_default=True: the defaults below are validated too;
    # extra="ignore": unknown fields passed to the model are dropped.
    model_config = ConfigDict(validate_default=True, extra="ignore")

    # NOTE(review): presumably the embedding vector dimension — confirm against OSBaseMapping.
    embedding: int = 1536
    identity: str = "keyword"
    source: str = "keyword"
    document_number: str = "keyword"
    revision: str = "keyword"
    title: str = "keyword"
    document_type: str = "keyword"
    category_tree: str = "keyword"
    document_url: str = "keyword"
    library_url: str = "keyword"
    external_document_url: str = "keyword"
    # Nested object: one mapping entry per attribute of a content_from_libraries item.
    content_from_libraries: dict[str, str] = {
        "identity": "keyword",
        "title": "keyword",
        "date": "keyword",
    }
    eridoc_document_number: str = "keyword"

### We use the S3 client class to download the data to upload from MinIO. First, we specify the date of the data snapshot that should be downloaded. Note that the date used below might be quite outdated by the time you read this.

### After this, we use the client to download whatever files are not already present on the machine, and extract the folders containing the JSON objects that should be uploaded to the vector store.

In [5]:
# Snapshot date to process; it is used both as the "rag/<date>/" prefix in the
# bucket and as the name of the local data folder.
date = "2024-07-04"
s3_client = S3(
    access_key=access_key,
    secret_key=secret_key,
)

In [6]:
# List everything under the dated prefix of the "sandbox" bucket, then keep the
# distinct parent folders whose last path component is "AZOpenAI".
files = list(s3_client.list_files(f"rag/{date}/", "sandbox"))
embedding_folders = list(
    {
        parent
        for parent in (Path(key).parent for key in files)
        if parent.name == "AZOpenAI"
    }
)

In [7]:
# Display the embedding folders found in the bucket.
embedding_folders

[PosixPath('rag/2024-07-04/mini-link_6651/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/power/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6654/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/rbs_series/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6291/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6691/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/router_6000/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/radio_dot_system/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6366/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6693/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/network_synchornization/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6692/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6371/embedding_output/AZOpenAI'),
 PosixPath('rag/2024-07-04/mini-link_6262/embedding_output/AZOpenAI'),
 PosixPath(

In [8]:
# Restrict this run to the "radio" folder, replacing the list discovered in the bucket.
# The path is built from `date` so it follows the configured snapshot date instead of
# repeating it as a literal.
embedding_folders = [Path(f"rag/{date}/radio/embedding_output/AZOpenAI")]

In [9]:
# Download the selected embedding folders from the "sandbox" bucket into
# ../data/<date>, skipping folders whose top-level name already exists locally.
output_dir = Path(f"../data/{date}")
# parents=True lets this work when ../data does not exist yet; exist_ok=True
# makes the cell safe to re-run.
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: a folder counts as already downloaded as soon as an entry with the same
# name exists locally, so a partially downloaded folder is not fetched again.
existing_folders = [entry.name for entry in output_dir.glob("*")]
folders_to_download = [
    folder for folder in embedding_folders if folder.parts[2] not in existing_folders
]

for folder in folders_to_download:
    # Drop the leading "rag/<date>" components so the rest of the remote layout
    # is mirrored below output_dir.
    output_folder = output_dir.joinpath(*folder.parts[2:])
    s3_client.download_files(
        s3_bucket_name="sandbox", s3_folder_name=str(folder), local_folder_name=output_folder
    )

In [10]:
# Recursively collect every local path below output_dir whose name ends in "AZOpenAI".
folders_to_upload = list(Path(output_dir).rglob("*AZOpenAI"))

In [11]:
# Display the local folders that will be uploaded.
folders_to_upload

[PosixPath('../data/2024-07-04/radio/embedding_output/AZOpenAI')]

### Once the relevant data has been downloaded and extracted, we use the uploader class from the `rag_data_uploader` library to upload the documents to the vector store. Note that this requires that the vector store is either running locally and reachable on port 9200, or that we have port-forwarded the vector store to this port. Also note that you need to specify the credentials for the vector store.

In [12]:
# Connection settings for the vector store. Values come from the environment so
# that real credentials never need to be written into the notebook; the
# fallbacks are the previous hard-coded values.
url = os.getenv("OPENSEARCH_URL", "https://localhost:9200")
username = os.getenv("OPENSEARCH_USERNAME", "admin")
password = os.getenv("OPENSEARCH_PASSWORD", "admin")

In [13]:
# Build the index mapping and the uploader. Creating the uploader prints the
# target URL and the response received from the document store.
mapping = OSRegenXMappingCPI()
uploader = OpensearchUploader(
    url,
    username,
    password,
    allow_missing_fields=False,
)

URL:  https://localhost:9200
Response from document store at URL https://localhost:9200:  OK


In [None]:
# Upload every downloaded folder to its own dated index, passing the undated
# name as the alias.
for folder in folders_to_upload:
    # First path component below output_dir (e.g. "radio"). Resolved relative to
    # output_dir instead of by absolute position, so it does not depend on how
    # deep output_dir itself is.
    folder_name = folder.relative_to(output_dir).parts[0]
    index = f"regenx_data_{folder_name}_{date}"
    alias = f"regenx_data_{folder_name}"

    print("Uploading: ", folder_name)
    uploader.upload_from_folder(
        folder,
        index,
        mapping=mapping,
        parsing_model=RegenXParsingModelCPI,
        alias=alias,
        threading=False,
        # NOTE(review): fractional batch_size — its meaning is defined by
        # rag_data_uploader; confirm before changing.
        batch_size=0.9,
    )

### After all of this, you should have successfully uploaded all of your data to the vector store. If any errors were encountered, they will be printed out at the end of the upload process, and a JSON file called `upload_errors.json` will be saved in the folder from which this notebook is run.

In [15]:
import httpx

In [16]:
# Query the store for everything matching "regenx*", reusing the uploader's
# request settings.
res = httpx.get(
    f"{uploader.url}/regenx*",
    **uploader.req_kwargs,
)

In [17]:
# Top-level keys of the JSON response, i.e. the names of the matching indices.
list(res.json().keys())

['regenx_data_5g_plugins_2024-05-06',
 'regenx_data_antenna_system_2024-05-06',
 'regenx_data_baseband_radio_node_cpi_24.17_2024-04-30',
 'regenx_data_enclosure_2024-05-06',
 'regenx_data_ericsson_network_connection_2024-05-06',
 'regenx_data_fronthaul_2024-05-06',
 'regenx_data_integrated_site_solutions_2024-05-06',
 'regenx_data_lte_ran_cpi_24.17_2024-04-30',
 'regenx_data_mini-link_6251_2024-05-06',
 'regenx_data_mini-link_6252_2024-05-06',
 'regenx_data_mini-link_6262_2024-05-06',
 'regenx_data_mini-link_6291_2024-05-06',
 'regenx_data_mini-link_6366_2024-05-06',
 'regenx_data_mini-link_6371_2024-05-06',
 'regenx_data_mini-link_6651_2024-05-06',
 'regenx_data_mini-link_6654_2024-05-06',
 'regenx_data_mini-link_6655_2024-05-06',
 'regenx_data_mini-link_6691_2024-05-06',
 'regenx_data_mini-link_6692_2024-05-06',
 'regenx_data_mini-link_6693_2024-05-06',
 'regenx_data_mini-link_6694_2024-05-06',
 'regenx_data_network_synchornization_2024-05-06',
 'regenx_data_nr_ran_cpi_24.17_2024-04-

In [None]:
# Destructive: issues a DELETE for the "regenx_data_radio" alias on the
# 2024-05-06 radio index.
res = httpx.delete(
    f"{uploader.url}/regenx_data_radio_2024-05-06/_alias/regenx_data_radio",
    **uploader.req_kwargs,
)

In [18]:
# Manually set the uploader's index and alias for the inspection cells below.
uploader.index = "regenx_data_radio_2024-05-06"
uploader.alias = "regenx_data_radio"

In [25]:
# Calls a private uploader helper directly. NOTE(review): judging by the next
# cell it populates uploader.indices_to_replace — confirm against rag_data_uploader.
res = uploader._find_alias_indices()

In [26]:
# Display the uploader's indices_to_replace attribute.
uploader.indices_to_replace

['regenx_data_radio_2024-07-04']

In [22]:
from rag_data_uploader.utils.regexes import INDEX_REGEX

In [24]:
# Show the part of the index name matched by INDEX_REGEX.
# NOTE(review): assuming INDEX_REGEX is a compiled `re` pattern, match() returns
# None on no match and the [0] subscript then raises TypeError.
INDEX_REGEX.match(uploader.index)[0]

'regenx_data_radio'