In [1]:
from src.utils.dotenv_utils import FromFileConfigGenerator, load_config
from src.defaults import DEFAULT_ENV_FILE
from src.utils.gcs_utils import GCSClientGenerator, GCSConfig
from src.model.answers_generation import OpenAIConfig

In [2]:
from pydantic import BaseModel
import json


def print_pydantic_model(obj: BaseModel):
    """Pretty-print a pydantic model as JSON indented by four spaces.

    The model is dumped with ``mode="json"`` so that field values are
    converted to JSON-compatible types before being handed to ``json.dumps``.
    A plain ``model_dump()`` keeps Python objects (for example ``datetime``
    values) as they are, which makes ``json.dumps`` raise ``TypeError``.

    Args:
        obj: The pydantic model instance to display.
    """
    print(json.dumps(obj.model_dump(mode="json"), indent=4))

In [3]:
# Bucket this notebook works against.
BUCKET_NAME = "bucket-optimusprime"

# One config getter, constructed from DEFAULT_ENV_FILE, feeds both config loads.
configs_getter = FromFileConfigGenerator(DEFAULT_ENV_FILE)
read_config = configs_getter.get_config
gcs_config: GCSConfig = load_config(GCSConfig, read_config)
openai_config: OpenAIConfig = load_config(OpenAIConfig, read_config)

# Build the storage client from the GCS config, then look the bucket up by name.
client_generator = GCSClientGenerator(gcs_config)
storage_client = client_generator.get_client()
bucket = storage_client.get_bucket(BUCKET_NAME)

In [22]:
from src.model.files.gcs import GCSFile


# Wrap every blob returned by the bucket listing in a GCSFile and report the count.
bucket_blobs = list(map(GCSFile.from_blob, bucket.list_blobs()))
print(len(bucket_blobs))

800


In [23]:
# Content types a blob must have to be kept in the listing.
PDF_CONTENT_TYPES = ["application/pdf"]

# List the bucket again and rebind bucket_blobs to only the matching blobs.
bucket_blobs = [
    GCSFile.from_blob(blob)
    for blob in bucket.list_blobs()
    if blob.content_type in PDF_CONTENT_TYPES
]

In [24]:
# Show how many entries bucket_blobs currently holds.
print(len(bucket_blobs))

798


In [7]:
from openai import OpenAI


# OpenAI client built from the API key and organization id held in openai_config.
openai_client = OpenAI(
    api_key=openai_config.OPENAI_API_KEY,
    organization=openai_config.OPENAI_ORG_ID,
)


In [8]:
from src.ingestion.db_manager import VectorStoreFilesDB
from src.utils.drive_utils import DriveConfig, DriveCredentials, ServiceGenerator

# Literal identifiers handed to VectorStoreFilesDB below. Presumably the Google
# Sheets spreadsheet id and the tab name; confirm against VectorStoreFilesDB.
VS_FILES_SPREADSHEET_ID = "1XAhPXBsAecJUiyI13l6qtiI-iuITA4XjyDI11BLmGDo"
VS_FILES_SHEET_NAME = "VectorStore"

# Drive config -> credentials -> service generator -> sheet service.
drive_config = load_config(DriveConfig, configs_getter.get_config)
drive_creds = DriveCredentials(drive_config)
drive_service_generator = ServiceGenerator(drive_creds)
sheet_service = drive_service_generator.get_sheet_service()

vs_files_db = VectorStoreFilesDB(sheet_service, VS_FILES_SPREADSHEET_ID, VS_FILES_SHEET_NAME)

In [21]:
from src.ingestion.db_manager import VectorStoreFileInfo


# Run the upload flow on a single entry of the blob listing.
test_blob = bucket_blobs[6]

# Last "/"-separated segment of the object name; also recorded as source_id below.
file_name = test_blob.name.split("/")[-1]

# Download the object's content from the bucket.
source_blob = bucket.blob(test_blob.name)
file_bytes = source_blob.download_as_bytes()

# Folder = the blob id with its last two "/"-separated segments dropped.
id_segments = test_blob.id.split("/")
file_folder = "/".join(id_segments[:-2])

# Upload the bytes to OpenAI under the file name with a ".pdf" suffix appended.
upload_name = file_name + ".pdf"
vs_file = openai_client.files.create(file=(upload_name, file_bytes), purpose="assistants")

# Record the uploaded file's id together with where it came from.
vs_file_info = VectorStoreFileInfo(
    id=vs_file.id,
    source_id=file_name,
    source="gcs",
    folder_id=file_folder,
    last_modified=test_blob.updated,
)
vs_files_db.write(vs_file_info)

In [13]:
# Scratch check of slice semantics: [:-1] keeps everything except the last item.
a = list(range(1, 6))
a[:-1]

[1, 2, 3, 4]

In [17]:
# Display the id of the blob chosen for the test upload. The value shown below is a
# "/"-separated string starting with the bucket name and ending in a numeric segment
# (presumably the object generation; confirm against GCSFile / the GCS client).
test_blob.id

'bucket-optimusprime/datacore/1-Tkk0P8e4OHE2IgvqBaMvjPzfcvvYuNwbSnR42TBb3A/1734778836618332'