In [None]:
from azure.storage.blob import BlobServiceClient, ContainerClient, BlobClient
import os
from tika import parser
import pdfplumber
import io
from pymongo import MongoClient
import os
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient


    


In [None]:
from azure.cosmos import exceptions, CosmosClient, PartitionKey
def get_secret(keyvault_name="chatkeys", secret_name="openaiKey"):
    """Fetch the value of a secret stored in an Azure Key Vault.

    Args:
        keyvault_name: Vault name; the vault URL is built as
            ``https://<keyvault_name>.vault.azure.net``.
        secret_name: Name of the secret to read from that vault.

    Returns:
        The ``value`` attribute of the secret object returned by the vault.
    """
    vault_url = f"https://{keyvault_name}.vault.azure.net"
    # DefaultAzureCredential is created fresh on every call, as is the client.
    vault = SecretClient(vault_url=vault_url, credential=DefaultAzureCredential())
    return vault.get_secret(secret_name).value
  
def get_cosmosdb_keys(resourceGroup,cosmosdb_name):
    """Collect connection details for a Cosmos DB account via the Azure CLI.

    Each value is the stripped standard output of an ``az`` command (most of
    them piped through ``jq``), so both executables must be reachable from the
    shell. A failing command is not detected here: its entry is simply
    whatever the command wrote to stdout, typically an empty string.

    Args:
        resourceGroup: Name of the resource group that holds the account.
        cosmosdb_name: Name of the Cosmos DB account.

    Returns:
        dict with the keys ``cosmosDbEndpoint_url``, ``masterkey``,
        ``database_name``, ``collection_name`` and ``connecting_string``.
        ``database_name`` and ``collection_name`` are the first entries the
        CLI lists (``.[0].id``).
    """
    import shlex  # function-scope import: not provided by the notebook's import cell

    # Quote everything interpolated into a shell command so names containing
    # spaces or shell metacharacters cannot break or alter the command.
    rg = shlex.quote(resourceGroup)
    account = shlex.quote(cosmosdb_name)

    def _run(command):
        """Run a shell command and return its stripped stdout."""
        return os.popen(command).read().strip()

    cosmosDbEndpoint_url = _run(
        f"az cosmosdb show --resource-group {rg} --name {account}"
        " --query 'writeLocations[].documentEndpoint' -o tsv"
    )
    # `jq -r` emits the raw key. The previous `--query primaryMasterKey` call
    # had no raw output mode, so the key came back wrapped in JSON double
    # quotes, which is not a usable credential.
    masterkey = _run(
        f"az cosmosdb keys list --name {account} --resource-group {rg}"
        " | jq -r '.primaryMasterKey'"
    )
    database_name = _run(
        f"az cosmosdb database list --name {account} --resource-group {rg}"
        " | jq -r '.[0].id'"
    )
    # database_name is CLI output, so it is quoted as well before reuse.
    collection_name = _run(
        f"az cosmosdb collection list --name {account}"
        f" --db-name {shlex.quote(database_name)} --resource-group {rg}"
        " | jq -r '.[0].id'"
    )
    # jq prints this one as a JSON string; the quotes are removed afterwards.
    connecting_string = _run(
        f"az cosmosdb keys list --type connection-strings --resource-group {rg}"
        f" --name {account} | jq '.connectionStrings[0].connectionString' "
    ).replace('"', '')

    return {
        "cosmosDbEndpoint_url": cosmosDbEndpoint_url,
        "masterkey": masterkey,
        "database_name": database_name,
        "collection_name": collection_name,
        "connecting_string": connecting_string,
    }

def get_cosmosdb_client():
    """Build a ``CosmosClient`` for the account named in the environment.

    The account name (``cosmosdb_acc``) and resource group
    (``resource_group_name``) come from ``get_env_vars()``; the endpoint URL
    and key are then resolved with ``get_cosmosdb_keys()``.

    Returns:
        A ``CosmosClient`` constructed from the resolved endpoint URL and key.

    Raises:
        Exception: propagated from ``get_env_vars()`` when any variable it
            checks is unset.
    """
    # Read and validate the environment once instead of once per lookup.
    env = get_env_vars()
    cosmosdb_keys = get_cosmosdb_keys(
        resourceGroup=env['resource_group_name'],
        cosmosdb_name=env['cosmosdb_acc'],
    )
    # The key may arrive wrapped in JSON double quotes when the CLI printed it
    # without a raw output mode; strip them so the credential is usable.
    master_key = cosmosdb_keys['masterkey'].strip('"')
    return CosmosClient(cosmosdb_keys['cosmosDbEndpoint_url'], master_key)

def get_env_vars():
    """Return the notebook's required environment variables as a dict.

    Returns:
        dict mapping each required variable name to its value, in the order
        listed below.

    Raises:
        Exception: if any required variable is unset. The message names the
            first missing one in listing order. An empty string counts as set.
    """
    required_names = (
        'resource_group_name',
        'storage_account_name',
        'container_name',
        'cosmosdb_acc',
        'database_name',
        'collection_name',
        'OPENAI_API_KEY',
    )
    env_dict = {name: os.environ.get(name) for name in required_names}
    first_missing = next(
        (name for name, value in env_dict.items() if value is None), None
    )
    if first_missing is not None:
        raise Exception(f"{first_missing} environment variable is not set")
    return env_dict
    


In [28]:
# Resolve the Cosmos DB account details through the Azure CLI.
cosmos_dict = get_cosmosdb_keys(cosmosdb_name="chatgptdb-acn", resourceGroup="chatgptGp")

# Publish the configuration through environment variables.
os.environ.update({
    'storage_account_name': 'chatgptv2stn',
    'container_name': 'chatgpt-ctn',
    'resource_group_name': 'chatgptGp',
    'cosmosdb_acc': 'chatgptdb-acn',
    'database_name': 'chatgptdb-dbn',
    'collection_name': 'chatgptdb-cln',
})
os.environ['connections_string'] = cosmos_dict['connecting_string']

# The OpenAI key is read from Key Vault rather than written into the notebook.
os.environ['OPENAI_API_KEY'] = get_secret(secret_name='openaiKey', keyvault_name='chatkeys')



In [54]:
import pymongo

def list_filepaths_in_container():
    """Return the ``Filepath`` value of every document in the configured collection.

    The connection string, database name and collection name are read from
    the environment variables ``connections_string``, ``database_name`` and
    ``collection_name``.

    Returns:
        list of the ``Filepath`` values, in the order the cursor yields them.

    Raises:
        KeyError: if one of the environment variables is unset, or a document
            has no ``Filepath`` field.
    """
    # The context manager closes the client (and its connections) on exit;
    # previously every call left an open MongoClient behind.
    with pymongo.MongoClient(os.environ['connections_string']) as client:
        collection = (
            client
            .get_database(os.environ['database_name'])
            .get_collection(os.environ['collection_name'])
        )
        # Only Filepath is used, so project away every other field.
        cursor = collection.find({}, {'Filepath': 1, '_id': 0})
        # Materialise inside the `with` block, while the client is still open.
        return [document['Filepath'] for document in cursor]

In [43]:
# Only the last bare expression of a cell is displayed, so the first lookup is
# evaluated (raising KeyError if unset) but not shown; the cell's output is the
# database name.
# NOTE(review): the connection string presumably embeds credentials, so keep it
# out of rendered output; confirm before moving it to the last line.
os.environ['connections_string']
os.environ['database_name']

'chatgptdb-dbn'

In [55]:
# Display every Filepath value returned for the configured collection.
list_filepaths_in_container()

['/src/AcademicJournal/anthropology/beemster/Les Tikar de Bankim.pdf',
 '/src/AcademicJournal/history/Ndobegang_Mbapndah/COLONIAL BACKGROUND TO THE ECONOMIC EMPOWERMENT AND POLITICAL MOBILIZATION.pdf',
 "/src/AcademicJournal/anthropology/Annaud/De l'intestin aux testicules Substances, humeurs et alliance tikar - Cameroun central.pdf",
 '/src/AcademicJournal/anthropology/Price/Descent, Clans and Territorial Organization in the Tikar Chiefdom of Ngambe, Cameroon.pdf',
 '/src/OpED/philosophy/nganang/LE COMPLEXE DE SENGHOR.pdf',
 '/src/AcademicJournal/anthropology/Price/WHO ARE THE TIKAR NOW.pdf',
 '/src/AcademicJournal/anthropology/Jeffreys/Who are the Tikar.pdf',
 '/src/AcademicJournal/anthropology/chilver_kaberry/FROM TRIBUTE TO TAX IN A TIKAR CHIEFDOM.pdf',
 '/src/AcademicJournal/anthropology/hagege/Esquisse linguistique du Tikar, Cameroun (Claude Hagège).pdf',
 '/src/AcademicJournal/geopolitics/Nganang/The Amba Uprising Beyond Frances.pdf',
 '/src/AcademicJournal/anthropology/Tchindak

In [35]:
# Create the client here: no earlier cell defines a module-level `client` (the
# one inside list_filepaths_in_container is local to that function), so this
# cell raised NameError on a fresh kernel.
client = MongoClient(os.environ['connections_string'])
# Take the database and collection names from the environment instead of
# repeating them as literals.
collection_client = client.get_database(os.environ['database_name']).get_collection(os.environ['collection_name'])
collection_client

Collection(Database(MongoClient(host=['chatgptdb-acn.mongo.cosmos.azure.com:10255'], document_class=dict, tz_aware=False, connect=True, replicaset='globaldb', retrywrites=False, maxidletimems=120000, appname='@chatgptdb-acn@', tls=True), 'chatgptdb-dbn'), 'chatgptdb-cln')

In [31]:
# Display the Filepath field of each document returned by collection_client.find().
[document['Filepath'] for document in collection_client.find()]

['/src/AcademicJournal/anthropology/beemster/Les Tikar de Bankim.pdf',
 '/src/AcademicJournal/history/Ndobegang_Mbapndah/COLONIAL BACKGROUND TO THE ECONOMIC EMPOWERMENT AND POLITICAL MOBILIZATION.pdf',
 "/src/AcademicJournal/anthropology/Annaud/De l'intestin aux testicules Substances, humeurs et alliance tikar - Cameroun central.pdf",
 '/src/AcademicJournal/anthropology/Price/Descent, Clans and Territorial Organization in the Tikar Chiefdom of Ngambe, Cameroon.pdf',
 '/src/OpED/philosophy/nganang/LE COMPLEXE DE SENGHOR.pdf',
 '/src/AcademicJournal/anthropology/Price/WHO ARE THE TIKAR NOW.pdf',
 '/src/AcademicJournal/anthropology/Jeffreys/Who are the Tikar.pdf',
 '/src/AcademicJournal/anthropology/chilver_kaberry/FROM TRIBUTE TO TAX IN A TIKAR CHIEFDOM.pdf',
 '/src/AcademicJournal/anthropology/hagege/Esquisse linguistique du Tikar, Cameroun (Claude Hagège).pdf',
 '/src/AcademicJournal/geopolitics/Nganang/The Amba Uprising Beyond Frances.pdf',
 '/src/AcademicJournal/anthropology/Tchindak