In [None]:
import os
from dotenv import load_dotenv

# LangChain imports
from langchain_openai import AzureChatOpenAI

# Load environment variables
load_dotenv()

# Get Azure OpenAI configuration from environment variables.
# os.getenv returns None for an unset variable, so the names that are missing
# are collected and reported here instead of surfacing later as an opaque
# client or URL error.
azure_openai_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_VERSION")
azure_openai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

missing_azure_settings = [
    name
    for name in (
        "AZURE_OPENAI_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_VERSION",
        "AZURE_OPENAI_DEPLOYMENT_NAME",
    )
    if not os.getenv(name)
]
if missing_azure_settings:
    # Warn only (no exception) so cells that do not need Azure can still run.
    print(f"Warning: missing environment variables: {', '.join(missing_azure_settings)}")

print("Libraries imported successfully!")

In [None]:
# Load a PDF from disk; the result of `loader.load()` is kept in `pages`.
# PyPDFLoader is imported from langchain_community, the same package the other
# loaders in this notebook use (the `langchain.document_loaders` path is
# deprecated).
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("docs/test.pdf")
pages = loader.load()

In [None]:
# Show how many entries the PDF loader returned.
len(pages)

In [None]:
# Keep the first entry of `pages` for inspection in the following cells.
page = pages[0]

In [None]:
# Preview the first 500 characters of the selected page's text.
print(page.page_content[:500])

In [None]:
# Display the metadata attached to the selected page.
page.metadata

In [None]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.parsers.audio import AzureOpenAIWhisperParser
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [None]:
# Download the audio track of a YouTube video into `save_dir` and transcribe it
# with a Whisper deployment hosted in Azure OpenAI.
import traceback

url = ["<youtube-url>"]  # Replace with your YouTube video URL. Must be a list.
save_dir = "docs/youtube/"

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# The Whisper model must be deployed in Azure; `model_name` is passed as the
# deployment name. Despite its name, `model_version` is passed as the API
# version. Only this version worked with AzureOpenAI and LangChain at the time
# of developing this code.
model_name = "whisper"
model_version = "2023-09-01-preview"  # important to use the correct version

parser = AzureOpenAIWhisperParser(
    api_key=azure_openai_api_key,
    azure_endpoint=azure_openai_endpoint,
    api_version=model_version,
    deployment_name=model_name,
)

loader = GenericLoader(YoutubeAudioLoader(url, save_dir), parser)

# Alternative: transcribe audio files that are already stored locally
# loader = GenericLoader(
#     FileSystemBlobLoader(save_dir, glob="*.m4a"),
#     parser
# )

# Reset `docs` before loading so that a failed run cannot leave a stale result
# from an earlier execution (or an undefined name) for the cells that follow.
docs = []
try:
    docs = loader.load()
    print(docs)
except Exception as e:
    # Show the full traceback plus a short summary; the error is displayed
    # rather than re-raised so the remaining cells can still be run.
    traceback.print_exc()
    print(f"An error occurred while loading the YouTube audio: {e}")



In [None]:
# Call the Azure OpenAI Whisper deployment directly over REST, bypassing
# LangChain. The latest API version worked here, while with LangChain only the
# older version worked.
# NOTE: this rebinds `url`, which the YouTube cell uses for its list of video URLs.
import requests
from pathlib import Path

url = f"{azure_openai_endpoint}/openai/deployments/whisper/audio/transcriptions?api-version=2024-06-01"
audio_file_path = Path("docs/youtube/test.m4a")  # Adjust filename if needed
headers = {
    "api-key": azure_openai_api_key,
}

# Open the audio inside a context manager so the file handle is closed as soon
# as the upload finishes, even if the request raises.
with open(audio_file_path, "rb") as audio_file:
    files = {
        "file": audio_file,
        "model": (None, "whisper"),  # no filename: sent as a plain form field
    }
    response = requests.post(url, headers=headers, files=files)

print(response.status_code)
print(response.text)


In [None]:
# Build a loader for a single web page. WebBaseLoader is imported from
# langchain_community, the same package the other loaders in this notebook use
# (the `langchain.document_loaders` path is deprecated).
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/titles-for-programmers.md")

In [None]:
# Run the current `loader` and store the result in `docs`.
# NOTE: this rebinds `docs`, replacing any value assigned by an earlier cell.
docs = loader.load()

In [None]:
# Preview the first 500 characters of the first loaded document.
print(docs[0].page_content[:500])