In [None]:
!python -m pip install pika --upgrade
!pip install fastapi uvicorn requests python-dotenv


In [None]:
from fastapi import FastAPI, Request
import json
import logging

# ASGI application object; the @app.post route defined below registers on it.
app = FastAPI()

# Set up logging
# Root logger at INFO so the logging.info calls in the handler below are emitted.
logging.basicConfig(level=logging.INFO)

@app.post("/webhook")
async def receive_webhook(request: Request):
    """Receive Microsoft Graph / SharePoint change notifications.

    Two kinds of request arrive on this URL:

    * Subscription validation: Graph POSTs with a ``validationToken`` query
      parameter and expects the token echoed back as ``text/plain`` with
      HTTP 200. Without this the subscription cannot be created.
    * Change notifications: a JSON body whose ``value`` list holds one entry
      per change; each entry's ``resourceData`` is logged.

    Returns:
        The echoed token for validation requests, ``{"status": "success"}``
        for processed notifications, or an HTTP 500 JSON error body when
        the payload cannot be processed.
    """
    # Imported locally so the handler does not rely on another cell's imports.
    from fastapi.responses import JSONResponse, PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        # Get the JSON payload from the request
        payload = await request.json()

        # Log the received payload
        logging.info("Received webhook: %s", json.dumps(payload, indent=2))

        # Extract information from the payload
        # The structure will depend on what SharePoint sends, typically:
        # {
        #     "value": [
        #         {
        #             "id": "unique_id",
        #             "resource": "https://graph.microsoft.com/v1.0/sites/{site-id}/drives/{drive-id}/items/{item-id}",
        #             "resourceData": {
        #                 "id": "item_id",
        #                 "name": "filename.ext",
        #                 "webUrl": "https://sharepoint_url/file",
        #                 ...
        #             }
        #         }
        #     ]
        # }
        for notification in payload.get("value", []):
            # `or {}` also covers an explicit null resourceData.
            file_info = notification.get("resourceData") or {}
            item_id = file_info.get("id")
            file_name = file_info.get("name")
            web_url = file_info.get("webUrl")

            # Process the file information as needed
            logging.info("File Changed: ID=%s, Name=%s, URL=%s", item_id, file_name, web_url)

        return {"status": "success"}

    except Exception as e:
        # Report the failure with a non-2xx status instead of a 200 "error" body,
        # and keep the traceback in the log.
        logging.exception("Error processing webhook: %s", e)
        return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})



In [None]:
@backoff.on_exception(backoff.expo, openai.RateLimitError, max_tries=5)
def completions_with_backoff(**kwargs):
    """Request a completion from OpenAI, retrying on rate limits.

    Retries with exponential backoff (up to 5 attempts) when
    ``openai.RateLimitError`` is raised. All keyword arguments are passed
    through to the completions endpoint unchanged.

    ``openai.RateLimitError`` only exists in the v1+ SDK, where the legacy
    ``openai.Completion.create`` entry point has been removed, so the call
    uses the v1 module-level client (``openai.completions.create``).
    """
    return openai.completions.create(**kwargs)


# Function to initialize or load vector store
def load_or_initialize_vector_store(embeddings, search_query, search_url):
    """Return the persisted Chroma store, building it from the document if empty.

    NOTE: a function with the same name is defined again later in this cell;
    the later definition is the one in effect after the cell runs.

    Args:
        embeddings: Embedding function handed to ``Chroma``.
        search_query: Currently unused; kept for caller compatibility.
        search_url: Path of the PPTX document to ingest when the store has
            no content yet (or cannot be opened).

    Returns:
        A ``Chroma`` vector store for the ``chroma_index`` collection.
    """
    try:
        # Attempt to load an existing vector store
        vector_store = Chroma(
            collection_name='chroma_index',
            persist_directory=persist_directory,
            embedding_function=embeddings,
        )  # Using Chroma, replace with FAISS if necessary

        # Constructing Chroma succeeds even when nothing has been indexed, so
        # the handle's truthiness says nothing; probe for a stored id instead.
        if vector_store.get(limit=1).get("ids"):
            return vector_store

        print("No vector store found, initializing a new one.")
    except Exception as e:
        print(f"Error loading vector store: {e}")

    # Either the store is empty or loading failed: (re)build it from the document.
    chunks = load_and_split_document_with_images(search_url)
    return generate_embedding(chunks)

def process_pptx_data(pptx_elements):
    """Copy each loaded PPTX element into a new ``Document`` with a fresh UUID id.

    Args:
        pptx_elements: Iterable of objects exposing ``page_content`` and
            ``metadata`` attributes.

    Returns:
        A list with one ``Document`` per input element, in input order.
    """
    return [
        Document(
            page_content=source.page_content,
            metadata=source.metadata,
            id=str(uuid4()),  # unique ID per document
        )
        for source in pptx_elements
    ]

def process_unstructured_data(yolox_elements, documents):
    """Append a ``Document`` to ``documents`` for every usable image element.

    Only elements that are instances of ``Image`` are considered. For each:

    * if the element carries a non-empty ``filepath``, the file is OCR'd via
      ``perform_ocr_on_image`` and the recognised text becomes the content;
    * otherwise, if the element has non-empty ``text``, that text is used;
    * otherwise the element is skipped with a message.

    Metadata is converted with ``to_dict()`` when available in both cases
    (previously only the text branch converted it).

    Args:
        yolox_elements: Iterable of partitioned elements.
        documents: List that is extended in place.

    Returns:
        The same ``documents`` list, for convenience.
    """
    for element in yolox_elements:
        if not isinstance(element, Image):
            continue

        print(f"Processing element of type: {type(element)}")

        # Convert ElementMetadata to a plain dictionary when possible.
        metadata = element.metadata
        if hasattr(metadata, 'to_dict'):
            metadata = metadata.to_dict()

        # getattr avoids the KeyError/AttributeError the direct lookups could raise.
        filepath = getattr(element, 'filepath', None)
        text = getattr(element, 'text', None)

        if filepath:
            # Perform OCR on the image file when a file path exists
            print(f"Image file path: {filepath}")
            page_content = perform_ocr_on_image(filepath)
        elif text:
            page_content = text
        else:
            print("No valid information found from the image.")
            continue

        documents.append(
            Document(
                page_content=page_content,
                metadata=metadata,
                id=str(uuid4()),  # Generate a unique ID for each document
            )
        )

    # Return the list of extracted values
    return documents


# Function to load and split the PPTX file
def load_and_split_document_with_images(filename):
    """Build the document list for a PPTX file from its text and image elements.

    The file is loaded element-wise with ``UnstructuredPowerPointLoader``,
    partitioned again through ``partition_pptx_with_yolox``, and the two
    results are merged: loader elements first, then whatever
    ``process_unstructured_data`` appends for the image elements.

    Args:
        filename: Path to the PPTX file.

    Returns:
        The combined list of documents.
    """
    slide_elements = UnstructuredPowerPointLoader(filename, mode="elements").load()
    image_elements = partition_pptx_with_yolox(filename)
    text_documents = process_pptx_data(slide_elements)
    return process_unstructured_data(image_elements, text_documents)


# Function to partition PPTX files using Yolox model and extract elements
def partition_pptx_with_yolox(filename):
    """Send a PPTX file to the partition service using the hi-res "yolox" model.

    Args:
        filename: Path to the PPTX file; also sent as the upload's file name.

    Returns:
        The elements produced by ``dict_to_elements`` from the response, or
        an empty list when the SDK raises ``SDKError`` (the error is printed).
    """
    with open(filename, "rb") as pptx_file:
        raw_bytes = pptx_file.read()
    upload = shared.Files(content=raw_bytes, file_name=filename)

    params = shared.PartitionParameters(
        files=upload,
        strategy=shared.Strategy.HI_RES,  # High-resolution strategy
        hi_res_model_name="yolox",  # Yolox model
        languages=['eng', 'ita'],  # an error might occur here
    )

    try:
        response = s.general.partition(params)
        return dict_to_elements(response.elements)  # Extract elements
    except SDKError as e:
        print(e)
        return []


# Function to perform OCR on images
def perform_ocr_on_image(image_data):
    """Run Tesseract OCR on an image given as a file path or raw bytes.

    Args:
        image_data: ``str`` path to an image file, or ``bytes`` holding the
            encoded image.

    Returns:
        The text recognised by ``pytesseract.image_to_string``.

    Raises:
        ValueError: If ``image_data`` is neither ``str`` nor ``bytes``.
    """
    if not isinstance(image_data, (str, bytes)):
        raise ValueError("Unsupported image data format.")

    # A path is opened directly; binary data is wrapped in a file-like object.
    source = image_data if isinstance(image_data, str) else BytesIO(image_data)
    return pytesseract.image_to_string(PILImage.open(source))


# Backoff for embedding generation
@backoff.on_exception(backoff.expo, openai.RateLimitError, max_tries=10)
def generate_embedding(chunks):
    """Index ``chunks`` in the persistent ``chroma_index`` Chroma collection.

    Opens (or creates) the collection using the module-level ``embeddings``
    and ``persist_directory``, then adds every chunk under a fresh UUID.
    The whole call is retried with exponential backoff (up to 10 attempts)
    on ``openai.RateLimitError``.

    Args:
        chunks: Iterable of documents to add. May be empty, in which case
            nothing is added and the (possibly empty) store is returned.

    Returns:
        The ``Chroma`` vector store.
    """
    vector_store = Chroma(
        collection_name="chroma_index",
        embedding_function=embeddings,
        persist_directory=persist_directory,  # Where to save data locally, remove if not necessary
    )

    chunks = list(chunks)
    if not chunks:
        # Nothing to index; avoid handing an empty batch to the store.
        return vector_store

    uuids = [str(uuid4()) for _ in chunks]
    vector_store.add_documents(documents=chunks, ids=uuids)

    return vector_store


# Define a helper function to handle complex metadata conversion
def filter_or_convert_metadata(metadata):
    """Flatten list and dict metadata values into strings, in place.

    * ``list`` values become a comma-separated string of their items.
    * ``dict`` values become a JSON string (non-serialisable leaves are
      rendered with ``str``). The previous implementation passed the nested
      dict to ``filter_complex_metadata``, which — if it is LangChain's
      helper of that name — expects a list of documents; TODO confirm.
    * All other values are left untouched.

    Args:
        metadata: Dictionary to normalise. It is modified in place.

    Returns:
        The same ``metadata`` dictionary.
    """
    import json  # local import keeps the helper self-contained

    # Only existing keys are reassigned, so iterating while updating is safe.
    for key, value in metadata.items():
        if isinstance(value, list):
            # Convert lists to comma-separated strings
            metadata[key] = ', '.join(map(str, value))
        elif isinstance(value, dict):
            # Serialise nested dictionaries to a single string value
            metadata[key] = json.dumps(value, default=str)
    return metadata


# Function to initialize or load vector store
def load_or_initialize_vector_store(embeddings, search_query, search_url):
    """Return the persisted Chroma store, building it from the document if empty.

    NOTE: this redefines the identically named function that appears earlier
    in this cell; this later definition is the one in effect.

    Args:
        embeddings: Embedding function handed to ``Chroma``.
        search_query: Currently unused; kept for caller compatibility.
        search_url: Path of the PPTX document to ingest when the store has
            no content yet (or cannot be opened).

    Returns:
        A ``Chroma`` vector store for the ``chroma_index`` collection.
    """
    try:
        # Attempt to load an existing vector store
        vector_store = Chroma(
            collection_name='chroma_index',
            persist_directory=persist_directory,
            embedding_function=embeddings,
        )  # Using Chroma, replace with FAISS if necessary

        # Constructing Chroma succeeds even when nothing has been indexed, so
        # the handle's truthiness says nothing; probe for a stored id instead.
        if vector_store.get(limit=1).get("ids"):
            return vector_store

        print("No vector store found, initializing a new one.")
    except Exception as e:
        print(f"Error loading vector store: {e}")

    # Either the store is empty or loading failed: (re)build it from the document.
    chunks = load_and_split_document_with_images(search_url)
    return generate_embedding(chunks)

In [None]:
import requests
import json
from datetime import datetime, timedelta

# Microsoft Graph API credentials
# Placeholder values: replace before running. These module-level names are
# read by get_access_token() and create_subscription() below.
# NOTE(review): avoid committing real secrets here; another cell in this
# notebook reads the same settings from environment variables instead.
client_id = "YOUR_CLIENT_ID"
client_secret = "YOUR_CLIENT_SECRET"
tenant_id = "YOUR_TENANT_ID"
site_id = "YOUR_SHAREPOINT_SITE_ID"  # Replace with your actual SharePoint site ID
drive_id = "YOUR_DRIVE_ID"  # ID of the SharePoint document library

# Get OAuth2 token from Azure AD
def get_access_token():
    """Obtain an app-only Microsoft Graph access token from Azure AD.

    Uses the OAuth2 client-credentials grant with the module-level
    ``tenant_id``, ``client_id`` and ``client_secret``.

    Returns:
        The access token string.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        RuntimeError: If the response carries no ``access_token``.
    """
    url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }
    body = {
        "client_id": client_id,
        "scope": "https://graph.microsoft.com/.default",
        "client_secret": client_secret,
        "grant_type": "client_credentials"
    }
    # A timeout prevents the cell from hanging forever on a stalled connection.
    response = requests.post(url, headers=headers, data=body, timeout=30)
    # Fail loudly instead of returning None and sending "Bearer None" later.
    response.raise_for_status()
    token = response.json().get("access_token")
    if not token:
        raise RuntimeError("Token endpoint response did not contain an access_token")
    return token

# Create a subscription for SharePoint file changes
def create_subscription():
    """Create a Microsoft Graph subscription for changes under the drive root.

    Fetches a token with ``get_access_token()``, posts a subscription for
    "updated" changes on ``/sites/{site_id}/drives/{drive_id}/root`` that
    expires three days from now (UTC), and prints the JSON response.
    Returns nothing.
    """
    access_token = get_access_token()
    endpoint = "https://graph.microsoft.com/v1.0/subscriptions"
    request_headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Expiration must be within 3 days max
    expires_at = datetime.utcnow() + timedelta(days=3)

    payload = {
        "changeType": "updated",  # can be "created, deleted" too
        "notificationUrl": "https://your-webhook-url.com/api/notify",  # replace with your webhook
        "resource": f"/sites/{site_id}/drives/{drive_id}/root",  # replace with your SharePoint resource
        "expirationDateTime": expires_at.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        "clientState": "yourSecretValue",
    }

    result = requests.post(endpoint, headers=request_headers, json=payload)
    print("Subscription Response:", result.json())

# Create the subscription
# NOTE: executes when the cell runs and issues live HTTP requests to Azure AD
# (token) and Microsoft Graph (subscription).
create_subscription()


In [None]:
from fastapi import FastAPI, Request, HTTPException
import requests
import os
from dotenv import load_dotenv

# Load environment variables from .env file (if applicable)
load_dotenv()

# FastAPI instance
# The @app.post route defined below in this cell registers on this object.
app = FastAPI()

# Microsoft Graph API credentials from environment variables or directly hardcoded
# Each setting falls back to a placeholder string when the variable is unset,
# so a missing variable is not detected here.
client_id = os.getenv("CLIENT_ID", "YOUR_CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET", "YOUR_CLIENT_SECRET")
tenant_id = os.getenv("TENANT_ID", "YOUR_TENANT_ID")
site_id = os.getenv("SITE_ID", "YOUR_SHAREPOINT_SITE_ID")
drive_id = os.getenv("DRIVE_ID", "YOUR_DRIVE_ID")

# Get access token from Azure AD
def get_access_token():
    """Request an app-only Graph token via the client-credentials grant.

    Returns:
        The ``access_token`` value from the token response (``None`` if the
        field is absent).

    Raises:
        HTTPException: With the upstream status code when the token endpoint
            does not answer 200.
    """
    token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
    form_fields = {
        "client_id": client_id,
        "scope": "https://graph.microsoft.com/.default",
        "client_secret": client_secret,
        "grant_type": "client_credentials",
    }
    token_response = requests.post(
        token_endpoint,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data=form_fields,
    )
    if token_response.status_code == 200:
        return token_response.json().get("access_token")
    raise HTTPException(status_code=token_response.status_code, detail="Failed to obtain access token")

# Fetch updated file content from SharePoint
def get_file_content(item_id):
    """Download the raw bytes of a drive item from Microsoft Graph.

    Args:
        item_id: Identifier of the item inside the configured site/drive.

    Returns:
        The response body as ``bytes``.

    Raises:
        HTTPException: With the upstream status code when the download does
            not answer 200 (or when obtaining the token fails).
    """
    bearer = get_access_token()
    content_url = (
        f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/items/{item_id}/content"
    )
    download = requests.get(content_url, headers={"Authorization": f"Bearer {bearer}"})
    if download.status_code == 200:
        return download.content
    raise HTTPException(status_code=download.status_code, detail="Failed to fetch file content")

# Mock function for embedding and storing the file
def embed_file_and_store(file_content):
    """Placeholder for the embedding step: prints the content it was given.

    Args:
        file_content: The downloaded file content to embed.
    """
    # Add your embedding logic here
    label = "Embedding file content:"
    print(label, file_content)

# Notification listener route to receive webhook notifications
@app.post("/api/notify")
async def notification_listener(request: Request):
    """Handle Microsoft Graph subscription validation and change notifications.

    * Validation: when the request carries a ``validationToken`` query
      parameter, the token is echoed back as ``text/plain`` with HTTP 200,
      as Graph requires when a subscription is created.
    * Notifications: every entry in the payload's ``value`` list is checked
      against the expected ``clientState``; then each referenced item is
      downloaded and handed to ``embed_file_and_store``.

    Raises:
        HTTPException: 400 for an unreadable/empty payload or an entry
            without an item id, 403 for a ``clientState`` mismatch, and
            whatever ``get_file_content`` raises for download failures.
    """
    # Imported locally: this cell only imports FastAPI, Request and HTTPException.
    from fastapi.responses import PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        data = await request.json()
    except ValueError:
        raise HTTPException(status_code=400, detail="Request body is not valid JSON")

    notifications = data.get("value") if isinstance(data, dict) else None
    if not notifications:
        raise HTTPException(status_code=400, detail="No notifications in payload")

    # Verify that the clientState matches your expected value.
    # All entries are checked before any file is fetched.
    for notification in notifications:
        if notification.get("clientState") != "yourSecretValue":
            raise HTTPException(status_code=403, detail="Invalid client state")

    # Graph may batch several changes in one request; handle every entry,
    # not just the first.
    for notification in notifications:
        # Get the item ID of the changed file
        item_id = (notification.get("resourceData") or {}).get("id")
        if not item_id:
            raise HTTPException(status_code=400, detail="Item ID not found in notification")

        # Fetch the updated file content
        file_content = get_file_content(item_id)

        # Send the file content to embedding function
        embed_file_and_store(file_content)

    return {"status": "success"}



In [None]:
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
import json
import logging
import asyncio
import queue

# ASGI application object for this cell's routes and startup hook.
app = FastAPI()

# Set up logging
logging.basicConfig(level=logging.INFO)

# In-memory queue for batching (you may want to replace this with a persistent queue like RabbitMQ)
# Filled by the /webhook handler and drained by process_batch() below.
notification_queue = queue.Queue()
BATCH_SIZE = 5  # Example batch size: upper bound on items taken per batch
BATCH_INTERVAL = 10  # Seconds process_batch() sleeps between passes

async def process_batch():
    """Background loop: periodically drain the queue in batches and embed them.

    Every ``BATCH_INTERVAL`` seconds, up to ``BATCH_SIZE`` queued
    notifications are collected and passed to ``generate_embeddings``.
    A failure while processing one batch is logged and the loop continues;
    previously an exception ended the task and silently stopped all
    further batching. Runs until the task is cancelled.
    """
    while True:
        batch = []
        # get_nowait() never blocks the event loop, unlike empty() + get().
        while len(batch) < BATCH_SIZE:
            try:
                batch.append(notification_queue.get_nowait())
            except queue.Empty:
                break

        # Process the batch
        if batch:
            try:
                await generate_embeddings(batch)
            except Exception:
                # Cancellation is not an Exception subclass, so shutdown still works.
                logging.exception("Failed to process batch of %d notification(s)", len(batch))

        await asyncio.sleep(BATCH_INTERVAL)

async def generate_embeddings(batch):
    """Placeholder embedding step: logs the batch it receives.

    Args:
        batch: The notifications collected for this pass.
    """
    # Here, you'd implement the logic to generate embeddings
    # For example, using Hugging Face Transformers
    log_template = "Generating embeddings for batch: %s"
    logging.info(log_template, batch)
    # Generate embeddings logic goes here...
    # Store embeddings in ChromaDB...

@app.post("/webhook")
async def receive_webhook(request: Request):
    """Queue incoming Graph change notifications for batched processing.

    A request carrying a ``validationToken`` query parameter is the
    subscription-validation handshake; the token is echoed back as
    ``text/plain`` with HTTP 200. Otherwise each notification's
    ``resourceData`` is put on ``notification_queue``.

    Returns:
        HTTP 200 ``{"status": "success"}`` on success, or HTTP 500 with the
        error message when the payload cannot be processed.
    """
    # Imported locally: this cell imports only JSONResponse from fastapi.responses.
    from fastapi.responses import PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        payload = await request.json()
        logging.info("Received webhook: %s", json.dumps(payload, indent=2))

        # Extract information from payload and put it into the notification queue
        for notification in payload.get("value", []):
            file_info = notification.get("resourceData", {})
            notification_queue.put(file_info)  # Add to the queue

        return JSONResponse(status_code=200, content={"status": "success"})
    except Exception as e:
        logging.error("Error processing webhook: %s", e)
        return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})

# Start the batch processing in the background
@app.on_event("startup")
async def startup_event():
    """Start the background batch-processing loop when the app starts.

    The task is stored on ``app.state`` because the event loop keeps only a
    weak reference to tasks; an unreferenced task may be garbage-collected
    before it finishes.
    """
    app.state.batch_task = asyncio.create_task(process_batch())


In [None]:
from fastapi import FastAPI, Request
import requests
import json
import logging

# ASGI application object for this cell's routes.
app = FastAPI()

# Set up logging
logging.basicConfig(level=logging.INFO)

# Configuration for Microsoft Graph API
GRAPH_API_URL = "https://graph.microsoft.com/v1.0"
# Placeholder bearer token sent by create_webhook(); replace before use.
ACCESS_TOKEN = "your_access_token"  # You would normally retrieve this securely

@app.post("/create-webhook")
async def create_webhook():
    """Create a Microsoft Graph change-notification subscription.

    Posts a subscription for "updated" changes on the configured resource
    to ``{GRAPH_API_URL}/subscriptions`` using ``ACCESS_TOKEN``.

    Returns:
        ``{"status": "success", "data": ...}`` when Graph answers 201,
        otherwise ``{"status": "error", "message": ...}`` with the parsed
        error body (or the raw text when it is not JSON).
    """
    # Local import: this cell does not import datetime itself.
    from datetime import datetime, timedelta, timezone

    # Define the resource you want to monitor (e.g., a SharePoint document library).
    # The path is relative to the Graph base URL; replace {site-id} and
    # {drive-id} with real identifiers.
    resource = "/sites/{site-id}/drives/{drive-id}/root"
    webhook_url = "https://your_fastapi_endpoint/webhook"  # Your FastAPI webhook endpoint
    # A fixed calendar date eventually lies in the past and is rejected, so
    # compute the expiry relative to now. Two days stays under the
    # 4230-minute maximum noted for this resource type.
    expires_at = datetime.now(timezone.utc) + timedelta(days=2)
    expiration = expires_at.strftime("%Y-%m-%dT%H:%M:%S.000Z")

    # Prepare the request payload
    subscription_data = {
        "changeType": "updated",
        "notificationUrl": webhook_url,
        "resource": resource,
        "expirationDateTime": expiration,
        "clientState": "secretClientValue"  # Optional client state for validation
    }

    # Make a POST request to Graph API to create the subscription
    headers = {
        "Authorization": f"Bearer {ACCESS_TOKEN}",
        "Content-Type": "application/json"
    }

    response = requests.post(
        f"{GRAPH_API_URL}/subscriptions",
        headers=headers,
        json=subscription_data,
        timeout=30,  # do not hang the request handler indefinitely
    )

    if response.status_code == 201:
        logging.info("Webhook subscription created successfully.")
        return {"status": "success", "data": response.json()}

    logging.error("Failed to create webhook subscription: %s", response.content)
    try:
        error_detail = response.json()
    except ValueError:
        # Error bodies are not guaranteed to be JSON.
        error_detail = response.text
    return {"status": "error", "message": error_detail}

@app.post("/webhook")
async def receive_webhook(request: Request):
    """Receive Graph subscription validation requests and change notifications.

    A request carrying a ``validationToken`` query parameter is answered by
    echoing the token as ``text/plain`` with HTTP 200; without this, the
    subscription created by ``create_webhook`` cannot be validated.
    Any other request is treated as a JSON notification and logged.

    Returns:
        ``{"status": "success"}`` on success, or an HTTP 500 JSON error body
        when the payload cannot be processed.
    """
    # Imported locally: this cell imports only FastAPI and Request.
    from fastapi.responses import JSONResponse, PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        payload = await request.json()
        logging.info("Received webhook: %s", json.dumps(payload, indent=2))
        # Process the payload as needed...
        return {"status": "success"}
    except Exception as e:
        logging.error("Error processing webhook: %s", e)
        # Report failures with a non-2xx status rather than a 200 "error" body.
        return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})



In [None]:
from fastapi import FastAPI, Request, HTTPException
import pika  # RabbitMQ client library
import json
import os

# FastAPI instance
app = FastAPI()

# RabbitMQ connection parameters
# Read from the environment; each falls back to a placeholder value when unset.
rabbitmq_host = os.getenv("RABBITMQ_HOST", "your-rabbitmq-server")
rabbitmq_queue = os.getenv("RABBITMQ_QUEUE", "your_queue_name")

# Endpoint to receive SharePoint webhook notifications
@app.post("/webhook")
async def handle_webhook(request: Request):
    """Forward SharePoint webhook notifications to RabbitMQ.

    * A request carrying a ``validationToken`` query parameter is the
      subscription-validation handshake; the token is echoed back as
      ``text/plain`` with HTTP 200.
    * Otherwise every entry of the payload's ``value`` list is converted to
      a ``file_changed`` message and published to ``rabbitmq_queue``.

    Raises:
        HTTPException: 400 when the payload has no notifications or an entry
            lacks ``Id``/``Title``; 500 for any other failure (for example
            RabbitMQ being unreachable).
    """
    # Imported locally: this cell imports only FastAPI, Request and HTTPException.
    from fastapi.responses import PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        # Get the JSON data from the incoming webhook
        data = await request.json()
        print(f"Received webhook data: {data}")

        notifications = data.get('value') if isinstance(data, dict) else None
        if not notifications:
            raise HTTPException(status_code=400, detail="Invalid data format")

        # Build and validate every message before opening a connection.
        messages = []
        for notification in notifications:
            # `or {}` guards against a missing or null resourceData.
            resource_data = notification.get('resourceData') or {}
            file_id = resource_data.get('Id')
            file_name = resource_data.get('Title')

            if not file_id or not file_name:
                raise HTTPException(status_code=400, detail="Invalid data format")

            # Create a message to be sent to RabbitMQ
            messages.append({
                'subscriptionId': notification.get('subscriptionId'),
                'fileId': file_id,
                'fileName': file_name,
                'modifiedTime': resource_data.get('LastModifiedDateTime'),
                'event': 'file_changed'
            })

        # Connect to RabbitMQ
        connection = pika.BlockingConnection(pika.ConnectionParameters(host=rabbitmq_host))
        try:
            channel = connection.channel()

            # Ensure the queue exists
            channel.queue_declare(queue=rabbitmq_queue)

            # Publish the messages to RabbitMQ
            for message in messages:
                channel.basic_publish(
                    exchange='',
                    routing_key=rabbitmq_queue,
                    body=json.dumps(message)
                )
        finally:
            # Close the RabbitMQ connection even when publishing fails
            connection.close()

        # Return a success response
        return {"status": "success", "message": "Webhook processed and message sent to RabbitMQ"}

    except HTTPException:
        # Keep deliberate 4xx responses; do not convert them into 500.
        raise
    except Exception as e:
        print(f"Error processing webhook: {e}")
        raise HTTPException(status_code=500, detail="Failed to process webhook")

# If you're running the app using a separate command (like uvicorn), this block is not necessary.
# However, to run it as a script for development/testing, you can uncomment the following:
# if __name__ == '__main__':
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=5000)


In [None]:
import pika
import json
from sentence_transformers import SentenceTransformer

# Initialize the model
# Constructed once when the cell runs; process_batch() below calls model.encode.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Notifications accumulated across deliveries until a full batch is available.
# This name was previously never defined, so the first message raised NameError.
batch = []
# Number of notifications embedded together (same value as the BATCH_SIZE
# used by the in-memory batching cell of this notebook).
BATCH_SIZE = 5


def callback(ch, method, properties, body):
    """RabbitMQ delivery callback: buffer a notification and flush full batches.

    Args:
        ch: Channel the message arrived on (unused).
        method: Delivery metadata (unused).
        properties: Message properties (unused).
        body: JSON-encoded notification.

    The decoded notification is appended to the module-level ``batch``; once
    it holds ``BATCH_SIZE`` items they are passed to ``process_batch`` and
    the buffer is cleared.
    """
    # Decode the message
    notification = json.loads(body)
    # Batch processing logic here
    batch.append(notification)

    # Process the batch once size threshold is met
    if len(batch) >= BATCH_SIZE:
        process_batch(batch)
        batch.clear()  # Reset batch

def process_batch(batch):
    """Embed the file names contained in a batch of notifications.

    The name is read from ``fileName`` (the key the RabbitMQ producer cell
    in this notebook publishes) and falls back to ``name``; entries with
    neither are skipped instead of raising ``KeyError``.

    Args:
        batch: List of notification dictionaries.

    Returns:
        The result of ``model.encode`` for the collected names, or ``None``
        when the batch yields no names.
    """
    candidate_names = (item.get('fileName') or item.get('name') for item in batch)
    texts = [name for name in candidate_names if name]
    if not texts:
        return None

    embeddings = model.encode(texts)
    # Store embeddings in ChromaDB or another vector database
    return embeddings

# Set up RabbitMQ connection and queue
# Connects to a broker on localhost and declares the queue consumed below.
connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue='file_changes_queue')

# Start consuming messages
# auto_ack=True means messages are acknowledged on delivery, before callback()
# has processed them.
# NOTE(review): this queue name is fixed, while the webhook cell publishes to
# the RABBITMQ_QUEUE setting — confirm both refer to the same queue.
channel.basic_consume(queue='file_changes_queue', on_message_callback=callback, auto_ack=True)
print('Waiting for messages...')
# Presumably blocks until consumption is stopped; later cells would not run
# until then — TODO confirm.
channel.start_consuming()


In [None]:
from fastapi import FastAPI, Request
import requests
import json
import logging

# ASGI application object for this cell's routes.
app = FastAPI()

# Set up logging
logging.basicConfig(level=logging.INFO)

# Configuration for Microsoft Graph API
GRAPH_API_URL = "https://graph.microsoft.com/v1.0"
# Placeholder bearer token sent by create_webhook(); replace before use.
ACCESS_TOKEN = "your_access_token"  # You would normally retrieve this securely

@app.post("/create-webhook")
async def create_webhook():
    """Create a Microsoft Graph change-notification subscription.

    Posts a subscription for "updated" changes on the configured resource
    to ``{GRAPH_API_URL}/subscriptions`` using ``ACCESS_TOKEN``.

    Returns:
        ``{"status": "success", "data": ...}`` when Graph answers 201,
        otherwise ``{"status": "error", "message": ...}`` with the parsed
        error body (or the raw text when it is not JSON).
    """
    # Local import: this cell does not import datetime itself.
    from datetime import datetime, timedelta, timezone

    # Define the resource you want to monitor (e.g., a SharePoint document library).
    # The path is relative to the Graph base URL; replace {site-id} and
    # {drive-id} with real identifiers.
    resource = "/sites/{site-id}/drives/{drive-id}/root"
    webhook_url = "https://your_fastapi_endpoint/webhook"  # Your FastAPI webhook endpoint
    # A fixed calendar date eventually lies in the past and is rejected, so
    # compute the expiry relative to now. Two days stays under the
    # 4230-minute maximum noted for this resource type.
    expires_at = datetime.now(timezone.utc) + timedelta(days=2)
    expiration = expires_at.strftime("%Y-%m-%dT%H:%M:%S.000Z")

    # Prepare the request payload
    subscription_data = {
        "changeType": "updated",
        "notificationUrl": webhook_url,
        "resource": resource,
        "expirationDateTime": expiration,
        "clientState": "secretClientValue"  # Optional client state for validation
    }

    # Make a POST request to Graph API to create the subscription
    headers = {
        "Authorization": f"Bearer {ACCESS_TOKEN}",
        "Content-Type": "application/json"
    }

    response = requests.post(
        f"{GRAPH_API_URL}/subscriptions",
        headers=headers,
        json=subscription_data,
        timeout=30,  # do not hang the request handler indefinitely
    )

    if response.status_code == 201:
        logging.info("Webhook subscription created successfully.")
        return {"status": "success", "data": response.json()}

    logging.error("Failed to create webhook subscription: %s", response.content)
    try:
        error_detail = response.json()
    except ValueError:
        # Error bodies are not guaranteed to be JSON.
        error_detail = response.text
    return {"status": "error", "message": error_detail}

@app.post("/webhook")
async def receive_webhook(request: Request):
    """Receive Graph subscription validation requests and change notifications.

    A request carrying a ``validationToken`` query parameter is answered by
    echoing the token as ``text/plain`` with HTTP 200; without this, the
    subscription created by ``create_webhook`` cannot be validated.
    Any other request is treated as a JSON notification and logged.

    Returns:
        ``{"status": "success"}`` on success, or an HTTP 500 JSON error body
        when the payload cannot be processed.
    """
    # Imported locally: this cell imports only FastAPI and Request.
    from fastapi.responses import JSONResponse, PlainTextResponse

    validation_token = request.query_params.get("validationToken")
    if validation_token is not None:
        return PlainTextResponse(content=validation_token, status_code=200)

    try:
        payload = await request.json()
        logging.info("Received webhook: %s", json.dumps(payload, indent=2))
        # Process the payload as needed...
        return {"status": "success"}
    except Exception as e:
        logging.error("Error processing webhook: %s", e)
        # Report failures with a non-2xx status rather than a 200 "error" body.
        return JSONResponse(status_code=500, content={"status": "error", "message": str(e)})



In [None]:
# Main function to tie everything together
def main(file_path, query, max_tokens=100):
    """Load the vector store for a document, creating it when necessary.

    Args:
        file_path: Path of the PPTX document backing the vector store.
        query: Search query; forwarded to ``load_or_initialize_vector_store``.
        max_tokens: Currently unused; kept for caller compatibility.

    Returns:
        The vector store (previously the result was discarded).
    """
    # Step 1: Check if the vector store exists and, if so, whether the embedded
    # chunks are already in it. Otherwise load and split the document,
    # including handling images and OCR.
    # Use the `file_path` argument rather than the global `search_url`, so the
    # function works for whichever document the caller passes.
    vector_store = load_or_initialize_vector_store(embeddings, query, file_path)
    if not vector_store:
        print("Error: Vector Store not found! Creating and loading...")
        # Update the vector store if no results were found
        chunks = load_and_split_document_with_images(file_path)
        # Initialize a new vector store
        # Save the new vector store
        vector_store = generate_embedding(chunks)

    return vector_store


# Example inputs for main(): the question to ask and the document to ingest.
search_query = "How many times can you take exams in Italy?"
search_url = "/main.pptx"  # Path to your document file
# Runs the pipeline when the cell executes.
main(search_url, search_query)


In [None]:
uvicorn app:app --reload