### Libraries

In [22]:
import requests
import pandas as pd
import json
import os
import sys
from azure.storage.blob import BlobServiceClient
from langchain.document_loaders import UnstructuredPowerPointLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from dotenv import load_dotenv
load_dotenv()  # by default get .env file 

from datetime import datetime

### Configurations

In [23]:
# ### Libraries
### Microsoft Graph API
# Credential REQ000010741904
# is a function that reads an environment variable's value.
# Global variables
client_id = os.getenv("O365_CLIENT_ID")
client_secret = os.getenv("O365_CLIENT_SECRET")
tenant_name = "enelcom"
tenant_id = os.getenv("SHAREPOINT_TENANT_ID")
site_name = os.getenv("SHAREPOINT_SITE_NAME")
collection_id = os.getenv("SHAREPOINT_COLLECTION_ID")
subsite_id = os.getenv("SHAREPOINT_SUBSITE_ID")  # This was missing in your original script
sharepoint_site_id = f"{tenant_name}.sharepoint.com,{collection_id},{subsite_id}"
document_library_id = os.getenv("SHAREPOINT_DOCUMENT_LIBRARY_ID")
account_name = os.getenv('BLOB_ACCOUNT_NAME')
account_key = os.getenv('BLOB_ACCOUNT_KEY')
connect_str = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
# container_name = "documents/sharepoint"
destination_folder = "documents\\sharepoint"

# Function to log messages
log_folder = "logs"

# list_doc_type = ['pptx', 'docx', 'pdf']
# list_doc_type = ['docx']
list_doc_type = ['pptx', 'docx', 'pdf']
# sharepoint_folder = ["how to"]
sharepoint_folder = ["how to", "organization", "guidelines"]

log function

In [24]:
# os.chdir("..")
os.getcwd()

'c:\\Giulia\\01_Projects\\ITS_AI\\chatbot_azure_test'

In [25]:
# log function
def log_message(message):
    with open(os.path.join(log_folder, 'download_log.txt'), 'a', encoding='utf-8') as log_file:
        log_file.write(message + "\n")

In [26]:
with open(os.path.join(log_folder, 'download_log.txt'), 'w', encoding='utf-8')as log_file:
        log_file.write(f"Date: {datetime.now().strftime("%Y-%d-%m")} \n\n")

### Microsoft Graph API

In [27]:
token_url = f'https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token'
headers = {'Content-Type': 'application/x-www-form-urlencoded'}
token_data = {
    'grant_type': 'client_credentials',
    'client_id': client_id,
    'client_secret': client_secret,
    'scope': 'https://graph.microsoft.com/.default'
}
token_r = requests.post(token_url, data=token_data, headers=headers)
token = token_r.json().get('access_token')
headers = {'Authorization': 'Bearer ' + token}

### Download files

In [29]:
data_info=[]

for folder in sharepoint_folder:

    # create a new folder
    folder_path = os.path.join(destination_folder, folder)

    # check whether exists
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    file_url = f'https://graph.microsoft.com/v1.0/drives/{document_library_id}/root:/{folder}:/children'

    response = requests.get(file_url, headers=headers)

    if response.status_code == 200:
        folder_contents = json.loads(response.text)

        # Loop through each item in the folder
        for item in folder_contents['value']:
            file_name = item['name']
            file_weburl = item["webUrl"] # url to the file
            file_extension = file_name.split('.')[-1]

            # Check the file extension and handle accordingly
            if file_extension in list_doc_type:
                file_url = item['@microsoft.graph.downloadUrl']
                
                # Download the file
                file_response = requests.get(file_url, headers=headers)
                if file_response.status_code == 200:
    
                    with open(os.path.join(folder_path, file_name) , 'wb') as f:
                        f.write(file_response.content)
                    log_message(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} - folder {folder} Downloaded {file_name}")
                    print("Downloaded from ", folder, " ",  file_name)

                    # storing info documents like weburl, path and name document (will be used then to upload document into the blob storage
                    # and to upload source into the vectordb)

                    data_info.append({                    
                        "name": file_name,
                        "webUrl": file_weburl,
                        "path": folder_path + "\\" + file_name,
                        "path_blob": folder_path + "\\" + os.path.splitext(file_name)[0] + ".txt"
                    })
                    
                    with open(os.path.join(destination_folder, "info_docs.json"), mode="w", encoding="utf-8") as f:
                        json.dump(data_info, f, indent=4, ensure_ascii=False)

                else:
                    log_message(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} - folder: {folder} Failed to download {file_name}")
                    print("Failed to download from ", folder, " ", file_name)
            else:
                # If the file is not one of the desired types, skip to the next item
                log_message(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} - folder {folder} Skipping {file_name}, not a target file type.")
                print("Skipping from folder ", folder, " ", file_name)

    else:
        log_message(f"{datetime.now().strftime("%Y-%m-%d %H:%M:%S")} Failed to access folder {folder}: {response.status_code}")
        print("Failed to access folder")
        print(response.text)

Skipping from folder  how to   [GLOBAL] VPN PALO ALTO - EXTERNAL USERS - COMPANY ACCOUNT ISSUE.url
Downloaded from  how to   Active Directory Management_v1.7.pptx
Downloaded from  how to   ADFS Service request - english version.docx
Downloaded from  how to   Application_pubblication_internet_2.pdf
Downloaded from  how to   AWS Enterprise Support.pdf
Downloaded from  how to   Bastion Host and Obsolete_RO-Apps_EN.pdf
Downloaded from  how to   Bigdata Powercenter - How to use Service Requests.pdf
Downloaded from  how to   Control-M Authorization.pptx
Downloaded from  how to   Dynatrace Overview.pptx
Downloaded from  how to   Enel_-_AWS_Architecture_v2.0.pdf
Downloaded from  how to   Enel_-_AWS_Guidelines_RDS_v5.pdf
Downloaded from  how to   Flavour PHP.pdf
Downloaded from  how to   Gold-Applications_SLA-Measurement_DH-DSValidationPhase_v2-0 .pptx
Downloaded from  how to   Guida SR SCA - Synthetic test (ETE).pptx
Downloaded from  how to   HOW TO Fill a Internet Publishing Application SR V7

In [65]:
blob_service_client = BlobServiceClient.from_connection_string(connect_str)


def process_and_upload_file(file_path):
    # Extract the file name and extension
    file_name = os.path.basename(file_path)

    file_extension = file_name.split('.')[-1]

    print(f"uploading {file_name} from {file_path}")

    # Initialize the document
    doc = None

    # Check the file extension and call the appropriate loader
    if file_extension == 'pptx':
        loader = UnstructuredPowerPointLoader(file_path)
        doc = loader.load()
    elif file_extension == 'docx':
        loader = UnstructuredWordDocumentLoader(file_path)
        doc = loader.load()
    elif file_extension == 'pdf':
        loader = PyPDFLoader(file_path)
        doc = loader.load()

    # Process the document if it was successfully loaded
    if doc:
        all_pages = "\n".join(page.to_json()["kwargs"]["page_content"] for page in doc)

        # Set blob name to the current file name with .txt extension
        blob_name = os.path.splitext(file_name)[0] + ".txt"

        # Create a ContainerClient
        container_client = blob_service_client.get_container_client(os.path.dirname(file_path).replace("\\", "/"))

        # Create a BlobClient to upload the text file
        blob_client = container_client.get_blob_client(blob_name)

        # Upload to Azure Blob Storage
        blob_client.upload_blob(all_pages, overwrite=True)

# Start the process from the 'docs' directory
for path in data:
    process_and_upload_file(path["path"])

uploading ADFS Service request - english version.docx from documents\sharepoint\how to\ADFS Service request - english version.docx
uploading Operational Instruction to Manage Planned Downtime Activities.docx from documents\sharepoint\how to\Operational Instruction to Manage Planned Downtime Activities.docx


In [None]:
# to run it from the 
if __name__ == "__main__":
    run_sharepoint_tasks()