# Google Workspace API Credential Guide

Use the Google Drive API to read files in Google Workspace
1. Set up a Google Cloud Project

    Go to Google Cloud Platform(GCP) Console

    Create a new project

2. Enable the Google Workspace APIs for that project

    Select the created project and go to "APIs & services" page

    Click "+ Enable APIs and services" button, enable these APIs: Google Drive API, Google Docs API, Google Sheets API, and Google Slides API 

3. Go to "OAuth Consent Screen" and configure:

    Choose External and fill in the app name, developer email, etc.

4. Create OAuth Credentials

    Go to APIs & Services > Credentials

    Click "+ Create Credentials" > "OAuth client ID"

    Choose Desktop App

    Download the generated credentials.json

    Sometimes, GCP will navigate you to "Google Auth Platform" > "Clients", and you can click "+ Create client" here to create the OAuth Credentials

    
5. Add Test Users for Google Workspace API OAuth Access
    
    Go to "APIs & Services" > "OAuth consent screen" > "Audience" > "Test Users"

    Add the Google account whose Drive files (Docs, Sheets, Slides, PDFs) you want to read.


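6. Create a 'credentials' folder to store the Google Workspace credentials and user tokens; save the downloaded credentials.json there as `credentials/google_drive_workspace_credentials.json` (the path the code below reads)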

In [None]:
# !pip install PyPDF2
# !pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials
from googleapiclient.http import MediaIoBaseDownload
import os

import io
from PyPDF2 import PdfReader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

# Read-only OAuth scopes requested for Drive, Docs, Sheets and Slides.
GOOGLE_WORKSPACE_SCOPES = [
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/documents.readonly",
    "https://www.googleapis.com/auth/spreadsheets.readonly",
    "https://www.googleapis.com/auth/presentations.readonly",
]

In [None]:
def extract_google_doc(docs_service, file_id):
    """Return the plain text of a Google Doc.

    Fetches the document via ``docs_service.documents().get(...)`` and
    concatenates, in order, the ``textRun`` content of every element of
    every paragraph in the document body. Body entries without a
    ``"paragraph"`` key are skipped.

    Args:
        docs_service: Docs API client exposing ``documents().get().execute()``.
        file_id: ID of the document to read.

    Returns:
        The concatenated text with leading/trailing whitespace stripped.
    """
    document = docs_service.documents().get(documentId=file_id).execute()
    body_entries = document.get("body", {}).get("content", [])

    pieces = []
    for entry in body_entries:
        if "paragraph" not in entry:
            continue
        for part in entry["paragraph"]["elements"]:
            # Elements without a textRun contribute nothing.
            pieces.append(part.get("textRun", {}).get("content", ""))

    return "".join(pieces).strip()

def extract_google_sheet(service, file_id):
    """Return the cell contents of every sheet in a spreadsheet as text.

    For each sheet listed in the spreadsheet metadata, all values are fetched
    and rendered as a ``### Sheet: <title> ###`` header followed by one line
    per row, with cells joined by ``", "``. Sheets are separated by a blank
    line.

    Args:
        service: Sheets API client exposing ``spreadsheets().get()`` and
            ``spreadsheets().values().get()``.
        file_id: ID of the spreadsheet to read.

    Returns:
        The combined text with leading/trailing whitespace stripped.
    """
    spreadsheet = service.spreadsheets().get(spreadsheetId=file_id).execute()
    sections = []

    for sheet in spreadsheet.get("sheets", []):
        title = sheet["properties"]["title"]
        # A bare title is parsed as A1 notation, so a sheet named e.g. "Q1"
        # would be read as a cell reference. Wrapping the name in single
        # quotes (doubling any embedded quote) always selects the sheet.
        quoted_title = "'" + title.replace("'", "''") + "'"
        result = service.spreadsheets().values().get(
            spreadsheetId=file_id,
            range=quoted_title,
        ).execute()

        rows = result.get("values", [])
        # str() keeps the join from raising if a cell comes back non-string.
        body = "\n".join(", ".join(map(str, row)) for row in rows)
        sections.append(f"### Sheet: {title} ###\n{body}")

    return "\n\n".join(sections).strip()


def extract_google_slide(slides_service, file_id):
    """Return the text found in the shapes of a Google Slides presentation.

    Walks every page element of every slide; for elements that carry a
    non-empty ``shape``, the ``content`` of each ``textRun`` text element is
    appended in order. Elements without a shape are ignored.

    Args:
        slides_service: Slides API client exposing
            ``presentations().get().execute()``.
        file_id: ID of the presentation to read.

    Returns:
        The concatenated text with leading/trailing whitespace stripped.
    """
    presentation = slides_service.presentations().get(presentationId=file_id).execute()

    runs = []
    for slide in presentation.get("slides", []):
        for page_element in slide.get("pageElements", []):
            shape = page_element.get("shape")
            if not shape:
                continue
            text_elements = shape.get("text", {}).get("textElements", [])
            runs.extend(
                item["textRun"]["content"]
                for item in text_elements
                if "textRun" in item
            )

    return "".join(runs).strip()

def extract_pdf_from_drive(drive_service, file_id, filename='downloaded.pdf'):
    """Download a PDF from Drive into memory and return its extracted text.

    Args:
        drive_service: Drive API client exposing ``files().get_media()``.
        file_id: ID of the PDF file to download.
        filename: Unused. The PDF is held in an in-memory buffer and is never
            written to disk; the parameter is kept so existing callers that
            pass it keep working.

    Returns:
        The text of all pages that yield non-empty text, joined by newlines.
    """
    request = drive_service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)

    done = False
    while not done:
        _, done = downloader.next_chunk()

    # Rewind so the reader starts at the beginning of the downloaded bytes.
    buffer.seek(0)
    reader = PdfReader(buffer)

    # Extract each page exactly once; the previous version called
    # extract_text() twice per page (once to filter, once to collect).
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)

In [None]:
def get_creds():
    """Load cached user credentials, or run the OAuth flow and cache them.

    If ``token.json`` exists in the working directory, credentials are loaded
    from it. Otherwise the installed-app OAuth flow is started from
    ``credentials/google_drive_workspace_credentials.json`` and the resulting
    credentials are written to ``token.json`` for later runs.

    Both paths request ``GOOGLE_WORKSPACE_SCOPES``. The previous version
    referenced an undefined name ``SCOPES`` and raised ``NameError``.

    Returns:
        The credentials object to pass to ``build(..., credentials=...)``.
    """
    if os.path.exists("token.json"):
        return Credentials.from_authorized_user_file("token.json", GOOGLE_WORKSPACE_SCOPES)

    flow = InstalledAppFlow.from_client_secrets_file(
        "credentials/google_drive_workspace_credentials.json",
        GOOGLE_WORKSPACE_SCOPES,
    )
    creds = flow.run_local_server(port=0)
    # Persist the token so the consent flow is not repeated on the next run.
    with open("token.json", "w") as token:
        token.write(creds.to_json())
    return creds
    

def get_folder_id_by_name(drive_service, folder_name):
    """Return the ID of the first non-trashed Drive folder named ``folder_name``.

    Args:
        drive_service: Drive API client exposing ``files().list()``.
        folder_name: Exact folder name to search for.

    Returns:
        The ``id`` of the first matching folder returned by the API.

    Raises:
        ValueError: If no matching folder is found.
    """
    # Drive query strings are single-quoted; escape backslashes first, then
    # quotes, so names such as "Bob's files" do not break the query.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        "mimeType='application/vnd.google-apps.folder' "
        f"and name='{escaped_name}' and trashed=false"
    )
    results = drive_service.files().list(
        q=query,
        fields="files(id, name)",
        pageSize=1,
    ).execute()

    folders = results.get("files", [])
    if not folders:
        raise ValueError(f"❌ Folder named '{folder_name}' not found.")
    return folders[0]['id']


def extract_docs_from_google_workspace(folder_name):
    """Load every supported file in a Drive folder as a ``Document``.

    Supported types are Google Docs, Sheets, Slides and PDFs. Files are
    listed page by page (following ``nextPageToken``) so folders with more
    than one page of results are read completely, and trashed files are
    excluded, matching the folder lookup. Each file whose extractor returns
    non-empty text becomes a ``Document`` whose ``metadata["source"]`` is the
    file name.

    Extraction is best-effort: an error on one file is printed and that file
    is skipped, so one bad file does not abort the whole load.

    Args:
        folder_name: Name of the Drive folder to read.

    Returns:
        A ``(docs, info)`` tuple: the list of loaded ``Document`` objects and
        a human-readable summary with per-type and total counts.

    Raises:
        ValueError: If the folder cannot be found (from
            ``get_folder_id_by_name``).
    """
    creds = get_creds()

    # Build the API clients before the extractors that close over them.
    drive_service = build("drive", "v3", credentials=creds)
    docs_service = build('docs', 'v1', credentials=creds)
    sheets_service = build('sheets', 'v4', credentials=creds)
    slides_service = build('slides', 'v1', credentials=creds)

    # mimeType -> (label used in the summary, extractor taking a file ID)
    handlers = {
        'application/vnd.google-apps.document': (
            'Google Doc', lambda fid: extract_google_doc(docs_service, fid)),
        'application/vnd.google-apps.spreadsheet': (
            'Google Sheet', lambda fid: extract_google_sheet(sheets_service, fid)),
        'application/vnd.google-apps.presentation': (
            'Google Slide', lambda fid: extract_google_slide(slides_service, fid)),
        'application/pdf': (
            'PDF', lambda fid: extract_pdf_from_drive(drive_service, fid)),
    }
    counts = dict.fromkeys(handlers, 0)

    folder_id = get_folder_id_by_name(drive_service, folder_name)
    info = f"Collection files from folder: {folder_name}\n"

    mime_clause = " or ".join(f'mimeType="{mime}"' for mime in handlers)
    query = f"'{folder_id}' in parents and trashed=false and ({mime_clause})"

    docs = []
    page_token = None
    while True:
        results = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name, mimeType)",
            pageSize=20,
            pageToken=page_token,
        ).execute()

        for file in results.get("files", []):
            handler = handlers.get(file['mimeType'])
            if not handler:
                continue
            _, extractor = handler
            try:
                content = extractor(file["id"])
                if content:
                    docs.append(Document(page_content=content, metadata={"source": file["name"]}))
                    counts[file['mimeType']] += 1
            except Exception as e:
                # Deliberate best-effort: report and move on to the next file.
                print(f"❌ Error processing {file['name']}: {e}")

        page_token = results.get("nextPageToken")
        if not page_token:
            break

    for mime, (label, _) in handlers.items():
        info += f"Found {counts[mime]} {label} files\n"
    info += f"Total documents loaded: {sum(counts.values())}"
    return docs, info

In [None]:
# Load every supported file from the named Drive folder; `docs` holds the
# loaded documents and `info` a text summary of what was found.
docs, info = extract_docs_from_google_workspace("google_workspace_knowledge_base")