<a href="https://colab.research.google.com/github/fmind/BKFC/blob/main/BKFC_Build_a_Knowledge_base_From_Chats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP


## Google Cloud

1.  **Google Cloud Project:**
    * Ensure you have a Google Cloud Project. Note your **Project ID**.
    * In the GCP Console, enable the **Google Chat API** and **Vertex AI API** for this project (APIs & Services -> Library).

2.  **OAuth Credentials:**
    * Go to APIs & Services -> **OAuth consent screen**:
        * Configure it (User Type, App Name e.g., `BKFC`, User Support Email).
        * Add Scopes: `chat.spaces.readonly`, `chat.messages.readonly`, `cloud-platform`.
    * Go to APIs & Services -> **Credentials**:
        * Click "+ CREATE CREDENTIALS" -> "OAuth client ID".
        * Select Application type: **Desktop app**.
        * Name it (e.g., `BKFC Colab Client`).
        * Copy the **Client ID** and **Client Secret**.

3.  **Colab Configuration:**
    * Click the **Secrets** icon (🔑) in the left sidebar.
    * Add the following secrets:
        * `BKFC_CLIENT_ID`: (Paste your Client ID here)
        * `BKFC_CLIENT_SECRET`: (Paste your Client Secret here)
        * `BKFC_PROJECT_ID`: (Paste your GCP Project ID here)
    * Review the parameters under `# CONFIGS` below (Model, Location, etc.). The Project ID, Client ID, and Client Secret are read automatically from the Colab Secrets above whenever the corresponding fields are left empty.

4.  **Authentication:**
    * When you run the `!gcloud auth application-default login...` cell under `# SERVICES`:
        * Follow the URL printed in the output.
        * Authenticate with your Google Account.
        * Grant the requested permissions.
        * Copy the authorization code provided.
        * Paste the code back into the input field in the Colab output and press Enter.

*You can now proceed to run the rest of the notebook cells.*

# CONFIGS

In [None]:
# @title Generative AI

# Model name and sampling temperature passed to `generate_content` for every
# analysis request (see the "Insights" cell).
MODEL = "gemini-2.0-flash" # @param {"type":"string"}
TEMPERATURE = 0.0 # @param {"type":"slider","min":0,"max":2,"step":0.1}

In [None]:
# @title Google Cloud

# Leave PROJECT_ID empty to fall back to the `BKFC_PROJECT_ID` Colab secret.
PROJECT_ID = "" # @param {"type":"string"}
# Location passed to the Gen AI client.
LOCATION = "us-central1" # @param {"type":"string"}
# Path of the OAuth client file written by the "File" cell and passed to gcloud.
SECRETS = "client_secrets.json" # @param {"type":"string"}

In [None]:
# @title Chat API

# Leave CLIENT_ID / CLIENT_SECRET empty to fall back to the `BKFC_CLIENT_ID`
# and `BKFC_CLIENT_SECRET` Colab secrets.
CLIENT_ID = "" # @param {"type": "string"}
CLIENT_SECRET = "" # @param {"type": "string"}
# OAuth scopes requested by the gcloud login command (joined with commas there).
SCOPES = [
    'https://www.googleapis.com/auth/cloud-platform',
    'https://www.googleapis.com/auth/chat.spaces.readonly',
    'https://www.googleapis.com/auth/chat.messages.readonly',
]

In [None]:
# @title App

# When True, the export cells call `files.download` on the generated files.
DOWNLOAD = False # @param {"type": "boolean"}
# `pageSize` sent with every Chat API list request.
PAGE_SIZE = 1000 # @param {"type": "integer"}
# Number of days before today that bounds the spaces and messages kept.
SINCE_DAYS = 7 # @param {"type": "integer"}

# IMPORTS

## Internal

In [None]:
import io
import json
import string
import textwrap
import typing as T
import datetime as dt
import itertools as it

## External

In [None]:
import pydantic as pdt
from google import genai
from IPython import display
from google.genai import types as GT
from google.colab import userdata, files
from googleapiclient.discovery import build

# SECRETS

## Project

In [None]:
# The form value wins; only an empty value is replaced by the Colab secret.
if not PROJECT_ID:
    PROJECT_ID = userdata.get('BKFC_PROJECT_ID')

## Client

In [None]:
# The form values win; only empty values are replaced by the Colab secrets.
if not CLIENT_ID:
    CLIENT_ID = userdata.get('BKFC_CLIENT_ID')
if not CLIENT_SECRET:
    CLIENT_SECRET = userdata.get('BKFC_CLIENT_SECRET')

## File

In [None]:
# Build the OAuth client configuration (stored under the "installed" key) and
# write it as indented JSON to the SECRETS path for the gcloud login command.
secrets = {
    "installed": {
        "client_id": CLIENT_ID,
        "project_id": PROJECT_ID,
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://oauth2.googleapis.com/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_secret": CLIENT_SECRET,
        "redirect_uris": ["http://localhost"],
    },
}
with open(SECRETS, 'w') as file:
    json.dump(secrets, file, indent=4)

# SERVICES

## Authentication

In [None]:
# Shell command (IPython `!`): logs in for Application Default Credentials using
# the OAuth client file at SECRETS and the comma-joined SCOPES. It is run with
# `--no-browser`, so follow the instructions printed in the cell output.
!gcloud auth application-default login --no-browser --client-id-file={SECRETS} --scopes={",".join(SCOPES)}

## Gen AI

In [None]:
# Gen AI client bound to the configured project and location, in Vertex AI mode.
genai_client = genai.Client(
    vertexai=True,
    project=PROJECT_ID,
    location=LOCATION,
)

## Chat

In [None]:
# Discovery-based client for version v1 of the Chat API.
chat_service = build(serviceName='chat', version='v1')

# CONTENTS

## Filters

In [None]:
# `since` is the first day of the analysis window: SINCE_DAYS days before today.
today = dt.date.today()
lookback = dt.timedelta(days=SINCE_DAYS)
since = today - lookback
since

## Spaces

In [None]:
# Collect every space whose last activity falls on or after `since`.
# The listing is paginated: each response may carry a `nextPageToken` that must
# be sent back as `pageToken` to fetch the following page.
spaces = []
page_token = None
while True:
    response = chat_service.spaces().list(pageSize=PAGE_SIZE, pageToken=page_token).execute()
    for space in response.get('spaces', []):
        last_active = space.get('lastActiveTime')
        if not last_active:
            # Without an activity timestamp the space cannot be placed in the window.
            continue
        # `fromisoformat` only accepts a trailing "Z" from Python 3.11 onwards.
        last_active_time = dt.datetime.fromisoformat(last_active.replace('Z', '+00:00'))
        last_active_date = last_active_time.date()
        if last_active_date >= since:
            spaces.append(space)
    # Advance to the next page; stop once the API reports no further page.
    page_token = response.get('nextPageToken')
    if not page_token:
        break
len(spaces)

## Messages

### Unsorted

In [None]:
# Download, for every selected space, all messages created after `since`.
# Each listing is paginated: the `nextPageToken` of a response is sent back as
# `pageToken` until the API stops returning one.
messages = []
for space in spaces:
    page_token = None
    while True:
        response = chat_service.spaces().messages().list(
            parent=space['name'],
            filter=f'createTime > "{since}T00:00:00+00:00"',
            orderBy='createTime DESC',
            pageToken=page_token,
            pageSize=PAGE_SIZE,
        ).execute()
        messages.extend(response.get('messages', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
len(messages)

### Sorted

In [None]:
def message_sorted_key(message: dict) -> tuple[str, str, str]:
    """Return the sort key of a message: (space name, thread name, create time).

    Any missing field is replaced by an empty string.
    """
    return (
        message.get('space', {}).get('name', ''),
        message.get('thread', {}).get('name', ''),
        message.get('createTime', ''),
    )

# Order the messages by (space, thread, create time), in descending order.
messages.sort(key=message_sorted_key, reverse=True)
len(messages)

## Groups

In [None]:
def message_groupby_key(message: dict) -> str:
    """Return the grouping key of a message: its space name ('' when missing)."""
    return message.get('space', {}).get('name', '')

# Map each space name to its messages. `groupby` only merges consecutive items,
# which works here because `messages` was sorted with the space name first.
groups = {}
for space_name, space_messages in it.groupby(messages, key=message_groupby_key):
    groups[space_name] = list(space_messages)
len(groups)

## Pages

In [None]:
# Build one text page per space: the formatted text of every message, each
# followed by a blank line. Messages without formatted text are skipped.
pages = {}
# The loop variable is deliberately not named `messages`, so the notebook-level
# list of all messages is not overwritten by the last group.
for key, space_messages in groups.items():
    texts = (message.get('formattedText') for message in space_messages)
    pages[key] = ''.join(f'{text}\n\n' for text in texts if text)
len(pages)

# ANALYSIS

## Data Classes

In [None]:
# --- Define the structure for individual pieces of information ---

# Item type of `ChatInsight.questions_answers`: one question and its answer,
# both required strings.
# NOTE(review): the docstring and field descriptions presumably end up in the
# schema sent to the model -- treat them as prompt text when editing.
class QuestionAnswerPair(pdt.BaseModel):
  """Represents a question asked and its corresponding answer."""
  question: str = pdt.Field(description="The question that was asked.")
  answer: str = pdt.Field(description="The corresponding answer found in the chat.")

# Item type of `ChatInsight.projects`: a project name and a short summary of
# what was said about it, both required strings.
# NOTE(review): the docstring and field descriptions presumably end up in the
# schema sent to the model -- treat them as prompt text when editing.
class ProjectInfo(pdt.BaseModel):
  """Represents a project mentioned in the chat."""
  name: str = pdt.Field(description="The name of the project mentioned.")
  details: str = pdt.Field(description="A brief summary of the status or key discussion points about this project found in the chat.")

# Item type of `ChatInsight.action_items`: a required task description and an
# optional assignee that defaults to None.
# NOTE(review): the docstring and field descriptions presumably end up in the
# schema sent to the model -- treat them as prompt text when editing.
class ActionItem(pdt.BaseModel):
  """Represents an action item or task identified in the chat."""
  task: str = pdt.Field(description="The description of the action item or task.")
  assignee: T.Optional[str] = pdt.Field(default=None, description="The person assigned to the task, if specified in the chat.")

# --- Define the main structure for the overall chat insights ---

# Top-level result of one chat analysis; passed as `response_schema` to
# `generate_content` in the "Insights" cell. Every field is optional and
# defaults to None, so consumers must handle missing sections.
# NOTE(review): the docstring and field descriptions presumably end up in the
# schema sent to the model -- treat them as prompt text when editing.
class ChatInsight(pdt.BaseModel):
  """Structured insight extracted from a Google Chat conversation history."""
  summary: T.Optional[str] = pdt.Field(
      default=None,
      description="A concise summary of the main topics discussed in the chat. Null if no clear overall topic."
  )
  questions_answers: T.Optional[list[QuestionAnswerPair]] = pdt.Field(
      default=None,
      description="A list of distinct questions and their corresponding answers found in the chat. Null or empty list if none found."
  )
  unanswered_questions: T.Optional[list[str]] = pdt.Field(
      default=None,
      description="A list of questions that were asked but do not appear to have been answered in the chat. Null or empty list if none found."
  )
  projects: T.Optional[list[ProjectInfo]] = pdt.Field(
      default=None,
      description="A list of projects mentioned along with their status or discussion summary. Null or empty list if none found."
  )
  action_items: T.Optional[list[ActionItem]] = pdt.Field(
      default=None,
      description="A list of specific tasks or action items mentioned, including assignees if specified. Null or empty list if none found."
  )
  feedback_suggestions: T.Optional[list[str]] = pdt.Field(
      default=None,
      description="A list of feedback provided, suggestions made for improvements, or problems/challenges raised in the chat. Null or empty list if none found."
  )
  technical_insights: T.Optional[list[str]] = pdt.Field(
      default=None,
      description="A list of specific mentions related to MLOps, AI, GenAI, tools (like Vertex AI), technical solutions, configurations, or code snippets discussed. Null or empty list if none found."
  )

## Templates

In [None]:
# Prompt sent to the model for each space. `${page}` is the only placeholder and
# is filled through `ANALYSIS_TEMPLATE.substitute(page=...)` in the "Insights" cell.
ANALYSIS_TEMPLATE = string.Template("""
Analyze the following Google Chat history and extract the relevant information according to the provided schema.
Identify key topics, questions (answered and unanswered), mentioned projects, action items, feedback, suggestions, and technical details.

# Chat History

${page}
""")

## Insights

In [None]:
# Ask the model for a structured `ChatInsight` per space page.
# Only successfully parsed results are stored, so later cells can rely on every
# value of `insights` being a `ChatInsight` instance.
insights = {}
for key, page in pages.items():
    if not page.strip():
        # Nothing to analyze: the space has no message with formatted text.
        print(f"Skipping space {key}: no text content.")
        continue
    prompt = ANALYSIS_TEMPLATE.substitute(page=page)
    try:
        response = genai_client.models.generate_content(
            model=MODEL,
            contents=prompt,
            config={
                "response_mime_type": "application/json",
                "response_schema": ChatInsight,
                "temperature": TEMPERATURE,
            },
        )
        print(key, response.usage_metadata.total_token_count)
        parsed = response.parsed
        if parsed is None:
            # The response could not be turned into the schema: do not store
            # None, which would crash the rendering and export cells.
            print(f"No structured insight could be parsed for space {key}; skipping.")
            continue
        insights[key] = parsed
    except Exception as error:
        # Best effort: report the failure and move on to the next space.
        print(f"An error occurred during API call for space {key}: {error}")
len(insights)

## Markdowns

In [None]:
# Render each insight as a Markdown document: a title with the space key, then
# one section per non-empty field of the insight.
markdowns = []
for key, insight in insights.items():
    if insight is None:
        # No parsed result for this space: nothing to render.
        continue
    parts = []
    parts.append(f"# {key}")
    # --- Summary ---
    if summary := insight.summary:
        parts.append("## Summary")
        parts.append(summary)
    # --- Questions & Answers ---
    if qas := insight.questions_answers:
        parts.append("## Questions & Answers")
        for qa in qas:
            parts.append(f"- Q: **{qa.question}** A: {qa.answer}")
    # --- Unanswered Questions ---
    if unanswered := insight.unanswered_questions:
        parts.append("## Unanswered Questions")
        for question in unanswered:
            parts.append(f"- {question}")
    # --- Projects ---
    if projects := insight.projects:
        parts.append("## Projects")
        for project in projects:
            parts.append(f"- **{project.name}:** {project.details}\n")
    # --- Action Items ---
    if actions := insight.action_items:
        parts.append("## Action Items")
        for item in actions:
            # The assignee is optional: omit the suffix rather than print "None".
            assigned = f" *(Assigned: {item.assignee})*" if item.assignee else ""
            parts.append(f"- {item.task}{assigned}\n")
    # --- Feedback & Suggestions ---
    if feedback := insight.feedback_suggestions:
        parts.append("## Feedback & Suggestions")
        for suggestion in feedback:
            parts.append(f"- {suggestion}")
    # --- Technical Insights ---
    if tech := insight.technical_insights:
        parts.append("## Technical Insights")
        # Distinct name so the outer loop variable `insight` is not rebound.
        for tech_insight in tech:
            parts.append(f"- {tech_insight}")
    # Join all parts into a single Markdown string
    markdown = display.Markdown("\n".join(parts).strip())
    markdowns.append(markdown)
len(markdowns)

# EXPORTS

## Jsonlines

In [None]:
# Export the insights as JSON Lines: one JSON document per line, also echoed to
# the cell output. The file is offered for download when DOWNLOAD is set.
jsonlines_path = 'chat_insights.jsonlines'
# Explicit UTF-8 so chat text with non-ASCII characters is written the same way
# regardless of the runtime's default encoding.
with open(jsonlines_path, 'w', encoding='utf-8') as file:
    for insight in insights.values():
        if insight is None:
            # No parsed result for this space: nothing to serialize.
            continue
        dump = insight.model_dump_json()
        file.write(dump)
        file.write('\n')
        print(dump)
if DOWNLOAD:
    files.download(jsonlines_path)

## Markdowns

In [None]:
# Export the rendered Markdown documents to a single file and display each one.
# The file is offered for download when DOWNLOAD is set.
markdowns_path = 'chat_insights.md'
# Explicit UTF-8 so chat text with non-ASCII characters is written the same way
# regardless of the runtime's default encoding.
with open(markdowns_path, 'w', encoding='utf-8') as file:
    for markdown in markdowns:
        file.write(markdown.data)
        # Each document was stripped of trailing whitespace: add a blank line so
        # the next document's "# ..." title starts on its own line.
        file.write('\n\n')
        display.display(markdown)
if DOWNLOAD:
    files.download(markdowns_path)