In [None]:
import os

from google import genai
import vertexai
from google.colab import auth

# Google Cloud project and region shared by the Vertex AI SDK and the Gen AI client below.
PROJECT_ID = "customer-team-hackathon-2025"
LOCATION = "us-east4"

# Authenticate to Google Cloud
auth.authenticate_user()

# Set the default project/location for subsequent `vertexai` SDK calls.
vertexai.init(project=PROJECT_ID, location=LOCATION)
# Gen AI client configured to go through Vertex AI (vertexai=True) with the same project/location.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

In [None]:
from google.genai.types import GenerateContentConfig, Retrieval, Tool, VertexRagStore
from vertexai import rag



EMBEDDING_MODEL = "publishers/google/models/text-multilingual-embedding-002"
CORPUS_DISPLAY_NAME = "sdr-chatbot-corpus"

# A display name does not identify a corpus uniquely, so calling
# `create_corpus` unconditionally leaves one more duplicate corpus behind on
# every re-run of this cell. Reuse the first corpus with a matching display
# name and only create a new one when none exists.
# NOTE(review): a reused corpus keeps whatever embedding model it was created
# with — confirm it matches EMBEDDING_MODEL if the constant is ever changed.
rag_corpus = next(
    (
        corpus
        for corpus in rag.list_corpora()
        if corpus.display_name == CORPUS_DISPLAY_NAME
    ),
    None,
)

if rag_corpus is None:
    rag_corpus = rag.create_corpus(
        display_name=CORPUS_DISPLAY_NAME,
        backend_config=rag.RagVectorDbConfig(
            rag_embedding_model_config=rag.RagEmbeddingModelConfig(
                vertex_prediction_endpoint=rag.VertexPredictionEndpoint(
                    publisher_model=EMBEDDING_MODEL
                )
            )
        ),
    )

# Last expression of the cell: display the corpora visible to this project.
rag.list_corpora()

In [None]:
# 1) (Colab) Authenticate
from google.colab import auth
auth.authenticate_user()

# 2) Obtain Application Default Credentials limited to the read-only Drive
#    scope and build a Drive v3 API client with them.
import google.auth
from googleapiclient.discovery import build

creds, _ = google.auth.default(
    scopes=['https://www.googleapis.com/auth/drive.readonly']
)
drive_svc = build('drive', 'v3', credentials=creds)

# 3) ID of the Drive folder whose files will be listed
FOLDER_ID = "1qh3Gjo6UNrfZ6f6fILQqwj6wZgSRPxH1"

# 4) List the non-trashed files whose parent is FOLDER_ID, following
#    `nextPageToken` until the API stops returning one.
all_files, page_token = [], None
while True:
    resp = drive_svc.files().list(
        q=f"'{FOLDER_ID}' in parents and trashed = false",
        fields="nextPageToken, files(id, name, createdTime)",
        pageSize=1000,
        pageToken=page_token
    ).execute()
    all_files.extend(resp.get('files', []))
    page_token = resp.get('nextPageToken')
    if not page_token:
        break

# 5) Sort in place by `createdTime` descending (newest first) and keep at
#    most the first 8000 entries.
all_files.sort(key=lambda f: f['createdTime'], reverse=True)
top_files = all_files[:8000]

# 6) Build one Drive file URL per selected file; `paths` is consumed by the
#    import cell further down.
paths = [
    f"https://drive.google.com/file/d/{f['id']}"
    for f in top_files
]

In [None]:
from math import ceil

def chunked(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding up to ``n`` items.

    Slices are produced in order; the final one is shorter when ``len(lst)``
    is not an exact multiple of ``n``. An empty ``lst`` yields nothing.
    """
    yield from (lst[start : start + n] for start in range(0, len(lst), n))

# `paths` is the list of Drive file URLs built by the Drive listing cell.
# Files are imported in fixed-size batches so each `import_files` call stays small.
batch_size = 25
num_batches = ceil(len(paths) / batch_size)

for idx, batch in enumerate(chunked(paths, batch_size), start=1):
    print(f"Importing batch {idx}/{num_batches} ({len(batch)} files)…")
    result = rag.import_files(
        rag_corpus.name,
        paths=batch,
        transformation_config=rag.TransformationConfig(
            # Passed by keyword so the call does not depend on field order.
            chunking_config=rag.ChunkingConfig(chunk_size=512, chunk_overlap=100),
        ),
        max_embedding_requests_per_min=1000,
    )
    # Surface skipped/failed counts as well: reporting only the imported count
    # hides partial failures inside a batch.
    print(
        f"  → Imported {result.imported_rag_files_count} files "
        f"(skipped {result.skipped_rag_files_count}, "
        f"failed {result.failed_rag_files_count})"
    )


# TRANSCRIPT UPLOAD

In [None]:
import os
import requests
import time
import json

# --- Configuration ---
# Credentials and endpoint are read from environment variables so secrets do
# not have to be pasted into (and saved with) the notebook. Any variable that
# is not set falls back to the value that used to be hard-coded here.
ACCESS_KEY = os.environ.get("GONG_ACCESS_KEY", "")
SECRET_KEY = os.environ.get("GONG_SECRET_KEY", "")
# Trailing slashes are stripped because request URLs are built as f"{BASE_URL}/v2/...".
BASE_URL = os.environ.get("GONG_BASE_URL", "").rstrip("/")
OUTPUT_DIR = "gong_transcripts"
RATE_LIMIT_DELAY = 0.2  # Seconds to wait between transcript requests
# Time window of calls to export, as datetime strings with a UTC offset.
FROM_DATETIME = os.environ.get("GONG_FROM", "2025-01-01T00:00:00-04:00")
TO_DATETIME = os.environ.get("GONG_TO", "2025-08-01T23:59:59-04:00")

def fetch_transcripts_batch(cursor=None, timeout=30):
    """Fetch one page of call transcripts from ``{BASE_URL}/v2/calls/transcript``.

    Args:
        cursor: Paging cursor taken from the previous response's ``records``;
            omitted from the request body when falsy (first page).
        timeout: Seconds passed to ``requests.post`` as its timeout. Without
            one, a stalled connection would block the notebook indefinitely.

    Returns:
        A ``(records, call_transcripts)`` tuple: the response's ``records``
        object and its ``callTranscripts`` list (``[]`` when the key is absent).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
        KeyError: if the response body has no ``records`` key.
    """
    url = f"{BASE_URL}/v2/calls/transcript"
    payload = {
        "filter": {
            "fromDateTime": FROM_DATETIME,
            "toDateTime": TO_DATETIME,
        }
    }
    if cursor:
        payload["cursor"] = cursor

    resp = requests.post(
        url,
        auth=(ACCESS_KEY, SECRET_KEY),
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=timeout,
    )
    resp.raise_for_status()
    data = resp.json()
    return data["records"], data.get("callTranscripts", [])

def format_and_write(transcript_obj, output_dir=None):
    """Format one ``callTranscripts`` entry and write it to a text file.

    Each item of ``transcript_obj["transcript"]`` becomes one
    ``"<speakerId>: <text>"`` line. The spoken text is taken from the item's
    ``sentences`` list (each sentence's ``text`` joined with spaces), which is
    where the Gong transcript response carries it; reading a top-level
    ``text`` key alone produced empty lines. A top-level ``text`` is still
    honoured as a fallback when no sentence text is present.

    Args:
        transcript_obj: Dict with a ``callId`` and an optional ``transcript`` list.
        output_dir: Directory to write into; defaults to the module-level
            ``OUTPUT_DIR``. Created if it does not exist.

    Raises:
        KeyError: if ``transcript_obj`` has no ``callId``.
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR

    call_id = transcript_obj["callId"]
    lines = []
    for utterance in transcript_obj.get("transcript") or []:
        spk = utterance.get("speakerId") or "Unknown"
        # each utterance may contain one or more sentences
        sentence_texts = (
            sentence.get("text") for sentence in utterance.get("sentences") or []
        )
        text = " ".join(t for t in sentence_texts if t)
        if not text:
            text = utterance.get("text") or ""
        lines.append(f"{spk}: {text}")
    body = "\n".join(lines) or "<no transcript text>"

    os.makedirs(output_dir, exist_ok=True)
    fn = os.path.join(output_dir, f"gong_transcript_{call_id}.txt")
    with open(fn, "w", encoding="utf-8") as f:
        f.write(body)
    print(f"  → Saved transcript for call {call_id} → {fn}")

def main():
    """Download every transcript in the configured window and save one file per call.

    Pages through ``fetch_transcripts_batch`` until the response carries no
    cursor, writing each transcript with ``format_and_write`` and sleeping
    ``RATE_LIMIT_DELAY`` seconds between pages.

    Raises:
        RuntimeError: if any required configuration value is empty.
    """
    # BASE_URL is validated too: an empty value would otherwise only surface
    # later as an invalid-URL error from the HTTP call.
    required = {
        "ACCESS_KEY": ACCESS_KEY,
        "SECRET_KEY": SECRET_KEY,
        "BASE_URL": BASE_URL,
        "FROM_DATETIME": FROM_DATETIME,
        "TO_DATETIME": TO_DATETIME,
    }
    missing = [name for name, value in required.items() if not value]
    if missing:
        raise RuntimeError(
            f"Missing required configuration: {', '.join(missing)}."
        )

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Fetching transcripts from {FROM_DATETIME} to {TO_DATETIME}...")

    cursor = None
    batch_num = 1
    total_fetched = 0

    while True:
        print(f"\nBatch #{batch_num} (cursor={cursor})...")
        records, transcripts = fetch_transcripts_batch(cursor)
        # .get(): a missing totalRecords should not abort the export from a log line.
        print(f"  → {len(transcripts)} transcripts returned (totalRecords={records.get('totalRecords')})")

        for ct in transcripts:
            format_and_write(ct)
            total_fetched += 1

        # No cursor in the response means this was the last page.
        cursor = records.get("cursor")
        if not cursor:
            break

        batch_num += 1
        time.sleep(RATE_LIMIT_DELAY)

    print(f"\nDone — wrote {total_fetched} transcript files.")

if __name__ == "__main__":
    main()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the copy cell that follows writes under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
cp -r /content/gong_transcripts/* /content/drive/MyDrive/gong_transcripts/

# DELETE EXISTING FOLDER

In [None]:
import os
import shutil

folder_path = "/content/ttt"
# Only delete when the folder is actually there: an unconditional rmtree
# raises FileNotFoundError on a fresh runtime and breaks "Run all".
if os.path.isdir(folder_path):
    shutil.rmtree(folder_path)