# Twin Professional Chatbot

A professional-twin chatbot that talks to an Ollama-served model through the OpenAI-compatible client. It is designed to answer questions about my professional background, drawing on a summary text file, my LinkedIn profile exported as a PDF, and my portfolio website.

## Setup

In [5]:
# If you don't know what any of these packages do - you can always ask ChatGPT for a guide!

import os

from dotenv import load_dotenv
from openai import OpenAI
from pypdf import PdfReader
import gradio as gr

In [2]:
# Load settings from a local .env file into the process environment so the
# os.getenv() lookups in the next cell can see them.
# override=True is expected (per python-dotenv) to let .env values replace
# variables that are already set in the environment.
load_dotenv(override=True)

True

In [3]:
# Environment variables this notebook needs in order to talk to Ollama.
OLLAMA_ENV_VARS = (
    "OLLAMA_API_KEY",
    "OLLAMA_BASE_URL",
    "OLLAMA_MODEL_LLAMA",
    "OLLAMA_MODEL_PHI",
)

OLLAMA_API_KEY = os.getenv('OLLAMA_API_KEY')
OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL')
OLLAMA_MODEL_LLAMA = os.getenv('OLLAMA_MODEL_LLAMA')
OLLAMA_MODEL_PHI = os.getenv('OLLAMA_MODEL_PHI')

# Check Ollama: collect the names that are unset or empty so the message can
# say exactly what is missing instead of a generic failure.
missing_ollama_vars = [var for var in OLLAMA_ENV_VARS if not os.getenv(var)]

if not missing_ollama_vars:
    # The API key is left out of the printout on purpose.
    print("Ollama is set:")
    print(f"\t- OLLAMA_BASE_URL = {OLLAMA_BASE_URL}")
    print(f"\t- OLLAMA_MODEL_LLAMA = {OLLAMA_MODEL_LLAMA}")
    print(f"\t- OLLAMA_MODEL_PHI = {OLLAMA_MODEL_PHI}")
else:
    print(f"Ollama parameter(s) not set: {', '.join(missing_ollama_vars)}")

Ollama is set:
	- OLLAMA_BASE_URL = http://localhost:11434/v1
	- OLLAMA_MODEL_LLAMA = llama3.2
	- OLLAMA_MODEL_PHI = phi4-mini


Install additional libraries for web scraping and data handling:

In [1]:
# Install the scraping/parsing dependencies into the kernel's environment.
# NOTE(review): pandas is not used by any cell visible in this notebook — confirm it is needed.
# NOTE(review): the recorded output below shows this environment has no pip module, so the
# install did not run here; the packages must be provided some other way.
%pip install requests beautifulsoup4 lxml pandas

/Users/nc/Library/CloudStorage/OneDrive-Personal/github/agents/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


## Load my professional data

Summary:

In [8]:
# Read the whole summary file (UTF-8 text) into `summary`.
with open("me/summary.txt", mode="r", encoding="utf-8") as summary_file:
    summary = summary_file.read()

LinkedIn profile:

In [9]:
# Extract the text of every page of the LinkedIn PDF, drop pages that yield
# no text, and concatenate the rest with no separator between pages.
reader = PdfReader("me/linkedin.pdf")
page_texts = [page.extract_text() for page in reader.pages]
linkedin = "".join(page_text for page_text in page_texts if page_text)

Website portfolio:

In [29]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import Dict, List, Optional, Any, Iterator
import re
import json
import os

In [22]:
# Portfolio site that the scraping helpers below fetch and parse.
URL = "https://cordovank.github.io"
# Headers sent with every request made by fetch_html().
# NOTE(review): "your-email@example.com" looks like a template placeholder —
# replace it with a real contact address before scraping.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; your-email@example.com)"}

In [23]:
def fetch_html(url: str, timeout: int = 10) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to request.
        timeout: Passed straight to ``requests.get`` as its ``timeout``.

    Returns:
        The ``text`` attribute of the response.

    Raises:
        Whatever ``requests.get`` raises, plus the error raised by
        ``raise_for_status()`` when the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    response.raise_for_status()
    return response.text

def make_soup(html: str):
    """Parse ``html`` with the lxml parser and strip non-content elements.

    ``script``, ``style``, ``noscript`` and ``template`` elements are removed
    from the tree so they do not pollute text extracted later on.

    Returns:
        The cleaned ``BeautifulSoup`` tree.
    """
    parsed = BeautifulSoup(html, "lxml")
    non_content_tags = ["script", "style", "noscript", "template"]
    for element in parsed(non_content_tags):
        element.decompose()
    return parsed

def get_meta(soup: BeautifulSoup) -> Dict[str, Optional[str]]:
    """Collect basic page metadata.

    Returns:
        A dict with three keys, each ``None`` when the page lacks the value:
        ``title`` (stripped ``<title>`` string), ``meta_description``
        (``content`` of ``<meta name="description">``) and ``canonical``
        (``href`` of ``<link rel="canonical">``).
    """
    title = None
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    # An empty dict stands in for a missing tag so .get() simply yields None.
    description_tag = soup.find("meta", {"name":"description"}) or {}
    canonical_tag = soup.find("link", {"rel":"canonical"}) or {}

    return {
        "title": title,
        "meta_description": description_tag.get("content"),
        "canonical": canonical_tag.get("href"),
    }

def normalize_links(soup: BeautifulSoup, base_url: str) -> List[str]:
    """Return every distinct link on the page as an absolute URL.

    Each non-empty ``href`` of an ``<a>`` tag is resolved against ``base_url``.
    Results starting with ``javascript:`` are dropped (``mailto:`` links are
    kept). Duplicates are removed while keeping first-seen order.
    """
    hrefs = (anchor.get("href") for anchor in soup.select("a[href]"))
    resolved = (urljoin(base_url, href) for href in hrefs if href)
    kept = (link for link in resolved if not link.startswith("javascript:"))
    # dict keys are unique and remember insertion order -> ordered de-duplication
    return list(dict.fromkeys(kept))

def get_visible_text(soup: BeautifulSoup, max_chars: int = 100000) -> str:
    """Return the page text, truncated to ``max_chars`` characters.

    The first of ``<main>``, ``<article>``, ``<body>`` that exists and holds
    non-blank text wins; if none qualifies, the text of the whole document is
    used. Text nodes are joined with newlines and the result is stripped.
    """
    for selector in ("main", "article", "body"):
        container = soup.select_one(selector)
        if not container:
            continue
        content = container.get_text(separator="\n").strip()
        if content:
            return content[:max_chars]

    # Nothing usable in the preferred containers: fall back to the full document.
    whole_document = soup.get_text(separator="\n").strip()
    return whole_document[:max_chars]

def extract_section_by_id(soup: BeautifulSoup, id_name: str, base_url: str = "") -> Dict:
    """Describe the element whose ``id`` attribute equals ``id_name``.

    Args:
        soup: Parsed page.
        id_name: Value of the ``id`` attribute to look up.
        base_url: Base used to resolve the section's links into absolute URLs.
            The default ``""`` keeps each ``href`` exactly as written in the
            page, which is the behaviour of calls that omit this argument.

    Returns:
        ``{"id": id_name, "present": False}`` when no such element exists.
        Otherwise a dict with ``id``, ``present`` (True), ``header``, ``text``
        and ``links``. ``header`` is the text of the highest-level non-empty
        heading inside the section (h1 is tried before h2, and so on, not
        document order) or ``None``.
    """
    node = soup.find(id=id_name)
    if not node:
        return {"id": id_name, "present": False}

    # title: highest-level heading that actually contains text
    header = None
    for htag in ["h1","h2","h3","h4","h5","h6"]:
        h = node.find(htag)
        if h and h.get_text(strip=True):
            header = h.get_text(strip=True)
            break

    text = node.get_text(separator="\n", strip=True)
    # Joining against an empty base leaves relative hrefs untouched, so the
    # base has to come from the caller for the links to become absolute.
    links = [urljoin(base_url, a.get("href")) for a in node.select("a[href]")]
    return {"id": id_name, "present": True, "header": header, "text": text, "links": links}

def extract_portfolio_items(soup: BeautifulSoup, base_url: str) -> List[Dict]:
    """Build one record per ``.portfolio-item`` element on the page.

    Each record has:
        ``title``: text of ``.portfolio-txt h4`` (or of any ``h4``), else None.
        ``tags``:  texts of the ``.portfolio-tags span`` elements.
        ``img``:   absolute URL of the first image's ``src``, else None.
        ``link``:  absolute URL of the first ``<a href>`` in the item, else None.
    Relative URLs are resolved against ``base_url``.
    """
    items = []
    for card in soup.select(".portfolio-item"):
        heading = card.select_one(".portfolio-txt h4") or card.select_one("h4")
        image = card.select_one("img")
        # first anchor in the card; usually the project/repo link
        anchor = card.select_one("a[href]")

        entry = {
            "title": heading.get_text(strip=True) if heading else None,
            "tags": [span.get_text(strip=True) for span in card.select(".portfolio-tags span")],
            "img": urljoin(base_url, image.get("src")) if image and image.get("src") else None,
            "link": urljoin(base_url, anchor.get("href")) if anchor else None,
        }
        items.append(entry)
    return items

# E-mail pattern: a run of letters/digits/"."/"-"/"+"/"_", an "@", a run of the
# same characters, then a final "." followed by letters only.
_email_re = re.compile(r"[a-zA-Z0-9.\-+_]+@[a-zA-Z0-9.\-+_]+\.[a-zA-Z]+")
# Phone pattern: optional country code ("+" optional, 1-3 digits), then 2-4
# digits (optionally in parentheses), 3 digits and 3-4 digits, each part
# optionally separated by "-", "." or whitespace. Every group is non-capturing,
# so findall() returns the whole matched string.
# NOTE(review): the pattern is loose and can match other digit runs — confirm
# the results before relying on them.
_phone_re = re.compile(r"(?:\+?\d{1,3}[-.\s]?)?(?:\(\d{2,4}\)|\d{2,4})[-.\s]?\d{3}[-.\s]?\d{3,4}")

def extract_contact_info(soup: BeautifulSoup) -> Dict:
    """Find e-mail addresses, phone numbers and social links on the page.

    Returns:
        A dict with:
        ``emails``: distinct matches of ``_email_re`` in the page text.
        ``phones``: distinct matches of ``_phone_re`` in the page text.
        ``social``: dict that may contain ``linkedin``, ``github`` and
            ``mailto`` keys, each a list of distinct ``href`` values.
        Every list keeps first-seen order, so repeated runs on the same page
        produce identical output (a ``set`` round-trip would not).
    """
    text = soup.get_text(" ", strip=True)
    # dict.fromkeys de-duplicates while preserving the order of appearance
    emails = list(dict.fromkeys(_email_re.findall(text)))
    phones = list(dict.fromkeys(_phone_re.findall(text)))

    social = {}
    # find common social links
    for a in soup.select("a[href]"):
        href = a.get("href")
        if "linkedin.com" in href:
            social.setdefault("linkedin", []).append(href)
        if "github.com" in href:
            social.setdefault("github", []).append(href)
        if href.startswith("mailto:"):
            social.setdefault("mailto", []).append(href)

    # dedupe each group, again keeping order
    social = {kind: list(dict.fromkeys(urls)) for kind, urls in social.items()}
    return {"emails": emails, "phones": phones, "social": social}

def extract_page(url: str) -> Dict:
    """Fetch ``url`` and return everything the notebook extracts from it.

    Returns:
        A dict with the keys ``url``, ``meta``, ``links``, ``visible_text``,
        ``sections``, ``portfolio_items`` and ``contact``. All links in the
        result, including those listed per section, are absolute URLs
        resolved against ``url``.

    Raises:
        Whatever ``fetch_html`` raises when the page cannot be downloaded.
    """
    html = fetch_html(url)
    soup = make_soup(html)
    meta = get_meta(soup)
    links = normalize_links(soup, url)
    visible_text = get_visible_text(soup)

    # sections present on your portfolio
    sections = {}
    for sid in ["hero","about","resume","portfolio","contact"]:
        section = extract_section_by_id(soup, sid)
        # The section extractor is called without a base URL here, so its
        # relative hrefs are resolved against the page URL at this point
        # (urljoin leaves already-absolute links unchanged).
        if "links" in section:
            section["links"] = [urljoin(url, link) for link in section["links"]]
        sections[sid] = section

    portfolio = extract_portfolio_items(soup, url)
    contact = extract_contact_info(soup)
    return {
        "url": url,
        "meta": meta,
        "links": links,
        "visible_text": visible_text,
        "sections": sections,
        "portfolio_items": portfolio,
        "contact": contact
    }

# Fetch and parse the portfolio page once when this cell runs.
# NOTE(review): extract_page() downloads the page again on its own and no
# visible cell reads `html` or `soup` afterwards — confirm these two lines are
# still needed.
html = fetch_html(URL)
soup = make_soup(html)


In [None]:
# Example usage:
# data = extract_page("https://cordovank.github.io")
# print(json.dumps({k:v for k,v in data.items() if k!='visible_text'}, indent=2))
# Optionally save:
# with open("cordovank_page.json", "w", encoding="utf-8") as f:
#     json.dump(data, f, ensure_ascii=False, indent=2)

In [30]:
# Scrape the portfolio page. The address comes from the URL constant defined
# with the request headers, so the target site is configured in one place only.
data = extract_page(URL)

In [35]:
# Spot-check the scrape: page title, e-mail addresses found, first portfolio card.
page_title = data["meta"]["title"]
print("Title:", page_title)

contact_emails = data["contact"]["emails"]
print("Contact emails:", contact_emails)

portfolio_items = data["portfolio_items"]
if portfolio_items:
    first_item = portfolio_items[0]
else:
    first_item = "none"
print("First portfolio item:", first_item)

Title: Nellie's Personal Portfolio
Contact emails: ['cordova.nellie@outlook.com']
First portfolio item: {'title': 'RAG System with Guardrails', 'tags': ['FastAPI', 'LLM', 'FAISS'], 'img': 'https://cordovank.github.io/assets/img/project/rag.webp', 'link': 'https://cordovank.github.io/assets/pages/project-details/rag.html'}


Save to JSON file:

In [32]:
# --- path helpers ----------------------------------------------------------
# Directory that receives the scraped output; created here if it does not exist.
OUTPUT_DIR = "me/"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Full page dump: default path of save_json() and load_json().
PAGE_JSON_PATH = os.path.join(OUTPUT_DIR, "cordovank_page.json")
# One chunk record per line: default output path of page_to_jsonl_chunks().
PAGE_JSONL_PATH = os.path.join(OUTPUT_DIR, "cordovank_chunks.jsonl")

# --- save/load helpers ----------------------------------------------------
def save_json(obj: Dict[str, Any], path: str = PAGE_JSON_PATH, indent: int = 2) -> None:
    """Write ``obj`` to ``path`` as UTF-8 JSON and print a confirmation.

    Non-ASCII characters are written as-is (``ensure_ascii=False``); an
    existing file at ``path`` is overwritten.
    """
    with open(path, mode="w", encoding="utf-8") as target:
        json.dump(obj, target, indent=indent, ensure_ascii=False)
    print(f"Saved JSON -> {path}")

def load_json(path: str = PAGE_JSON_PATH) -> Dict[str, Any]:
    """Read the UTF-8 JSON file at ``path`` and return the parsed content."""
    with open(path, mode="r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed

# --- very small schema / sanity check -------------------------------------
# optional: lightweight validation you can expand or replace with jsonschema
def basic_validate_page(obj: Dict[str, Any]) -> List[str]:
    """Run a minimal sanity check on a page dict.

    Returns:
        Human-readable problem descriptions; an empty list means the object
        passed. A non-dict input yields a single problem and nothing else is
        checked. Otherwise the keys ``url``, ``meta`` and ``visible_text``
        must be present, and ``meta`` (when present) must be a dict.
    """
    if not isinstance(obj, dict):
        return ["Top-level object is not a dict."]

    required_keys = ("url", "meta", "visible_text")
    problems = [f"Missing key: {key}" for key in required_keys if key not in obj]

    # meta should have at least title (or None), so it has to be a mapping
    if "meta" in obj and not isinstance(obj["meta"], dict):
        problems.append("meta is not a dict")
    return problems

# --- chunking for embeddings / RAG ---------------------------------------
def chunk_text(text: str, max_chars: int = 2000, overlap: int = 200) -> Iterator[str]:
    """Yield overlapping character windows of ``text`` for embeddings.

    Windows are ``max_chars`` long (the last one may be shorter) and each new
    window starts ``max_chars - overlap`` characters after the previous one.
    This is a plain character-based sliding window; splitting on sentence or
    paragraph boundaries would give cleaner chunks.

    Args:
        text: Text to split. Empty or ``None`` yields nothing.
        max_chars: Maximum chunk length; must be positive.
        overlap: Characters shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_chars``.

    Raises:
        ValueError: If the size arguments are out of range. This is a
            generator, so the error surfaces when iteration starts.
    """
    # Without these checks a window that does not advance (overlap >= max_chars
    # or max_chars <= 0) would repeat the same chunk forever.
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if not 0 <= overlap < max_chars:
        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")
    if not text:
        return

    text_len = len(text)
    step = max_chars - overlap
    for start in range(0, text_len, step):
        end = start + max_chars
        yield text[start:end]
        if end >= text_len:
            # this window already reached the end of the text
            break

def page_to_jsonl_chunks(page_obj: Dict[str, Any], out_path: str = PAGE_JSONL_PATH) -> int:
    """Write the page's visible text to ``out_path`` as JSONL chunk records.

    Each line is one JSON object with ``id``, ``url``, ``title``,
    ``chunk_index``, ``text`` and a small ``meta`` dict. Chunks come from
    ``chunk_text`` with 1500-character windows and 200 characters of overlap.
    An existing file at ``out_path`` is overwritten.

    Args:
        page_obj: Page dict; ``visible_text``, ``url`` and ``meta`` are read.
            A missing or null ``visible_text``/``meta`` is tolerated.
        out_path: Destination file.

    Returns:
        Number of chunk records written.
    """
    visible = page_obj.get("visible_text") or ""
    url = page_obj.get("url")
    # "meta" may be absent or null in a loaded file; only read the title from a dict.
    meta = page_obj.get("meta")
    title = meta.get("title") if isinstance(meta, dict) else None
    id_prefix = os.path.basename(out_path)

    count = 0
    with open(out_path, "w", encoding="utf-8") as fout:
        for i, chunk in enumerate(chunk_text(visible, max_chars=1500, overlap=200)):
            record = {
                "id": f"{id_prefix}::chunk::{i}",
                "url": url,
                "title": title,
                "chunk_index": i,
                "text": chunk,
                # minimal metadata to aid retrieval
                "meta": {
                    "source": "portfolio_page",
                    "section": None,
                }
            }
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            count += 1
    print(f"Wrote {count} chunk(s) -> {out_path}")
    return count

In [None]:
# 1) Extract page data
# data = extract_page("https://cordovank.github.io")

# 2) Save full page JSON
# save_json(data, PAGE_JSON_PATH)

# 3) Load and validate
# loaded = load_json(PAGE_JSON_PATH)
# problems = basic_validate_page(loaded)
# if problems:
#     print("Validation problems:", problems)
# else:
#     print("Loaded JSON ok:", PAGE_JSON_PATH)

# 4) Create chunked JSONL for embeddings
# page_to_jsonl_chunks(loaded, PAGE_JSONL_PATH)

In [None]:
# Persist the scraped page dict so later cells can reload it from disk.
save_json(data, PAGE_JSON_PATH)

Saved JSON -> me/cordovank_page.json


In [34]:
# Round-trip check: reload the saved file and run the basic sanity checks on it.
loaded = load_json(PAGE_JSON_PATH)
problems = basic_validate_page(loaded)
if not problems:
    print("Loaded JSON ok:", PAGE_JSON_PATH)
else:
    print("Validation problems:", problems)

Loaded JSON ok: me/cordovank_page.json


## Prepare Prompt

In [None]:
# OpenAI client pointed at the Ollama endpoint and key read from the environment.
ollama = OpenAI(base_url=OLLAMA_BASE_URL, api_key=OLLAMA_API_KEY)

# Model used by chat(). Two models are configured (OLLAMA_MODEL_LLAMA and
# OLLAMA_MODEL_PHI); assign the other one here to switch.
model_name = OLLAMA_MODEL_LLAMA

In [None]:
name = "Nellie Cordova"

# Role and behaviour instructions for the model.
persona_instructions = (
    f"You are acting as {name}. You are answering questions on {name}'s website, "
    f"particularly questions related to {name}'s career, background, skills and experience. "
    f"Your responsibility is to represent {name} for interactions on the website as faithfully as possible. "
    f"You are given a summary of {name}'s background and LinkedIn profile which you can use to answer questions. "
    "Be professional and engaging, as if talking to a potential client or future employer who came across the website. "
    "If you don't know the answer, say so."
)

# Reference material the model may draw on.
# NOTE(review): only the summary and the LinkedIn text are included; the
# scraped portfolio page is not part of the prompt — confirm that is intended.
context_sections = f"\n\n## Summary:\n{summary}\n\n## LinkedIn Profile:\n{linkedin}\n\n"

closing_instruction = f"With this context, please chat with the user, always staying in character as {name}."

system_prompt = persona_instructions + context_sections + closing_instruction


In [None]:
def chat(message, history):
    """Answer one user message in the context of the conversation so far.

    Args:
        message: The new user message.
        history: Earlier turns as a list of dicts that each have ``role`` and
            ``content`` keys (the format the chat UI is configured with).

    Returns:
        The text content of the model's first choice.
    """
    # Forward only role/content: history entries can carry extra UI-side keys
    # that are not part of the chat-completions message schema.
    prior_turns = [{"role": turn["role"], "content": turn["content"]} for turn in history]
    messages = (
        [{"role": "system", "content": system_prompt}]
        + prior_turns
        + [{"role": "user", "content": message}]
    )
    response = ollama.chat.completions.create(model=model_name, messages=messages)
    return response.choices[0].message.content

## Launch the chatbot 

Run the next cell and ask your professional twin questions about you!

Questions to try:
- Tell me a bit about yourself.
- What is your greatest accomplishment?
- What would you say are your top skills?
- What is a challenge that you encountered and needed to overcome?
- What are you looking for in your next role?

In [None]:
# Start the Gradio chat UI with chat() as the handler. type="messages" is
# what makes `history` arrive as a list of role/content dicts, which chat()
# concatenates directly into the request.
gr.ChatInterface(chat, type="messages").launch()