In [15]:
import json
from minsearch import AppendableIndex

In [16]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [17]:
index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x72569407ad50>

In [18]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json  # Important for parsing arguments

# Load environment variables
load_dotenv()
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY")

# Initialize DeepSeek client
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/v1")

In [19]:
# Your search implementation
def search(query):
    # Assuming 'index' is defined elsewhere
    boost = {'question': 3.0, 'section': 0.5}
    
    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )
    
    return results

In [20]:
# CORRECTED TOOL DEFINITION
search_tool = {
    "type": "function",
    "function": {  # Critical: Wrap in "function" key
        "name": "search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query text to look up in the course FAQ."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}

tools = [search_tool]  # Tools list for API call

In [21]:
# Example usage
question = "How do I prepare for module 1?"

In [22]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.
""".strip()

messages = [
    {"role": "system", "content": developer_prompt},
    {"role": "user", "content": question}
]

# First API call
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=messages,
    tools=tools,
    tool_choice="auto"
)


In [23]:
response

ChatCompletion(id='743f7cc8-8474-4328-9e10-5e59a1acfe12', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_a20fd0d0-c3e8-4a83-ac79-51ca9b48a35a', function=Function(arguments='{"query":"prepare for module 1"}', name='search'), type='function', index=0)]))], created=1752481759, model='deepseek-chat', object='chat.completion', service_tier=None, system_fingerprint='fp_8802369eaa_prod0623_fp8_kvcache', usage=CompletionUsage(completion_tokens=23, prompt_tokens=147, total_tokens=170, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=128), prompt_cache_hit_tokens=128, prompt_cache_miss_tokens=19))

In [24]:
response.choices[0].message

ChatCompletionMessage(content='', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_0_a20fd0d0-c3e8-4a83-ac79-51ca9b48a35a', function=Function(arguments='{"query":"prepare for module 1"}', name='search'), type='function', index=0)])

In [25]:
# Process response
#Step 1: Extract Initial Message
#Gets the assistant's response from the first API call
# The response could be either: -> # A direct answer (if no tool is needed), OR -> # A request to execute a tool function
message = response.choices[0].message

#Step 2: Check for Tool Calls
#Checks if the response contains tool call requests -> #If true → Proceed to handle tool execution -> #If false → Jump to final else block (direct answer)

if message.tool_calls:
    #Step 3:Extract Tool Call Details-> Gets the first tool call request (assumes one tool call per response) -> Verifies it's the expected "search" tool
    tool_call = message.tool_calls[0]
    if tool_call.function.name == "search":
        
        #Step 4: Parse Arguments  -> Extracts the function arguments (a JSON string) -> Converts to a Python dictionary
        #Example output: {'query': 'How to do well in module 1'}
        arguments = json.loads(tool_call.function.arguments)
        
        # Execute Local Function -> Calls your actual search function with the parsed query -> This executes your custom search logic (e.g., querying a database)
        #Returns raw search results
        search_results = search(arguments["query"])
        
        # Create follow-up messages -> Step 6: Construct New Message History
        #Builds new conversation history with 3 parts:
        #Original messages (user + system prompts)
        #Assistant's tool request (recorded as if it "said" this)
        #Tool's response with search results
        
        new_messages = [
            *messages,
            {
                "role": "assistant",
                "content": None,
                "tool_calls": [{
                    "id": tool_call.id,
                    "type": "function",
                    "function": {
                        "name": "search",
                        "arguments": tool_call.function.arguments
                    }
                }]
            },
            {
                "role": "tool",
                "content": json.dumps(search_results),  # Must be string
                "tool_call_id": tool_call.id
            }
        ]
        
        # Final response with search results
        final_response = client.chat.completions.create(
            model="deepseek-chat",
            messages=new_messages,
            tools=tools
        )
        print(final_response.choices[0].message.content)
else:
    print(message.content)

To prepare for Module 1 of the course, here are some key steps and tips based on common issues and solutions from the FAQ:

1. **Docker and PostgreSQL Setup**:
   - Ensure you have Docker installed and running.
   - If you encounter a `ModuleNotFoundError: No module named 'psycopg2'` error, install it using:
     ```bash
     pip install psycopg2-binary
     ```
     If the issue persists, update your package manager (pip or conda) and reinstall `psycopg2`.

2. **SQLAlchemy Configuration**:
   - If you face a `TypeError: 'module' object is not callable` error when using `create_engine`, ensure your connection string is correctly formatted. For example:
     ```python
     conn_string = "postgresql+psycopg://root:root@localhost:5432/ny_taxi"
     engine = create_engine(conn_string)
     ```

3. **General Preparation**:
   - Review the course materials and documentation for Module 1 to understand the tools and concepts you'll be working with (e.g., Docker, PostgreSQL, SQLAlchemy).
   - S

### Iterative function calling

In [26]:
from IPython.display import display, HTML, Markdown # pip install markdown

In [27]:
from openai import OpenAI
import json

# Initialize client
client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com/v1")

# Define search tool (same as before)
search_tool = {
    "type": "function",
    "function": {
        "name": "search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query text"}
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}

# Initial user question
question = "How do I prepare for module 1?"

# System prompt with iterative instructions
system_prompt = """
You're a teaching assistant helping students with course questions. 
When a student asks a question:
1. Generate an initial search query
2. Review the search results
3. If results are insufficient, refine the query and search again
4. After 1-3 searches, provide a final answer
""".strip()

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

# Maximum search iterations
max_iterations = 3
iteration = 0
final_answer = None

while iteration < max_iterations and final_answer is None:
    iteration += 1
    
    # API call to get search request or final answer
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        tools=[search_tool],
        tool_choice="auto"
    )
    
    message = response.choices[0].message
    
    # If tool is called
    if message.tool_calls:
        for tool_call in message.tool_calls:
            if tool_call.function.name == "search":
                # Parse and execute search
                arguments = json.loads(tool_call.function.arguments)
                search_results = search(arguments["query"])
                
                # Add to message history
                messages.append({
                    "role": "assistant",
                    "content": None,
                    "tool_calls": [{
                        "id": tool_call.id,
                        "type": "function",
                        "function": {
                            "name": "search",
                            "arguments": tool_call.function.arguments
                        }
                    }]
                })
                
                messages.append({
                    "role": "tool",
                    "content": json.dumps(search_results),
                    "tool_call_id": tool_call.id
                })
    
    # If direct answer is provided
    else:
        final_answer = message.content
        break

# Output final result
if final_answer:
    print("Final Answer:")
    print(final_answer)
else:
    # Force a final answer if max iterations reached
    final_response = client.chat.completions.create(
        model="deepseek-chat",
        messages=messages,
        tools=[]  # Disable tools to force direct answer
    )
    print("Final Answer after refinements:")
    print(final_response.choices[0].message.content)

Final Answer:
The search results didn't provide specific preparation steps for Module 1 of the Data Engineering Zoomcamp. However, here are some general steps you can follow to prepare for Module 1, which typically covers Docker and Terraform:

### Preparation Steps for Module 1:
1. **Install Docker**:
   - Download and install Docker Desktop (or Docker Engine for Linux) from the [official Docker website](https://www.docker.com/).
   - Ensure Docker is running by testing the command `docker --version` in your terminal.

2. **Install Terraform**:
   - Download Terraform from the [official Terraform website](https://www.terraform.io/downloads).
   - Add Terraform to your system's PATH so you can run it from the terminal.

3. **Set Up a Code Editor**:
   - Use an IDE like VS Code, PyCharm, or any other editor you're comfortable with.

4. **Familiarize Yourself with Basic Concepts**:
   - Understand the basics of containers (Docker) and infrastructure as code (Terraform).
   - Review the c

In [28]:
display(Markdown(final_response.choices[0].message.content))

To prepare for Module 1 of the course, here are some key steps and tips based on common issues and solutions from the FAQ:

1. **Docker and PostgreSQL Setup**:
   - Ensure you have Docker installed and running.
   - If you encounter a `ModuleNotFoundError: No module named 'psycopg2'` error, install it using:
     ```bash
     pip install psycopg2-binary
     ```
     If the issue persists, update your package manager (pip or conda) and reinstall `psycopg2`.

2. **SQLAlchemy Configuration**:
   - If you face a `TypeError: 'module' object is not callable` error when using `create_engine`, ensure your connection string is correctly formatted. For example:
     ```python
     conn_string = "postgresql+psycopg://root:root@localhost:5432/ny_taxi"
     engine = create_engine(conn_string)
     ```

3. **General Preparation**:
   - Review the course materials and documentation for Module 1 to understand the tools and concepts you'll be working with (e.g., Docker, PostgreSQL, SQLAlchemy).
   - Set up your development environment in advance to avoid last-minute issues.

If you have specific tasks or topics in Module 1 you'd like help with, let me know!

In [29]:
#pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2
Note: you may need to restart the kernel to use updated packages.


In [56]:
import requests
from bs4 import BeautifulSoup
import time

BASE_URL = "https://punchng.com"

def fetch_page(url):
    headers = {"User-Agent": "MyBot (https://github.com/you/yourproject)"}
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    return resp.text

def parse_homepage(html):
    soup = BeautifulSoup(html, "html.parser")
    articles = []
    for link in soup.select("h3.entry-title a"):
        title = link.get_text(strip=True)
        href = link["href"]
        articles.append({
    "title": title,
    "url": href
})

    return articles

def main():
    html = fetch_page(BASE_URL)
    articles = parse_homepage(html)
    return articles
    #for art in articles:
        #print(art["title"], art["url"])
        #time.sleep(1)  # be courteous!



In [57]:
news = main()

In [58]:
print(news)

[{'title': 'Indian PM Modi mourns Buhari, recalls warm ties with Nigeria', 'url': 'https://punchng.com/indian-pm-modi-mourns-buhari-recalls-warm-ties-with-nigeria/'}, {'title': 'Sanwo-Olu celebrates LG poll victory, says Lagosians confident in APC', 'url': 'https://punchng.com/sanwo-olu-celebrates-lg-poll-victory-says-lagosians-confident-in-apc/'}, {'title': 'N’Assembly suspends plenary to honour ex-President Buhari', 'url': 'https://punchng.com/nassembly-suspends-plenary-to-honour-ex-president-buhari/'}, {'title': 'PICTORIAL: Shettima arrives London to accompany Buhari’s remains to Nigeria', 'url': 'https://punchng.com/pictorial-shettima-arrives-london-to-accompany-buharis-remains-to-nigeria/'}, {'title': 'Katsina declares Monday work-free day to mourn Buhari', 'url': 'https://punchng.com/katsina-declares-monday-work-free-day-to-mourn-buhari/'}]


In [61]:
display(news[0]["url"])

'https://punchng.com/indian-pm-modi-mourns-buhari-recalls-warm-ties-with-nigeria/'

In [63]:
BASE_URL2 = news[0]["url"]
html = fetch_page(BASE_URL2)

In [None]:
display(Markdown(html))

In [71]:
import requests
from bs4 import BeautifulSoup
import re

url = "https://punchng.com/indian-pm-modi-mourns-buhari-recalls-warm-ties-with-nigeria/"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "html.parser")

# Find the script tag containing `window._ain`
script_tag = soup.find("script", string=re.compile("window._ain"))

if script_tag:
    js_text = script_tag.string

    # Extract key-value pairs manually
    def extract_js_field(field, text):
        match = re.search(fr'{field}:\s*"([^"]+)"', text)
        return match.group(1) if match else None

    data = {
        "title": extract_js_field("title", js_text),
        "author": extract_js_field("authors", js_text),
        "pubdate": extract_js_field("pubdate", js_text),
        "section": extract_js_field("sections", js_text),
        "tags": extract_js_field("tags", js_text),
        "maincontent": extract_js_field("maincontent", js_text),
    }

    # Extract the content from the HTML using the selector
    content_blocks = soup.select(data["maincontent"])
    article_text = "\n\n".join(block.get_text(strip=True) for block in content_blocks)

    data["content"] = article_text

    print(display(data))
else:
    print("Could not find metadata script tag.")


{'title': 'Indian PM Modi mourns Buhari, recalls warm ties with Nigeria',
 'author': 'Olugbenga Ige',
 'pubdate': '2025-07-14T09:25:21+01:00',
 'section': 'News',
 'tags': 'buhari, condolence, Death of Buhari, former president, India, India-Nigeria relations, Muhammadu Buhari, Narendra Modi, nigeria, world news',
 'maincontent': '.post-content, .post-title',
 'content': "Indian PM Modi mourns Buhari, recalls warm ties with Nigeria\n\nIndia’s Prime Minister, Narendra Modi, has expressed deep sorrow over the death of the former Nigerian President, Muhammadu Buhari, describing him as a statesman whose legacy of diplomacy and commitment to bilateral cooperation will be remembered.In a condolence message shared via his X handle on Monday, Modi said he had fond memories of his meetings and conversations with Buhari, highlighting the late leader’s wisdom, warmth, and his unwavering support for stronger India–Nigeria relations.“Deeply saddened by the passing of former President of Nigeria, Muh

None


| News Outlet              | Website                                              | Known For                                              |
| ------------------------ | ---------------------------------------------------- | ------------------------------------------------------ |
| **The Punch**            | [punchng.com](https://punchng.com)                   | One of the largest newspapers; politics, breaking news |
| **The Guardian Nigeria** | [guardian.ng](https://guardian.ng)                   | Deep analysis, features, and national updates          |
| **Vanguard**             | [vanguardngr.com](https://www.vanguardngr.com)       | Real-time political and regional updates               |
| **Premium Times**        | [premiumtimesng.com](https://www.premiumtimesng.com) | Investigative journalism, governance                   |
| **Channels TV**          | [channelstv.com](https://www.channelstv.com)         | Reputable TV station with timely website updates       |
| **Daily Trust**          | [dailytrust.com](https://dailytrust.com)             | Northern Nigeria coverage, national affairs            |
| **ThisDay**              | [thisdaylive.com](https://www.thisdaylive.com)       | Business, government policy, editorials                |
| **Sahara Reporters**     | [saharareporters.com](https://saharareporters.com)   | Controversial but widely read; leaks, activism         |
| **Nairametrics**         | [nairametrics.com](https://nairametrics.com)         | Economy, markets, finance, policy news                 |
| **Leadership News**      | [leadership.ng](https://leadership.ng)               | National and public service reporting                  |
