In [1]:
from dotenv import load_dotenv
import os
import json
from pathlib import Path
import re

from minsearch import Index

import requests
from openai import OpenAI

from typing import List, Any, Dict

In [2]:
CREDS_PATH = Path.cwd() / '..' / '..' / '.env'

In [3]:
load_dotenv(CREDS_PATH)
api_key = os.getenv('OPENAI_API_KEY')

In [4]:
openai_client = OpenAI()

### Q4: How many records are there?

In [5]:
URL = ' https://github.com/DataTalksClub/datatalksclub.github.io/tree/main/_podcast'
DATA = 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast'

In [6]:
# First, let's see what files are available in the _podcast directory
docs_raw = requests.get(DATA)

# Debug: Check the response before trying to parse JSON
print(f"Status code: {docs_raw.status_code}")
print(f"Content type: {docs_raw.headers.get('content-type', 'Unknown')}")
print(f"Response length: {len(docs_raw.text)}")
print(f"First 200 characters: {docs_raw.text[:200]}")

if docs_raw.status_code == 200:
    try:
        files_list = docs_raw.json()
        print("=== GITHUB API METADATA ===")
        print(f"Total files found: {len(files_list)}")
        print("\nFile types and names:")
        for i, file_info in enumerate(files_list[:10]):  # Show first 10 files
            print(f"{i+1}. {file_info['name']} (type: {file_info['type']}, size: {file_info['size']} bytes)")
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        print("Response is not valid JSON. This might be due to:")
        print("1. Rate limiting (GitHub API)")
        print("2. Authentication issues")
        print("3. Invalid endpoint")
else:
    print(f"HTTP error: {docs_raw.status_code}")
    print("Response:", docs_raw.text[:500])
for i, file_info in enumerate(files_list[:10]):  # Show first 10 files
    print(f"{i+1}. {file_info['name']} (type: {file_info['type']}, size: {file_info['size']} bytes)")

if len(files_list) > 10:
    print(f"... and {len(files_list) - 10} more files")

print(f"\nFirst file details:")
print(f"Name: {files_list[0]['name']}")
print(f"Type: {files_list[0]['type']}")
print(f"Size: {files_list[0]['size']} bytes")
print(f"Download URL: {files_list[0]['download_url']}")

# Show the structure of the metadata
print(f"\nMetadata keys: {list(files_list[0].keys())}")
files_list[0:4]


Status code: 200
Content type: application/json; charset=utf-8
Response length: 212835
First 200 characters: [{"name":"_s12e08.md","path":"_podcast/_s12e08.md","sha":"713ef42e7cc080cbb8c6e1ae4978fa0b5a4304b3","size":56384,"url":"https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_pod
=== GITHUB API METADATA ===
Total files found: 185

File types and names:
1. _s12e08.md (type: file, size: 56384 bytes)
2. _template.md (type: file, size: 284 bytes)
3. s01e01-roles.md (type: file, size: 430 bytes)
4. s01e02-processes.md (type: file, size: 494 bytes)
5. s01e03-building-ds-team.md (type: file, size: 52722 bytes)
6. s01e04-standing-out-as-a-data-scientist.md (type: file, size: 63713 bytes)
7. s01e05-mentoring.md (type: file, size: 375 bytes)
8. s02e01-writing.md (type: file, size: 505 bytes)
9. s02e02-developer-advocacy.md (type: file, size: 54972 bytes)
10. s02e03-open-source.md (type: file, size: 654 bytes)
1. _s12e08.md (type: file, size: 56384 bytes)
2. _template

[{'name': '_s12e08.md',
  'path': '_podcast/_s12e08.md',
  'sha': '713ef42e7cc080cbb8c6e1ae4978fa0b5a4304b3',
  'size': 56384,
  'url': 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast/_s12e08.md?ref=main',
  'html_url': 'https://github.com/DataTalksClub/datatalksclub.github.io/blob/main/_podcast/_s12e08.md',
  'git_url': 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/git/blobs/713ef42e7cc080cbb8c6e1ae4978fa0b5a4304b3',
  'download_url': 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/main/_podcast/_s12e08.md',
  'type': 'file',
  '_links': {'self': 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast/_s12e08.md?ref=main',
   'git': 'https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/git/blobs/713ef42e7cc080cbb8c6e1ae4978fa0b5a4304b3',
   'html': 'https://github.com/DataTalksClub/datatalksclub.github.io/blob/main/_podcast/_s12e08.md'}},
 {'name': '_templat

In [7]:
print("=== TRANSCRIPT PROCESSING ===")

print("All file extensions found:")
extensions = set()
for f in files_list:
    if '.' in f['name']:
        ext = f['name'].split('.')[-1]
        extensions.add(ext)
    else:
        extensions.add('no extension')
print(f"Extensions: {sorted(extensions)}")

# Let's also see some actual filenames
print(f"\nFirst 10 filenames:")
for i, f in enumerate(files_list[:10]):
    print(f"{i+1}. {f['name']}")

md_files = [f for f in files_list if f['name'].endswith('.md')]
print(f"\nFound {len(md_files)} Markdown files")

# For now, let's use the first Markdown file as an example
if md_files:
    first_file = md_files[0]
    print(f"Using file: {first_file['name']}")
    
    # Get the actual content of the file (Markdown is text, not JSON)
    content_response = requests.get(first_file['download_url'])
    markdown_content = content_response.text
    print(f"Loaded markdown content ({len(markdown_content)} characters)")
    
    # Show first few lines to understand the structure
    lines = markdown_content.split('\n')
    print(f"\nFirst 10 lines of the markdown file:")
    for i, line in enumerate(lines[:10]):
        print(f"{i+1}: {line}")
    
    # For now, we'll work with the raw markdown content
    docs_response = markdown_content
else:
    print("No Markdown files found")
    docs_response = ""

=== TRANSCRIPT PROCESSING ===
All file extensions found:
Extensions: ['md']

First 10 filenames:
1. _s12e08.md
2. _template.md
3. s01e01-roles.md
4. s01e02-processes.md
5. s01e03-building-ds-team.md
6. s01e04-standing-out-as-a-data-scientist.md
7. s01e05-mentoring.md
8. s02e01-writing.md
9. s02e02-developer-advocacy.md
10. s02e03-open-source.md

Found 185 Markdown files
Using file: _s12e08.md
Loaded markdown content (55835 characters)

First 10 lines of the markdown file:
1: ---
2: episode: 8
3: guests:
4: - jekaterinakokatjuhha
5: ids:
6:   anchor: The-Journey-of-a-Data-Generalist-From-Bioinformatics-to-Freelancing---Jekaterina-Kokatjuhha-e1upvim
7:   youtube: FRi0SUtxdMw
8: image: images/podcast/s12e08-journey-of-data-generalist-from-bioinformatics-to-freelancing.jpg
9: links:
10:   anchor: https://anchor.fm/datatalksclub/episodes/The-Journey-of-a-Data-Generalist-From-Bioinformatics-to-Freelancing---Jekaterina-Kokatjuhha-e1upvim


In [8]:
# Create a simple class to represent transcript entries
class TranscriptEntry:
    def __init__(self, start, text):
        self.start = start
        self.text = text
    
    def __str__(self):
        return f"TranscriptEntry(start={self.start}s, text='{self.text[:50]}{'...' if len(self.text) > 50 else ''}')"
    
    def __repr__(self):
        return self.__str__()

# Try to parse timestamps and text from the markdown
transcript_entries = []
timestamp_pattern = r'(\d{1,2}:\d{2}(?::\d{2})?)'  # Matches MM:SS or HH:MM:SS

for line in lines:
    line = line.strip()
    if not line:
        continue
    
    # Look for timestamp patterns
    match = re.search(timestamp_pattern, line)
    if match:
        timestamp_str = match.group(1)
        # Convert timestamp to seconds
        parts = timestamp_str.split(':')
        if len(parts) == 2:  # MM:SS
            seconds = int(parts[0]) * 60 + int(parts[1])
        elif len(parts) == 3:  # HH:MM:SS
            seconds = int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
        else:
            continue
        
        # Extract text after timestamp
        text = line[match.end():].strip()
        if text:
            transcript_entries.append(TranscriptEntry(seconds, text))

print(f"\nParsed {len(transcript_entries)} transcript entries")
if transcript_entries:
    print("First few entries:")
    for i, entry in enumerate(transcript_entries[:5]):
        print(f"{i+1}. {entry.start}s: {entry.text[:50]}...")

# Update docs_response to be the parsed transcript entries
docs_response = transcript_entries



Parsed 128 transcript entries
First few entries:
1. 71s: '...
2. 112s: '...
3. 122s: '...
4. 135s: '...
5. 135s: '...


In [9]:
f'There are {len(docs_response)} records'

'There are 128 records'

In [10]:
docs_response[43]

TranscriptEntry(start=1282s, text=''')

### Q5: How many chunks do you have in the result?
* chunk size 30 
* overlap 15

In [11]:
def format_timestamp(seconds: float) -> str:
    """Convert seconds to H:MM:SS if > 1 hour, else M:SS"""
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours > 0:
        return f"{hours}:{minutes:02}:{secs:02}"
    else:
        return f"{minutes}:{secs:02}"

In [12]:
def make_subtitles(transcript: List[Any]) -> str:
    lines = []

    for entry in transcript:
        ts = format_timestamp(entry.start)
        text = entry.text.replace('\n', ' ')
        lines.append(ts + ' ' + text)

    return '\n'.join(lines)

In [13]:
def sliding_window(seq: List[Any], size: int, step: int) -> List[List[Any]]:
    """Create overlapping chunks using sliding window approach."""
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []

    for i in range(0, n, step):
        batch = seq[i:i+size]
        result.append(batch)
        if i + size >= n:
            break

    return result

In [14]:
def join_lines(transcript: List[Any]) -> str:
    """Join transcript entries into continuous text."""
    lines = []

    for entry in transcript:
        text = entry.text.replace('\n', ' ')
        lines.append(text)

    return ' '.join(lines)

In [15]:
def format_chunk(chunk: List[Any]) -> Dict[str, str]:
    """Format a chunk with start/end timestamps and text."""
    time_start = format_timestamp(chunk[0].start)
    time_end = format_timestamp(chunk[-1].start)
    text = join_lines(chunk)

    return {
        'start': time_start,
        'end': time_end,
        'text': text
    }

In [16]:
chunks = []

for chunk in sliding_window(docs_response, 30, 15):  #step = chunk_size - overlap
    processed = format_chunk(chunk)
    chunks.append(processed)

f'There are a total of {len(chunks)} chunks'

'There are a total of 8 chunks'

#### Q6: What's the first episode in the results for "how do I make money with AI?"

In [17]:
QUESTION = 'how do I make money with AI?'

* Search (text)

In [18]:
index = Index(
    text_fields=["short", "title"],
)

index.fit(chunks)

<minsearch.minsearch.Index at 0x11de2f790>

In [19]:
def search(query):
    return index.search(
        query=query,
        num_results=15
    )

* Build prompt

In [20]:
instructions = """
You're an assistant that helps with the documentation.
Answer the QUESTION based on the CONTEXT from the search engine of our documentation.

Use only the facts from the CONTEXT when answering the QUESTION.

When answering the question, provide the reference to the file with the source.
Use the filename field for that. The repo url is: https://api.github.com/repos/DataTalksClub/datatalksclub.github.io/contents/_podcast
Include code examples when relevant. 
If the question is discussed in multiple documents, cite all of them.

Don't use markdown or any formatting in the output.
""".strip()

prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [21]:
def build_prompt(question, search_results):
    context = json.dumps(search_results)

    prompt = prompt_template.format(
        question=question,
        context=context
    ).strip()
    
    return prompt

In [22]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    messages = []

    if instructions:
        messages.append({
            "role": "system",
            "content": instructions
        })

    messages.append({
        "role": "user",
        "content": user_prompt
    })

    response = openai_client.responses.create(
        model=model,
        input=messages
    )

    return response.output_text

In [23]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    response = llm(prompt)
    return response

In [25]:
rag(QUESTION)

'Making money with AI can be approached in various ways depending on your skills, resources, and market demand. Here are some ideas:\n\n1. **Develop AI Applications**: If you have coding skills, consider creating applications that leverage AI for specific industries (healthcare, finance, education, etc.).\n\n2. **Freelance AI Solutions**: Offer freelance services in machine learning, data analysis, or natural language processing for businesses that need AI expertise.\n\n3. **AI Consulting**: Help companies integrate AI into their processes, providing your expertise on strategy and implementation.\n\n4. **Online Courses and Tutorials**: Create and sell online courses teaching AI concepts and applications. Platforms like Udemy or Coursera can be used.\n\n5. **Data Annotation Services**: Offer services to label or annotate data for AI training. Many companies require large datasets to train their models.\n\n6. **Affiliate Marketing with AI Tools**: Promote AI-driven tools or platforms thr