### Q1 Running Mage

Answer 1: v0.9.72

### Q2 Reading the documents

In [1]:
import io
import requests
import docx

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [2]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

# Read the files
def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [3]:
# Put the documents into a dictionary
def ingest():
    faq_documents = {
        'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
    }

    documents = []

    for course, file_id in faq_documents.items():
        #print(course)
        course_documents = read_faq(file_id)
        documents.append({'course': course, 'documents': course_documents})
    
    print("Answer 2:", len(documents))

    return documents

### Mage code

In [None]:
# Final code for the custom block, do not execute here

if 'data_loader' not in globals():
    from mage_ai.data_preparation.decorators import data_loader
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test

import io
import requests
import docx
    
@data_loader
def load_data(*args, **kwargs):
    faq_documents = {
        'llm-zoomcamp': '1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E',
    }

    documents = []

    for course, file_id in faq_documents.items():
        print(course)
        course_documents = read_faq(file_id)
        documents.append({'course': course, 'documents': course_documents})
    
    print("Number of documents: ", len(documents))

    return documents

def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'
    
    response = requests.get(url)
    response.raise_for_status()
    
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'
    
    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''
     
    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)
    
        if len(p_text) == 0:
            continue
    
        if style == section_heading_style:
            section_title = p_text
            continue
    
        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''
    
            question_title = p_text
            continue
        
        answer_text_so_far += '\n' + p_text
    
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions
    
@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'

### Q3 Chunking

In [None]:
import hashlib

# Chunking the documents
def chunking():
    data = ingest()
    #print(type(data))
    #print(data)
    documents = []
    
    for course_dict in data:  # Loop through each dictionary in the list
        #print("Course dict:", course_dict)
        for doc in course_dict['documents']:  # Then loop through the documents in each dictionary
            doc['course'] = course_dict['course']
            doc['document_id'] = generate_document_id(doc)
            documents.append(doc)
    
    print("Answer 3:", len(documents), "chunks")
    
    return documents

# Generating the document ids
def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

chunking()

### Mage code:

In [None]:
if 'transformer' not in globals():
    from mage_ai.data_preparation.decorators import transformer
if 'test' not in globals():
    from mage_ai.data_preparation.decorators import test
import hashlib

@transformer
def transform(data, *args, **kwargs):
    print(type(data))
    print("Printing data:", data)
    documents = []

    for doc in data['documents']:
        doc['course'] = data['course']
        # previously we used just "id" for document ID
        doc['document_id'] = generate_document_id(doc)
        documents.append(doc)

    print("Number of chunks:", len(documents))

    return documents

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id
    
@test
def test_output(output, *args) -> None:
    """
    Template code for testing the output of the block.
    """
    assert output is not None, 'The output is undefined'

### Q4 Export

In [None]:
from typing import Dict, List, Union

import numpy as np
from elasticsearch import Elasticsearch
from datetime import datetime

def elasticsearch_export(
    documents: List[Dict[str, Union[Dict, List[int], np.ndarray, str]]], 
    connection_string='https://elasticsearch:9200/',
    index_name_prefix='documents',
    number_of_shards=1,
    number_of_replicas=0,
    vector_column_name='embedding',
    dimensions=None,
):
    """
    Exports document data to an Elasticsearch database.
    """

    # Adjusting index name
    current_time = datetime.now().strftime("%Y%m%d_%M%S")
    index_name = f"{index_name_prefix}_{current_time}"
    print("index name:", index_name)

    # Connection to Elasticsearch
    es_client = Elasticsearch(connection_string)
    print(f'Connecting to Elasticsearch at {connection_string}')

    # Determine dimensions if not provided
    if dimensions is None and len(documents) > 0:
        document = documents[0]
        dimensions = len(document.get(vector_column_name) or [])

    # Index settings
    index_settings = {
        "settings": {
            "number_of_shards": number_of_shards,
            "number_of_replicas": number_of_replicas
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"},
                "document_id": {"type": "keyword"}
            }
        }
    }

    # Create index if it doesn't exist
    if not es_client.indices.exists(index=index_name):
        es_client.indices.create(index=index_name, body=index_settings)
        print('Index created with properties:', index_settings)
        print('Embedding dimensions:', dimensions)

    # Index documents
    print(f'Indexing {len(documents)} documents to Elasticsearch index {index_name}')
    for document in documents:
        print(f'Indexing document {document["document_id"]}')
        es_client.index(index=index_name, document=document)

    print("Indexing completed.")

# Example usage (make sure to define your `documents` list)
documents = chunking()
elasticsearch_export(documents)

### Mage code

In [None]:
# Final code
from typing import Dict, List, Tuple, Union

import numpy as np
from elasticsearch import Elasticsearch
from datetime import datetime
from mage_ai.data_preparation.variable_manager import set_global_variable

if 'data_exporter' not in globals():
    from mage_ai.data_preparation.decorators import data_exporter

@data_exporter
def elasticsearch(
    documents: List[Dict[str, Union[Dict, List[int], np.ndarray, str]]], *args, **kwargs,
):
    """
    Exports document data to an Elasticsearch database.
    """

    connection_string = kwargs.get('connection_string', 'http://localhost:9200/')

    # Adjusting index name
    index_name_prefix = kwargs.get('index_name', 'documents')
    current_time = datetime.now().strftime("%Y%m%d_%M%S")
    index_name = f"{index_name_prefix}_{current_time}"
    print("index name:", index_name)

    # Setting global variable
    set_global_variable('resplendent_radiance', 'index_name', index_name)

    number_of_shards = kwargs.get('number_of_shards', 1)
    number_of_replicas = kwargs.get('number_of_replicas', 0)
    vector_column_name = kwargs.get('vector_column_name', 'embedding')

    dimensions = kwargs.get('dimensions')
    if dimensions is None and len(documents) > 0:
        document = documents[0]
        dimensions = len(document.get(vector_column_name) or [])

    es_client = Elasticsearch(connection_string)

    print(f'Connecting to Elasticsearch at {connection_string}')

    index_settings = {
    "settings": {
        "number_of_shards": number_of_shards,
        "number_of_replicas": number_of_replicas
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "document_id": {"type": "keyword"}
        }
    }
}

    if not es_client.indices.exists(index=index_name):
        es_client.indices.create(index=index_name)
        print('Index created with properties:', index_settings)
        print('Embedding dimensions:', dimensions)

    print(f'Indexing {len(documents)} documents to Elasticsearch index {index_name}')
    for document in documents:
        print(f'Indexing document {document["document_id"]}')

        es_client.index(index=index_name, document=document)

    print(document)

Output:

In [None]:




index name: documents_20240820_2506
index name: documents_20240820_2506
Connecting to Elasticsearch at http://elasticsearch:9200
Connecting to Elasticsearch at http://elasticsearch:9200
Index created with properties: {'settings': {'number_of_shards': 1, 'number_of_replicas': 0}, 'mappings': {'properties': {'text': {'type': 'text'}, 'section': {'type': 'text'}, 'question': {'type': 'text'}, 'course': {'type': 'keyword'}, 'document_id': {'type': 'keyword'}}}}
Embedding dimensions: 0
Indexing 86 documents to Elasticsearch index documents_20240820_2506
Indexing document 97872393
Index created with properties: {'settings': {'number_of_shards': 1, 'number_of_replicas': 0}, 'mappings': {'properties': {'text': {'type': 'text'}, 'section': {'type': 'text'}, 'question': {'type': 'text'}, 'course': {'type': 'keyword'}, 'document_id': {'type': 'keyword'}}}}
Embedding dimensions: 0
Indexing 86 documents to Elasticsearch index documents_20240820_2506
Indexing document 97872393
Indexing document a57f9581
Indexing document fb81c6ff
Indexing document bf024675
Indexing document e0d2caf7
Indexing document a57f9581
Indexing document fb81c6ff
Indexing document bf024675
Indexing document e0d2caf7
Indexing document 7bd989aa
Indexing document 1c96a1fb
Indexing document 01cb301e
Indexing document ca68d283
Indexing document 6fc3236a
Indexing document 6d61aae2
Indexing document cbe66cfe
Indexing document 7bd989aa
Indexing document 1c96a1fb
Indexing document 01cb301e
Indexing document ca68d283
Indexing document 6fc3236a
Indexing document 6d61aae2
Indexing document cbe66cfe
Indexing document a5301a1f
Indexing document 9816f1ae
Indexing document 98c1bc60
Indexing document befeedef
Indexing document baea0a66
Indexing document a976d6e7
Indexing document a5301a1f
Indexing document 9816f1ae
Indexing document 98c1bc60
Indexing document befeedef
Indexing document baea0a66
Indexing document a976d6e7
Indexing document 18a32cec
Indexing document 764f2789
Indexing document baae926f
Indexing document 190fc999
Indexing document d8c4c7bb
Indexing document 1a9b8b53
Indexing document a310259a
Indexing document 5a995cf3
Indexing document 18a32cec
Indexing document 764f2789
Indexing document baae926f
Indexing document 190fc999
Indexing document d8c4c7bb
Indexing document 1a9b8b53
Indexing document a310259a
Indexing document 5a995cf3
Indexing document 67da0fb5
Indexing document a1ec19a2
Indexing document d710981f
Indexing document 19811ec0
Indexing document aa8f7017
Indexing document 48312d67
Indexing document 1c862647
Indexing document 67da0fb5
Indexing document a1ec19a2
Indexing document d710981f
Indexing document 19811ec0
Indexing document aa8f7017
Indexing document 48312d67
Indexing document 1c862647
Indexing document fd874951
Indexing document 0536ca0b
Indexing document 8ac0422d
Indexing document 6ef32048
Indexing document 3ffb9e62
Indexing document 8efc052a
Indexing document 7b87b859
Indexing document fd874951
Indexing document 0536ca0b
Indexing document 8ac0422d
Indexing document 6ef32048
Indexing document 3ffb9e62
Indexing document 8efc052a
Indexing document 7b87b859
Indexing document 5734b048
Indexing document 1804f538
Indexing document f8f7469d
Indexing document 4b95ba51
Indexing document 12f1a26a
Indexing document aace1f4a
Indexing document db816465
Indexing document eb00a0c9
Indexing document 5734b048
Indexing document 1804f538
Indexing document f8f7469d
Indexing document 4b95ba51
Indexing document 12f1a26a
Indexing document aace1f4a
Indexing document db816465
Indexing document eb00a0c9
Indexing document 1eb85e18
Indexing document 54dd72ba
Indexing document 464b4d9c
Indexing document 2806a1c1
Indexing document 9068bbd5
Indexing document ee355823
Indexing document 0a101a81
Indexing document 84ef78df
Indexing document 1eb85e18
Indexing document 54dd72ba
Indexing document 464b4d9c
Indexing document 2806a1c1
Indexing document 9068bbd5
Indexing document ee355823
Indexing document 0a101a81
Indexing document 84ef78df
Indexing document a1419bf6
Indexing document 5f8fd79d
Indexing document 0deabb27
Indexing document a2dca2e2
Indexing document a262c532
Indexing document 8912e711
Indexing document 005ecede
Indexing document fe48ad62
Indexing document a1419bf6
Indexing document 5f8fd79d
Indexing document 0deabb27
Indexing document a2dca2e2
Indexing document a262c532
Indexing document 8912e711
Indexing document 005ecede
Indexing document fe48ad62
Indexing document c13c26c8
Indexing document d8c4c7bb
Indexing document d8c4c7bb
Indexing document 258a03fe
Indexing document d8c4c7bb
Indexing document 794ed89c
Indexing document e9107390
Indexing document 43b399a8
Indexing document c13c26c8
Indexing document d8c4c7bb
Indexing document d8c4c7bb
Indexing document 258a03fe
Indexing document d8c4c7bb
Indexing document 794ed89c
Indexing document e9107390
Indexing document 43b399a8
Indexing document 534f8148
Indexing document 79f67e08
Indexing document d8c4c7bb
Indexing document 1fc5e366
Indexing document 6cf805ca
Indexing document e18124d4
Indexing document a705279d
Indexing document f5f83001
Indexing document 534f8148
Indexing document 79f67e08
Indexing document d8c4c7bb
Indexing document 1fc5e366
Indexing document 6cf805ca
Indexing document e18124d4
Indexing document a705279d
Indexing document f5f83001
Indexing document db752798
Indexing document e2433e15
Indexing document 99ab2f5d
Indexing document f250bb18
Indexing document d8c4c7bb
Indexing document fa136280
{'text': 'Yes, you need to pass the Capstone project to get the certificate. Homework is not mandatory, though it is recommended for reinforcing concepts, and the points awarded count towards your rank on the leaderboard.', 'section': 'General course-related questions', 'question': 'I missed the first homework - can I still get a certificate?', 'course': 'llm-zoomcamp', 'document_id': 'fa136280'}