In [1]:
import nest_asyncio
# The API key is read from the local `playground_secret_key` module so that it is
# not hard-coded in the notebook itself.
from playground_secret_key import SECRET_KEY

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel runs its own loop and the
# sub-question query engine further down is created with use_async=True — confirm.
nest_asyncio.apply()

import os
import openai  # NOTE(review): not referenced in the visible cells — confirm before removing

# Export the key as OPENAI_API_KEY. The OpenAI(...) clients below are built without an
# explicit api_key, so presumably they pick the key up from this environment variable.
os.environ['OPENAI_API_KEY'] = SECRET_KEY

In [2]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

In [3]:
# Low-temperature chat model shared by the metadata extractors and the question generator.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)

In [4]:
from llama_index.core.extractors import(
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

from llama_index.extractors.entity import EntityExtractor # takes a long time
from llama_index.core.node_parser import TokenTextSplitter

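# NOTE: draft of a custom extractor, kept for reference — it does not work as written.
# Likely causes (to be confirmed against the BaseExtractor API documentation):
#   - dict.get() accepts no keyword arguments, so `.get('author',default='Unknown')`
#     raises TypeError; the fallback value has to be passed positionally.
#   - `aextract` is presumably expected to be an `async def` that returns one metadata
#     dict per node, with the synchronous `extract` derived from it.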
# class AuthorExtractor(BaseExtractor):
#     def extract(self, nodes):
#         metadata_list = [
#             {'author' : (
#                 node.metadata.get('author',default='Unknown')
#             +
#             '\n'
#             +
#             node.metadata['excerpt_keywords'])
#             }
#         for node in nodes
#         ]
#         return metadata_list
#
#     def aextract(self, nodes):
#         pass

In [5]:
# Token-based splitter: 512-token chunks with a 128-token overlap, splitting on spaces.
text_splitter = TokenTextSplitter(
    separator=" ",
    chunk_size=512,
    chunk_overlap=128,
)

# LLM-backed metadata extractors that are handed to the ingestion pipeline.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    # SummaryExtractor(llm=llm),  # not needed here, makes longer summary than slide
    KeywordExtractor(llm=llm),
]

# The splitter comes first, followed by the extractors in the order listed above.
transformations = [text_splitter, *extractors]

In [6]:
from llama_index.core import SimpleDirectoryReader

In [7]:
# Load the annotated "01 basics" handout PDF through llama-index's directory reader.
handout_reader = SimpleDirectoryReader(
    input_files=[
        '../data/Annotated Handouts-20240310/full_chapters_annotated/01_basics_annotated.pdf',
    ],
)
docs = handout_reader.load_data()

In [8]:
# Quick check of the container type returned by load_data().
type(docs)

list

In [9]:
# Raw extracted text of one slide; note that the PDF text extraction fuses many words
# together and splits others into single letters.
docs[20].text # slide 21 (zero-based index 20)

'OverﬁttingIssue with evaluation on training dataIf we allow ourselves to build verycomplex models,w ec a na l w a y sb eas accurate as we like on the training setThe only measure of whether an algorithm will perform well on newdata is theevaluation on the test setOverﬁttingWe expect simple models to generalize better to new data. Therefore,we always want to ﬁnd the simplest model.Building a model that is too complex for the amount of informationwe have, is calledoverﬁttingMichela Papandrea (SUPSI)Introduction to Supervised Learning21 / 25'

In [10]:
# Separate the first two documents (presumably the front matter, going by the variable
# name) from the remaining slides.
docs_front_page, docs_content = docs[:2], docs[2:]
# Re-joining the two slices gives the same documents in the same order, so this rebinding
# is effectively a no-op. NOTE(review): the original author was unsure why it is here.
docs = [*docs_front_page, *docs_content]

In [17]:
# Inspect the first Document: page_label '1' plus the file metadata attached by the reader.
docs[0]

Document(id_='0f37775d-1265-4ae1-8072-2f513f492fb4', embedding=None, metadata={'page_label': '1', 'file_name': '01_basics_annotated.pdf', 'file_path': '..\\data\\Annotated Handouts-20240310\\full_chapters_annotated\\01_basics_annotated.pdf', 'file_type': 'application/pdf', 'file_size': 1398475, 'creation_date': '2023-11-02', 'last_modified_date': '2024-03-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Introduction to Supervised LearningMichela Papandreamichela.papandrea@supsi.chSupervised LearningBachelor of Data Science and Artiﬁcial IntelligenceUniversity of Applied Sciences and Arts of Southern Switzerland\nMichela Papandrea (SUPSI)Introduction to Supervised Learning1/2 5 ', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\

In [11]:
from llama_index.core.ingestion import IngestionPipeline

# Metadata pipeline: runs the splitter and the LLM extractors over the loaded documents.
ing_pipeline = IngestionPipeline(
    transformations=transformations,
)

# Resulting nodes carry the extractor outputs in their metadata.
docs_nodes = ing_pipeline.run(documents=docs)

100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
100%|██████████| 1/1 [00:00<00:00,  1.63it/s]
100%|██████████| 1/1 [00:00<00:00,  1.94it/s]
100%|██████████| 1/1 [00:00<00:00,  1.36it/s]
100%|██████████| 1/1 [00:00<00:00,  1.96it/s]
100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 1/1 [00:00<00:00,  2.66it/s]
100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
100%|██████████| 1/1 [00:00<00:00,  2.08it/s]
100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
100%|██████████| 1/1 [00:00<00:00,  1.40it/s]
100%|██████████| 1/1 [00:00<00:00,  1.64it/s]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]
100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
100%|██████████| 1/1 [00:00<00:00,

In [12]:
# Metadata of the third node: besides the file fields it carries the extractor outputs
# ('document_title', 'questions_this_excerpt_can_answer', 'excerpt_keywords').
docs_nodes[2].metadata

{'page_label': '3',
 'file_name': '01_basics_annotated.pdf',
 'file_path': '..\\data\\Annotated Handouts-20240310\\full_chapters_annotated\\01_basics_annotated.pdf',
 'file_type': 'application/pdf',
 'file_size': 1398475,
 'creation_date': '2023-11-02',
 'last_modified_date': '2024-03-10',
 'document_title': 'The Power and Ubiquity of Machine Learning in Modern Technology: A Comprehensive Overview',
 'questions_this_excerpt_can_answer': '1. How is machine learning defined and what disciplines does it intersect with?\n2. What are some examples of ubiquitous machine learning applications in modern technology?\n3. Who is Michela Papandrea and what topic is she introducing in relation to supervised learning?',
 'excerpt_keywords': 'Machine Learning, Statistics, Artificial Intelligence, Predictive Analytics, Supervised Learning'}

In [None]:
# NOTE(review): stray id literal in a never-executed cell; it differs from the
# docs_nodes[0].id_ value shown in the next cell — presumably left over from an earlier
# run. Evaluating this cell only echoes the string.
'8bfaa646-4b3a-4fb7-bdc8-e69c55aae402'

In [45]:
# Show the id assigned to the first node.
docs_nodes[0].id_

'e1dc1ce2-924a-4fbb-807d-ad8ad2e6159f'

In [85]:
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

# Extra instruction placed in front of the default sub-question prompt so that every
# generated sub-question starts with the "quote the sources first" prefix.
sub_question_instruction = """
        Follow the example, but instead of giving a question, always prefix the question
        with: 'By first identifying and quoting the most relevant sources, '.
        """

question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str=sub_question_instruction + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

In [86]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool,ToolMetadata

In [87]:
# Build a vector index over the metadata-enriched nodes.
index = VectorStoreIndex(nodes=docs_nodes)

# Query engine that retrieves the 10 most similar nodes and answers with GPT-4.
answer_llm = OpenAI(model="gpt-4")
engine = index.as_query_engine(similarity_top_k=10, llm=answer_llm)

In [88]:
# Wrap the vector query engine as the single tool the sub-question engine may call.
ml_documents_tool = QueryEngineTool(
    query_engine=engine,
    metadata=ToolMetadata(
        name='ml_documents',
        description='introduction to machine learning',
    ),
)

# Sub-question engine: splits a query with `question_gen` and answers each part via the tool.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[ml_documents_tool],
    question_gen=question_gen,
    use_async=True,
)

In [89]:
# Query text is kept exactly as written, including its surrounding whitespace.
question = """
    What classification and regression examples were used? Give your answer as a JSON.
    """
response = final_engine.query(question)
print(response.response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[ml_documents] Q: By first identifying and quoting the most relevant sources, What classification examples were used in the 'introduction to machine learning' document
[0m[1;3;38;2;90;149;237m[ml_documents] Q: By first identifying and quoting the most relevant sources, What regression examples were used in the 'introduction to machine learning' document
[0m[1;3;38;2;90;149;237m[ml_documents] A: The document provides two examples of regression. The first example is about predicting a person's annual income based on their education, age, and where they live. The second example is related to the basic linear regression equation, where the outcome is predicted based on the features of the data.
[0m[1;3;38;2;237;90;200m[ml_documents] A: The document provides several examples of classification tasks. One example is identifying the zip code from handwritten digits on an envelope, where the input is a scan of the handwriting and the output