In [1]:
# Notebook setup: allow nested asyncio event loops and expose the OpenAI API key
# through the environment before any LLM client is created.
import nest_asyncio
from playground_secret_key import SECRET_KEY  # presumably a local, untracked module holding the key -- keep it out of version control

# Patch asyncio so an event loop can be re-entered; the sub-question engine
# later in this notebook is built with use_async=True.
nest_asyncio.apply()

import os
import openai  # NOTE(review): not referenced in the visible cells -- confirm it is still needed

# Publish the key under the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = SECRET_KEY

In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import MetadataMode

In [4]:
# Shared LLM handed to the metadata extractors and the question generator:
# low temperature, completions capped at 512 tokens.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
    max_tokens=512,
)

In [71]:
from llama_index.core.extractors import(
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

from llama_index.extractors.entity import EntityExtractor # takes a long time
from llama_index.core.node_parser import TokenTextSplitter

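# Abandoned attempt at a custom extractor; kept for reference, it does not run as written.
# Known problems:
#   - dict.get() takes its fallback positionally: get('author', 'Unknown'), not default='Unknown'.
#   - node.metadata['excerpt_keywords'] raises KeyError unless KeywordExtractor already ran on the node.
#   - BaseExtractor expects `aextract` to be an `async def` returning one metadata dict per node
#     (the synchronous `extract` is derived from it), so `pass` is not a valid implementation.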
# class AuthorExtractor(BaseExtractor):
#     def extract(self, nodes):
#         metadata_list = [
#             {'author' : (
#                 node.metadata.get('author',default='Unknown')
#             +
#             '\n'
#             +
#             node.metadata['excerpt_keywords'])
#             }
#         for node in nodes
#         ]
#         return metadata_list
#
#     def aextract(self, nodes):
#         pass

In [82]:
# Split text on spaces into chunks of 512 tokens with a 128-token overlap.
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)

# LLM-backed metadata extractors.
# SummaryExtractor(llm=llm) is deliberately left out: not needed here, it
# produces a summary longer than the slide itself.
extractors = [
    TitleExtractor(llm=llm, nodes=5),
    QuestionsAnsweredExtractor(llm=llm, questions=3),
    KeywordExtractor(llm=llm),
]

# Transformation chain: the splitter first, followed by the extractors.
transformations = [text_splitter, *extractors]

In [73]:
from llama_index.core import SimpleDirectoryReader

In [74]:
# Read the lecture PDF into a list of documents.
docs = SimpleDirectoryReader(
    input_files=['../data/01_introduction_to_SL-4.pdf'],
).load_data()

In [75]:
# Spot-check the text extracted for a single entry of `docs`.
docs[20].text # index 20 (0-based) holds slide 21

'Overﬁtting\nIssue with evaluation on training data\nIf we allow ourselves to build very complex models , we can always be\nas accurate as we like on the training set\nThe only measure of whether an algorithm will perform well on new\ndata is the evaluation on the test set\nOverﬁtting\nWe expect simple models to generalize better to new data. Therefore,\nwe always want to ﬁnd the simplest model.\nBuilding a model that is too complex for the amount of information\nwe have, is called overﬁtting\nMichela Papandrea (SUPSI) Introduction to Supervised Learning 21 / 25'

In [79]:
# Separate the first two entries (front page) from the remaining content.
docs_front_page, docs_content = docs[:2], docs[2:]
# Joining the two slices back in their original order rebuilds the same
# sequence, so `docs` keeps its contents (as a new list object).
docs = [*docs_front_page, *docs_content]

In [83]:
from llama_index.core.ingestion import IngestionPipeline

# Metadata pipeline: apply the splitter + extractor chain to every document
# and keep the resulting nodes.
ing_pipeline = IngestionPipeline(
    transformations=transformations,
)
docs_nodes = ing_pipeline.run(documents=docs)

100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
100%|██████████| 1/1 [00:00<00:00,  1.69it/s]
100%|██████████| 1/1 [00:00<00:00,  2.02it/s]
100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
100%|██████████| 1/1 [00:00<00:00,  1.99it/s]
100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
100%|██████████| 1/1 [00:00<00:00,  1.38it/s]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
100%|██████████| 1/1 [00:00<00:00,  1.59it/s]
100%|██████████| 1/1 [00:00<00:00,  1.51it/s]
100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
100%|██████████| 1/1 [00:00<00:00,  1.30it/s]
100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
100%|██████████| 1/1 [00:01<00:00,

In [84]:
# Inspect the metadata attached to the second-to-last node produced by the pipeline.
docs_nodes[-2].metadata

{'page_label': '24',
 'file_name': '01_introduction_to_SL-4.pdf',
 'file_path': '..\\data\\01_introduction_to_SL-4.pdf',
 'file_type': 'application/pdf',
 'file_size': 790104,
 'creation_date': '2024-03-07',
 'last_modified_date': '2024-03-07',
 'document_title': '"Balancing Model Complexity and Dataset Size in Supervised Learning: A Comprehensive Analysis"',
 'questions_this_excerpt_can_answer': '1. How does the variation of inputs in a training dataset impact the complexity of the model that can be used in supervised learning without overfitting?\n2. What role does dataset size play in determining the complexity of models that can be built in supervised learning?\n3. Why is it important to have a diverse range of data points in a dataset when building complex models in supervised learning, as opposed to simply duplicating or collecting very similar data points?',
 'excerpt_keywords': 'Supervised Learning, Model Complexity, Dataset Size, Overfitting, Training Dataset'}

In [85]:
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL

# Question generator whose prompt is the default sub-question template with an
# extra instruction prepended: every generated sub-question must start with
# "By first identifying and quoting the most relevant sources, ".
question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question
        with: 'By first identifying and quoting the most relevant sources, '.
        """ + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

In [86]:
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool,ToolMetadata

In [87]:
# Build a vector store index over the metadata-enriched nodes.
index = VectorStoreIndex(nodes=docs_nodes)

# Query engine on top of that index: top-10 similarity retrieval, answered by GPT-4.
engine = index.as_query_engine(
    llm=OpenAI(model="gpt-4"),
    similarity_top_k=10,
)

In [88]:
# Expose the vector query engine as a single named tool and pass it to the
# sub-question engine together with the customised question generator.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name='ml_documents',
                description='introduction to machine learning',
            ),
        ),
    ],
    question_gen=question_gen,
    use_async=True,
)

In [89]:
# Send one compound question through the sub-question engine and print the
# final response text. The query string keeps the original leading/trailing
# newline and indentation, written here with explicit escapes.
response = final_engine.query(
    "\n    What classification and regression examples were used? Give your answer as a JSON.\n    "
)
print(response.response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[ml_documents] Q: By first identifying and quoting the most relevant sources, What classification examples were used in the 'introduction to machine learning' document
[0m[1;3;38;2;90;149;237m[ml_documents] Q: By first identifying and quoting the most relevant sources, What regression examples were used in the 'introduction to machine learning' document
[0m[1;3;38;2;90;149;237m[ml_documents] A: The document provides two examples of regression. The first example is about predicting a person's annual income based on their education, age, and where they live. The second example is related to the basic linear regression equation, where the outcome is predicted based on the features of the data.
[0m[1;3;38;2;237;90;200m[ml_documents] A: The document provides several examples of classification tasks. One example is identifying the zip code from handwritten digits on an envelope, where the input is a scan of the handwriting and the output