# Requisito: tener Ollama instalado en local y el modelo llama2 descargado (`ollama pull llama2`)

In [1]:
%%capture
# !pip install langchain pypdf openai chromadb tiktoken

In [3]:
# !pip install langchain-community

Collecting langchain-community
  Using cached langchain_community-0.2.1-py3-none-any.whl (2.1 MB)
Collecting dataclasses-json<0.7,>=0.5.7
  Using cached dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting typing-inspect<1,>=0.4.0
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting marshmallow<4.0.0,>=3.18.0
  Using cached marshmallow-3.21.2-py3-none-any.whl (49 kB)
Collecting mypy-extensions>=0.3.0
  Using cached mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, marshmallow, typing-inspect, dataclasses-json, langchain-community
Successfully installed dataclasses-json-0.6.6 langchain-community-0.2.1 marshmallow-3.21.2 mypy-extensions-1.0.0 typing-inspect-0.9.0


In [4]:
import requests
from langchain.document_loaders import PyPDFLoader

urls = [
    'https://arxiv.org/pdf/2306.06031v1.pdf',
    'https://arxiv.org/pdf/2306.12156v1.pdf',
    'https://arxiv.org/pdf/2306.14289v1.pdf',
    'https://arxiv.org/pdf/2305.10973v1.pdf',
    'https://arxiv.org/pdf/2306.13643v1.pdf'
]

# Accumulates the Document objects returned by the loader for every downloaded paper.
ml_papers = []

for i, url in enumerate(urls):
    # Fail fast on a hung connection or an HTTP error instead of silently
    # saving an error page under a .pdf name.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    filename = f'paper{i+1}.pdf'
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f'Descargado {filename}')

    # Parse only after the `with` block has closed (and therefore flushed) the
    # file; reading it while still open for writing can see a truncated PDF.
    loader = PyPDFLoader(filename)
    data = loader.load()
    ml_papers.extend(data)

# ml_papers now holds the loaded documents of all downloaded papers.
print('Contenido de ml_papers:')
print()

Descargado paper1.pdf
Descargado paper2.pdf
Descargado paper3.pdf
Descargado paper4.pdf
Descargado paper5.pdf
Contenido de ml_papers:



In [5]:
# Sanity check: container type, number of loaded documents, and the entry at index 3.
type(ml_papers), len(ml_papers), ml_papers[3]

(list,
 57,
 Document(page_content='Figure 1: FinGPT Framework.\n4.1 Data Sources\nThe first stage of the FinGPT pipeline involves the collec-\ntion of extensive financial data from a wide array of online\nsources. These include, but are not limited to:\n•Financial news: Websites such as Reuters, CNBC, Yahoo\nFinance, among others, are rich sources of financial news\nand market updates. These sites provide valuable informa-\ntion on market trends, company earnings, macroeconomic\nindicators, and other financial events.\n•Social media : Platforms such as Twitter, Facebook, Red-\ndit, Weibo, and others, offer a wealth of information in\nterms of public sentiment, trending topics, and immediate\nreactions to financial news and events.\n•Filings : Websites of financial regulatory authorities, such\nas the SEC in the United States, offer access to company\nfilings. These filings include annual reports, quarterly earn-\nings, insider trading reports, and other important company-\nspecific in

# Split de documentos

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration. Chunking can be explored visually at
# https://chunkviz.up.railway.app/
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,  # chunk length is measured with len()
    chunk_overlap=200,    # each chunk starts with 200 characters of the previous one, for continuity
    chunk_size=1500,      # size of the text in each chunk
)

# Split every loaded document into chunks.
documents = text_splitter.split_documents(ml_papers)

In [7]:
# Number of chunks produced by the splitter, plus the chunk at index 10 as a sample.
len(documents), documents[10]

(211,
 Document(page_content='highly volatile, changing rapidly in response to news events\nor market movements.\nTrends , often observable through websites like Seeking\nAlpha, Google Trends, and other finance-oriented blogs and\nforums, offer critical insights into market movements and in-\nvestment strategies. They feature:\n•Analyst perspectives: These platforms provide access to\nmarket predictions and investment advice from seasoned\nfinancial analysts and experts.\n•Market sentiment: The discourse on these platforms can\nreflect the collective sentiment about specific securities,\nsectors, or the overall market, providing valuable insights\ninto the prevailing market mood.\n•Broad coverage: Trends data spans diverse securities and\nmarket segments, offering comprehensive market coverage.\nEach of these data sources provides unique insights into\nthe financial world. By integrating these diverse data types,\nfinancial language models like FinGPT can facilitate a com-\nprehensive 

# Embeddings e ingesta de datos vectorial

In [10]:
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma

# Chat model handle for the llama2 model.
ollama = ChatOllama(model="llama2")

# Embedding function, also using the llama2 model.
embeddings = OllamaEmbeddings(model="llama2")

# Build the Chroma vector store from the document chunks and the embedding function.
vectorstore = Chroma.from_documents(documents=documents, embedding=embeddings)

# The retriever returns the key fragments needed to answer a question;
# k=3 limits the search to the 3 most similar fragments.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [11]:
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

# Chat model used to generate the answers. The keyword is `model`, matching the
# other ChatOllama / OllamaEmbeddings calls in this notebook; the previous
# `model_name` spelling did not match them.
# NOTE(review): `model_name` appears to have been ignored, with the library
# default selecting llama2 — confirm against ChatOllama's declared fields.
chat = ChatOllama(
    model='llama2'
)

# Question-answering chain: fetches fragments through `retriever` and passes
# them to `chat` to produce the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="stuff",  # use whatever retrieved text fits in the prompt
    retriever=retriever
)

In [14]:
query = "qué es fingpt?"
# `.run` is deprecated in favour of `.invoke`; `invoke` returns a dict, so the
# answer text is read from its "result" key to keep the cell output a plain string.
qa_chain.invoke({"query": query})["result"]

'FInGPT is an open-source large language model for the financial sector. It is designed to provide researchers and practitioners with accessible and transparent resources to develop their own financial language models (FinLLMs). FinGPT takes a data-centric approach, unlike proprietary models that rely on privileged access to high-quality financial data.\n\nThe paper presents an overview of FinGPT and its importance in democratizing financial language models. The authors highlight the need for an automatic data curation pipeline and a lightweight low-rank adaptation technique in building FinGPT. They also showcase several potential applications of FinGPT, such as robo-advising, algorithmic trading, and low-code development.\n\nFinGPT is part of the open-source AI4Finance community, which aims to stimulate innovation, democratize financial language models, and unlock new opportunities in open finance. Two associated code repos are available on GitHub.'

In [15]:
query = "qué hace complicado entrenar un modelo como el fingpt?"
# `.run` is deprecated in favour of `.invoke`; the answer text is under the "result" key.
qa_chain.invoke({"query": query})["result"]

'Based on the context provided, it seems that finetuning a model like FiGPT (a lightweight image encoder) can be challenging for several reasons:\n\n1. Data quality and availability: The performance of the model depends on the quality and quantity of the training data. However, high-quality financial data can be difficult to obtain, especially for non-mainstream assets like cryptocurrencies.\n2. Customization: FiGPT champions open-source values, which means that users need to adapt the model to their specific needs. This can be time-consuming and require significant expertise in machine learning.\n3. Cost: The cost of training a model like FiGPT can be high, typically between $100 to $300. This can be a barrier for individuals or organizations with limited budgets.\n4. Computational resources: Training a large language model like FiGPT requires significant computational resources, which can be challenging for mobile devices or low-power hardware.\n5. Mask decoder: The mask decoder in F

In [16]:
query = "qué es fast segment?"
# `.run` is deprecated in favour of `.invoke`; the answer text is under the "result" key.
qa_chain.invoke({"query": query})["result"]

'Based on the provided context, "Fast Segment" seems to refer to a task or technique related to image segmentation. However, I cannot provide a definitive answer without more information or context. Could you please provide more details or clarify what you mean by "Fast Segment"?'

In [17]:
query = "cuál es la diferencia entre fast sam y mobile sam?"
# `.run` is deprecated in favour of `.invoke`; the answer text is under the "result" key.
qa_chain.invoke({"query": query})["result"]

'Based on the provided context, I can answer your question as follows:\n\nFastSAM and MobileSAM are both variants of the Segment Anything Model (SAM), a vision foundation model that can be fine-tuned for various tasks. The main difference between FastSAM and MobileSAM is their design and purpose:\n\n1. FastSAM:\nFastSAM uses TensorRT for inference, which allows for faster performance compared to traditional SAM. This is achieved by leveraging the optimized hardware of TensorRT, such as GPUs or TPUs, to accelerate the inference process. FastSAM is designed primarily for high-performance computing tasks that require fast inference times, such as real-time object detection or autonomous driving applications.\n2. MobileSAM:\nMobileSAM is designed for mobile and edge devices, where computational resources are limited. It uses a different architecture and training procedure than FastSAM to achieve better performance on these devices. MobileSAM trades off some of the accuracy of FastSAM in fa