# SEC Filings with LlamaIndex

### Setting Up

In [None]:
# The version specifier must be quoted: unquoted, the shell treats `<0.6.0`
# as input redirection from a file named "0.6.0" and the pin never reaches pip.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install "llama-index<0.6.0"

* Downloading the files

In [None]:
# -p: do not fail when data/ already exists, so the cell can be re-run.
!mkdir -p data
# The URL is quoted so the shell does not treat `?` as a glob character.
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
# -o: overwrite previously extracted files instead of prompting on a re-run.
!unzip -o data/UBER.zip -d data

* Set the OpenAI API key, then load the filings

In [None]:
import os
from getpass import getpass

# Keep a key that is already exported in the environment; otherwise prompt for
# one without echoing it, so the secret is never written into the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("OpenAI API key: ")

In [None]:
from llama_index import download_loader, GPTSimpleVectorIndex
from pathlib import Path

In [None]:
# Obtain the UnstructuredReader loader class via download_loader.
# refresh_cache=True presumably forces a fresh fetch instead of reusing a
# cached copy — confirm against the llama-index docs.
UnstructuredReader = download_loader(
    "UnstructuredReader",
    refresh_cache=True,
)

In [None]:
# Load one HTML filing per year and keep the documents both per year and combined.
loader = UnstructuredReader()
years = [2022, 2021, 2020, 2019]
doc_set = {}   # year -> documents loaded from that year's filing
all_docs = []  # every loaded document, in the order of `years`
for year in years:
    filing_path = Path(f'./data/UBER/UBER_{year}.html')
    year_docs = loader.load_data(file=filing_path, split_documents=False)
    # Tag each document with its filing year (this replaces any existing extra_info).
    for d in year_docs:
        d.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs += year_docs

In [None]:
from llama_index import ServiceContext

# One ServiceContext, passed to every index that is built or loaded below.
service_context = ServiceContext.from_defaults(
    chunk_size_limit=512,  # presumably a per-chunk size cap — confirm units in the llama-index docs
)

### Index Various Filings

* Set up a separate vector index for each SEC filing from 2019-2022.

In [None]:
# Build (or reuse) one simple vector index per filing year.
# Building an index is the expensive step, so a year whose index_{year}.json
# already exists is loaded from disk instead of being rebuilt. This makes the
# cell safe to re-run. Delete the JSON file to force a rebuild (for example
# after changing service_context or the source documents).
index_set = {}
for year in years:
    index_path = Path(f'index_{year}.json')
    if index_path.exists():
        cur_index = GPTSimpleVectorIndex.load_from_disk(str(index_path), service_context=service_context)
    else:
        cur_index = GPTSimpleVectorIndex.from_documents(doc_set[year], service_context=service_context)
        cur_index.save_to_disk(str(index_path))
    index_set[year] = cur_index

* Then load from disk

In [None]:
# Rebuild the year -> index mapping from the JSON files saved on disk.
index_set = {}
for year in years:
    saved_index_file = f'index_{year}.json'
    cur_index = GPTSimpleVectorIndex.load_from_disk(
        saved_index_file,
        service_context=service_context,
    )
    index_set[year] = cur_index

* Then ask the question

In [None]:
# Query the index built from the 2020 filing (similarity_top_k=3).
response = index_set[2020].query("What were some of the biggest risk factors in 2020?", similarity_top_k=3)
# Show the answer; without this the cell produces no visible output.
print(response)

### Resources

[10k Analysis](https://colab.research.google.com/drive/1uL1TdMbR4kqa0Ksrd_Of_jWSxWt1ia7o?usp=sharing#scrollTo=VzpGEMJsauHh)

* See additional features in the Colab notebook linked above