In [9]:
from llama_index.core import SimpleDirectoryReader, StorageContext, load_index_from_storage
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex

In [2]:
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
reader = SimpleDirectoryReader(
    input_dir="../contents/files/",
)

docs = reader.load_data()
print(f"Count of Techcrunch articles: {len(docs)}")
print(docs[0])

node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=100)
nodes = node_parser.get_nodes_from_documents(docs)
index = VectorStoreIndex(nodes, embed_model=embed_model, show_progress=True)

Count of Techcrunch articles: 129
Doc ID: c95c9023-9715-4363-a6b7-92b825ed53b2
Text: Published as a conference paper at ICLR 2021 AN IMAGE IS WORTH
16X16 W ORDS : TRANSFORMERS FOR IMAGE RECOGNITION AT SCALE Alexey
Dosovitskiy∗,†, Lucas Beyer∗, Alexander Kolesnikov∗, Dirk
Weissenborn∗, Xiaohua Zhai∗, Thomas Unterthiner, Mostafa Dehghani,
Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil
Houlsby∗,† ∗equal tech...


  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 450/450 [00:07<00:00, 59.15it/s]


In [3]:
retriever = index.as_retriever(similarity_top_k=3)
query = "Tell me moer about ViT"

for node in retriever.retrieve(query):
    print(node)
    print(node.get_content())
    print("===")

Node ID: 98bfea55-fc22-428e-baf2-69bdfe8759f5
Text: T2T-ViT Backbone As many channels in the backbone of vanilla ViT
are in- valid (Fig. 2), we plan to ﬁnd an efﬁcient backbone for our
T2T-ViT to reduce the redundancy and improve the feature richness.
Thus we explore different architecture designs for ViT and borrow some
designs from CNNs to improve the backbone efﬁciency and enhance the
richness...
Score:  0.407

T2T-ViT Backbone
As many channels in the backbone of vanilla ViT are in-
valid (Fig. 2), we plan to ﬁnd an efﬁcient backbone for our
T2T-ViT to reduce the redundancy and improve the feature
richness. Thus we explore different architecture designs for
ViT and borrow some designs from CNNs to improve the
backbone efﬁciency and enhance the richness of the learned
features. As each transformer layer has skip connection as
ResNets, a straightforward idea is to apply dense connec-
tion as DenseNet [21] to increase the connectivity and fea-
ture richness, or apply Wide-ResNets or Re

In [8]:
index.storage_context.persist(
    persist_dir="../contents/indexes/",
)

In [10]:
storage_context = StorageContext.from_defaults(
    persist_dir="../contents/indexes/",
)
new_index = load_index_from_storage(storage_context)
retriever = new_index.as_retriever(similarity_top_k=3)
query = "Tell me more about ViT"
for node in retriever.retrieve(query):
    print(node)
    print(node.get_content())
    print("===")

Node ID: d446eea8-0987-4890-9779-321db6fed418
Text: 2. Audio Spectrogram Transformer 2.1. Model Architecture Figure
1 illustrates the proposed Audio Spectrogram Trans- former (AST)
architecture. First, the input audio waveform of t seconds is
converted into a sequence of 128-dimensional log Mel ﬁlterbank (fbank)
features computed with a 25ms Ham- ming window every 10ms. This
results in a128 ×100t...
Score:  0.041

2. Audio Spectrogram Transformer
2.1. Model Architecture
Figure 1 illustrates the proposed Audio Spectrogram Trans-
former (AST) architecture. First, the input audio waveform of t
seconds is converted into a sequence of 128-dimensional log
Mel ﬁlterbank (fbank) features computed with a 25ms Ham-
ming window every 10ms. This results in a128 ×100t spectro-
gram as input to the AST. We then split the spectrogram into a
sequence of N 16×16 patches with an overlap of 6 in both time
and frequency dimension, where N = 12⌈(100t −16)/10⌉is
the number of patches and the effective input