In [7]:
from langchain_community.vectorstores import FAISS
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.storage import LocalFileStore

store = LocalFileStore("./cache/")


In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
#embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
#embed_model_id = 'voyage-lite-02-instruct' #not available through HF
#embed_model_id = 'WhereIsAI/UAE-Large-V1'
embed_model_id = 'mixedbread-ai/mxbai-embed-large-v1'
core_embeddings_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs={"trust_remote_code":True}
    # model_name=embed_model_id
)
embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model, store, namespace=embed_model_id

)

In [10]:
import os
existing_files = set(os.listdir('../abstracts'))

In [11]:
#existing_files = list(existing_files)

In [12]:
def get_title_abstract_abbrev(directory = '../abstracts/',filename=None):
    path = os.path.join(directory, filename)
    with open(path, 'r') as file:
        lines = file.readlines()
        title = lines[0].strip()
        abstract = "".join(lines[1:]).replace('\n', ' ')
    return title, abstract, filename.replace('.txt','')


In [13]:
get_title_abstract_abbrev(filename=next(iter(existing_files)))

('RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder  Language Models',
 'In this paper, we investigate the in-context learning ability of retrieval-augmented encoder-decoder language models. We first conduct a comprehensive analysis of existing models and identify their limitations in in-context learning, primarily due to a mismatch between pretraining and inference, as well as a restricted context length. To address these issues, we propose RAVEN, a model that combines retrieval-augmented masked language modeling and prefix language modeling. We further introduce Fusion-in-Context Learning to enhance the few-shot performance by enabling the model to leverage more in-context examples without requiring additional training. Through extensive experiments, we demonstrate that our simple yet effective design significantly improves performance, achieving results comparable to the most advanced language models in certain scenarios, despite having substantially fewer paramete

In [14]:
for file in existing_files:
    title, abstract,_ = get_title_abstract_abbrev(filename=file)
    print(title)
    print(abstract)
    print()

RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder  Language Models
In this paper, we investigate the in-context learning ability of retrieval-augmented encoder-decoder language models. We first conduct a comprehensive analysis of existing models and identify their limitations in in-context learning, primarily due to a mismatch between pretraining and inference, as well as a restricted context length. To address these issues, we propose RAVEN, a model that combines retrieval-augmented masked language modeling and prefix language modeling. We further introduce Fusion-in-Context Learning to enhance the few-shot performance by enabling the model to leverage more in-context examples without requiring additional training. Through extensive experiments, we demonstrate that our simple yet effective design significantly improves performance, achieving results comparable to the most advanced language models in certain scenarios, despite having substantially fewer parameters. Ou

In [15]:
import pandas as pd
new_articles=pd.read_csv('/home/mainuser/Desktop/LLMs/RagOverArXiv/scrape/data/articles_up_to_2024-04-16.csv')

In [16]:
new_articles.head()

Unnamed: 0,title,authors,abstract,arxiv_abbrev
0,Ferret-UI: Grounded Mobile UI Understanding wi...,"Authors:Keen You,Haotian Zhang,Eldon Schoop,Fl...",Recent advancements in multimodal large langua...,2404.05719
1,"ByteEdit: Boost, Comply and Accelerate Generat...","Authors:Yuxi Ren,Jie Wu,Yanzuo Lu,Huafeng Kuan...",Recent advancements in diffusion-based generat...,2404.0486
2,SpatialTracker: Tracking Any 2D Pixels in 3D S...,"Authors:Yuxi Xiao,Qianqian Wang,Shangzhan Zhan...",Recovering dense and long-range pixel motion i...,2404.04319
3,SwapAnything: Enabling Arbitrary Object Swappi...,"Authors:Jing Gu,Yilin Wang,Nanxuan Zhao,Wei Xi...",Effective editing of personal content holds a ...,2404.05717
4,BeyondScene: Higher-Resolution Human-Centric S...,"Authors:Gwanghyun Kim,Hayeon Kim,Hoigi Seo,Don...",Generating higher-resolution human-centric sce...,2404.04544


In [17]:
from langchain.schema.document import Document

In [18]:
docs = [get_title_abstract_abbrev(filename=file) for file in existing_files]
docs = [Document(page_content=doc[1],metadata={'title':doc[0], 'abbrev':doc[2]}) for doc in docs]


In [19]:
len(docs)

37

In [13]:
docs[0].__getattribute__('page_content')

'Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge remain open research problems. Pre-trained models with a differentiable access mechanism to explicit non-parametric memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) -- models which combine pre-trained parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a pre-trained seq2seq model and the non-parametric memory is a dense vector index of W

- TODO: using the merge functionality, write a function to load an existing vector store and add a new vector to it

In [38]:
# for i, doc in enumerate(docs):
#    #doc = doc[1]
#    #content = docs[index]
#    if i == 0:
#        vector_store = FAISS.from_documents([doc], embedder)
#    else:
#       vector_store_i = FAISS.from_documents([doc], embedder)
#       vector_store.merge_from(vector_store_i)

# vector_store

In [23]:
# Adding vs_index: Note if add new ones, to existing local vs, get len(vector_store.index_to_docstore_id)
# and start from there
for i, doc in enumerate(docs):
   doc.metadata['vs_index'] = i
   if i == 0:
       vector_store = FAISS.from_documents([doc], embedder)
   else:
      vector_store_i = FAISS.from_documents([doc], embedder)
      vector_store.merge_from(vector_store_i)

vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x7d0e5bc308f0>

In [24]:
vector_store.index_to_docstore_id

{0: '169eaef8-d090-4bcf-818f-9670563dea42',
 1: 'abc9e76c-1423-4a5f-9189-347d0ea26139',
 2: '9cf51bed-81b7-473c-b3d5-42bc513c14ae',
 3: '9ba95860-51ff-41ea-931a-ca0b57429f9d',
 4: 'db44c24a-5f7e-499f-a817-4d207f1c1403',
 5: '01fb2a1d-d0b6-4a8b-bc5b-ebf3f5114060',
 6: '614d2e74-8f32-43e7-9c24-3d7880e06adb',
 7: 'e3611fe1-00c6-47d5-8c0d-3a378138628d',
 8: '6ed86cdb-957b-4587-864b-473ba2afb8fd',
 9: 'b07c29c1-772b-43fc-b73d-b1e6fa6e8776',
 10: '42129278-1525-491a-aa89-55779c8b82f5',
 11: '89331833-e9c5-42f5-a91e-1d699dbbd00d',
 12: 'f5600839-ecf7-41b7-87d3-bb3f1154319e',
 13: 'd15a83eb-5cb0-4f67-aea8-13a08439310d',
 14: 'cf37b5e9-0ca4-4ee9-a8bb-397690dff461',
 15: 'df1a81c7-e971-4469-a0ce-5900b873e3a9',
 16: '2f8abffb-f2ee-47d6-841a-349ff8167f10',
 17: '88417c94-b003-40a9-aad7-fe2497433827',
 18: '263231c0-cd30-4d67-b672-899f22efdcfb',
 19: '3cc4098d-a837-4195-a5f5-bf55ba06043e',
 20: 'b2d8aa31-8e0d-4a21-b625-219472f3578f',
 21: '26a1b4ec-583a-4cf3-b8b6-06b28ce9a191',
 22: '62863b24-55c3-

In [17]:
index_id = 36
embedding_vector = vector_store.index.reconstruct_n(index_id, 1)[0]

In [18]:
embedding_vector.shape

(1024,)

In [19]:
docs[36]

Document(page_content='Retrieval-Augmented Language Modeling (RALM) methods, which condition a language model (LM) on relevant documents from a grounding corpus during generation, were shown to significantly improve language modeling performance. In addition, they can mitigate the problem of factually inaccurate text generation and provide natural source attribution mechanism. Existing RALM approaches focus on modifying the LM architecture in order to facilitate the incorporation of external information, significantly complicating deployment. This paper considers a simple alternative, which we dub In-Context RALM: leaving the LM architecture unchanged and prepending grounding documents to the input, without any further training of the LM. We show that In-Context RALM that builds on off-the-shelf general purpose retrievers provides surprisingly large LM gains across model sizes and diverse corpora. We also demonstrate that the document retrieval and ranking mechanism can be specialized 

In [20]:
#vector_store.asimilaritysearch('What is PPO?')
vector_store.asimilarity_search('What is PPO?')

<coroutine object FAISS.asimilarity_search at 0x7a632abe4280>

In [45]:
query = "The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straightforward update, we demonstrate the effectiveness of TR-DPO against DPO on the Anthropic HH and TLDR datasets. We show that TR-DPO outperforms DPO by up to 19%, measured by automatic evaluation with GPT-4. The new alignment approach that we propose allows us to improve the quality of models across several parameters at once, such as coherence, correctness, level of detail, helpfulness, and harmlessness."
#query = 'The quadratic complexity and weak length extrapolation of Transformers limits their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention), and further introduces multiple technical components to improve its capability and stability, including complex exponential moving average (CEMA), timestep normalization layer, normalized attention mechanism and pre-norm with two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than Transformer in the scale of 7 billion parameters and 2 trillion training tokens. Megalodon reaches a training loss of 1.70, landing mid-way between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon'
#query = ''
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)
  print(page.metadata)

While large-scale unsupervised language models (LMs) learn broad world knowledge and some reasoning skills, achieving precise control of their behavior is difficult due to the completely unsupervised nature of their training. Existing methods for gaining such steerability collect human labels of the relative quality of model generations and fine-tune the unsupervised LM to align with these preferences, often with reinforcement learning from human feedback (RLHF). However, RLHF is a complex and often unstable procedure, first fitting a reward model that reflects the human preferences, and then fine-tuning the large unsupervised LM using reinforcement learning to maximize this estimated reward without drifting too far from the original model. In this paper we introduce a new parameterization of the reward model in RLHF that enables extraction of the corresponding optimal policy in closed form, allowing us to solve the standard RLHF problem with only a simple classification loss. The resu

In [21]:
query = "The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straightforward update, we demonstrate the effectiveness of TR-DPO against DPO on the Anthropic HH and TLDR datasets. We show that TR-DPO outperforms DPO by up to 19%, measured by automatic evaluation with GPT-4. The new alignment approach that we propose allows us to improve the quality of models across several parameters at once, such as coherence, correctness, level of detail, helpfulness, and harmlessness."
source = 'Learn Your Reference Model for Real Good Alignment'
abbrev = '2404.09656'
new_doc = Document(page_content=query,metadata={'source':source, 'abbrev':abbrev})
vector_store_i = FAISS.from_documents([new_doc], embedder)
vector_store.merge_from(vector_store_i)

In [22]:
len(vector_store.index_to_docstore_id)

38

In [23]:
# Experimental
query = "The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straightforward update, we demonstrate the effectiveness of TR-DPO against DPO on the Anthropic HH and TLDR datasets. We show that TR-DPO outperforms DPO by up to 19%, measured by automatic evaluation with GPT-4. The new alignment approach that we propose allows us to improve the quality of models across several parameters at once, such as coherence, correctness, level of detail, helpfulness, and harmlessness."
#query = 'The quadratic complexity and weak length extrapolation of Transformers limits their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention), and further introduces multiple technical components to improve its capability and stability, including complex exponential moving average (CEMA), timestep normalization layer, normalized attention mechanism and pre-norm with two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than Transformer in the scale of 7 billion parameters and 2 trillion training tokens. Megalodon reaches a training loss of 1.70, landing mid-way between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon'
#query = ''
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 10)

for page in docs:
  print(page.page_content)
  print(page.metadata)

The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straig

In [49]:
# query = "The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straightforward update, we demonstrate the effectiveness of TR-DPO against DPO on the Anthropic HH and TLDR datasets. We show that TR-DPO outperforms DPO by up to 19%, measured by automatic evaluation with GPT-4. The new alignment approach that we propose allows us to improve the quality of models across several parameters at once, such as coherence, correctness, level of detail, helpfulness, and harmlessness."
# #query = 'The quadratic complexity and weak length extrapolation of Transformers limits their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention), and further introduces multiple technical components to improve its capability and stability, including complex exponential moving average (CEMA), timestep normalization layer, normalized attention mechanism and pre-norm with two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than Transformer in the scale of 7 billion parameters and 2 trillion training tokens. Megalodon reaches a training loss of 1.70, landing mid-way between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon'
# #query = ''
# embedding_vector = core_embeddings_model.embed_query(query)
# docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

# for page in docs:
#   print(page.page_content)
#   print(page.metadata)

In [24]:
new_articles.loc[0,'abstract']

'Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these general-domain MLLMs often fall short in their ability to comprehend and interact effectively with user interface (UI) screens. In this paper, we present Ferret-UI, a new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities. Given that UI screens typically exhibit a more elongated aspect ratio and contain smaller objects of interest (e.g., icons, texts) than natural images, we incorporate "any resolution" on top of Ferret to magnify details and leverage enhanced visual features. Specifically, each screen is divided into 2 sub-images based on the original aspect ratio (i.e., horizontal division for portrait screens and vertical division for landscape screens). Both sub-images are encoded separately before being sent to LLMs. We meticulously gather training samples from an extensive range of elementary UI tasks, such as

In [25]:
query=new_articles.loc[0,'abstract']
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 10)

for page in docs:
  print(page.page_content)
  print(page.metadata)

We continue the investigation into the power of smaller Transformer-based language models as initiated by \textbf{TinyStories} -- a 10 million parameter model that can produce coherent English -- and the follow-up work on \textbf{phi-1}, a 1.3 billion parameter model with Python coding performance close to the state-of-the-art. The latter work proposed to use existing Large Language Models (LLMs) to generate ``textbook quality" data as a way to enhance the learning process compared to traditional web data. We follow the ``Textbooks Are All You Need" approach, focusing this time on common sense reasoning in natural language, and create a new 1.3 billion parameter model named \textbf{phi-1.5}, with performance on natural language tasks comparable to models 5x larger, and surpassing most non-frontier LLMs on more complex reasoning tasks such as grade-school mathematics and basic coding. More generally, \textbf{phi-1.5} exhibits many of the traits of much larger LLMs, both good -- such as 

In [26]:
new_articles.loc[1,'abstract']

"Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instrcution adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative im

In [27]:
query=new_articles.loc[0,'abstract']
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 10)
vs_indices = [doc.metadata['vs_index'] for doc in docs]
for page in docs:
  print(page.page_content)
  print(page.metadata)

We continue the investigation into the power of smaller Transformer-based language models as initiated by \textbf{TinyStories} -- a 10 million parameter model that can produce coherent English -- and the follow-up work on \textbf{phi-1}, a 1.3 billion parameter model with Python coding performance close to the state-of-the-art. The latter work proposed to use existing Large Language Models (LLMs) to generate ``textbook quality" data as a way to enhance the learning process compared to traditional web data. We follow the ``Textbooks Are All You Need" approach, focusing this time on common sense reasoning in natural language, and create a new 1.3 billion parameter model named \textbf{phi-1.5}, with performance on natural language tasks comparable to models 5x larger, and surpassing most non-frontier LLMs on more complex reasoning tasks such as grade-school mathematics and basic coding. More generally, \textbf{phi-1.5} exhibits many of the traits of much larger LLMs, both good -- such as 

In [28]:
vs_indices

[34, 16, 15, 10, 26, 23, 36, 4, 7, 8]

In [29]:
import numpy as np
import torch
#similar_embedding_vectors = torch.tensor(np.array([vector_store.index.reconstruct_n(index_id, 1)[0] for index_id in vs_indices]))
similar_embedding_vectors = np.array([vector_store.index.reconstruct_n(index_id, 1)[0] for index_id in vs_indices])

In [30]:
similar_embedding_vectors.shape

(10, 1024)

In [31]:
np.array(embedding_vector).shape

(1024,)

In [32]:
distances = np.linalg.norm(similar_embedding_vectors-np.array(embedding_vector), axis=1)
average_distance = np.mean(distances)
distances, average_distance

(array([12.20101666, 12.42112783, 12.57575874, 12.70079431, 12.72302538,
        12.77311078, 12.8005531 , 12.80305586, 12.96793891, 13.0623732 ]),
 12.702875475669595)

- 12.7 avg similarity for TinyStories paper and 14.7 for diffusion one (of which don't have many in Zotero), seems sensible
- Could perhaps take top-1 as similarity measure (read that paper, after all) and average_distance from my vector to all the embedding vectors as novelty, combine these two...

- OK, so the issue is that the papers are a bit over the map, generally in ['llms','vision','diffusion','other camp'], could try to you use a smaller quanitized model to extract topic, just need to monitor the time taken.

In [48]:
from exllamav2 import *
from exllamav2.generator import *
import sys, torch


generator_config = ExLlamaV2Config()
generator_config.model_dir = "/home/mainuser/Desktop/LLMs/MiStralInference"
#generator_config.model_dir = '/home/mainuser/Desktop/LLMs/Mixtral4bit'
generator_config.prepare()

generator_model = ExLlamaV2(generator_config)
cache = ExLlamaV2Cache(generator_model, lazy = True)

print("Loading model...")
generator_model.load_autosplit(cache)

generator_tokenizer = ExLlamaV2Tokenizer(generator_config)
generator_llm = ExLlamaV2StreamingGenerator(generator_model, cache, generator_tokenizer)
generator_llm.set_stop_conditions([generator_tokenizer.eos_token_id])
generator_settings = ExLlamaV2Sampler.Settings()
generator_settings.temperature = 0.85
generator_settings.top_k = 50
generator_settings.top_p = 0.8
generator_settings.token_repetition_penalty = 1.01

Loading model...


In [49]:
def call_llm(
    question: str,
    generator: ExLlamaV2StreamingGenerator,
    settings:ExLlamaV2Sampler.Settings,
    max_new_tokens = 512
    ):

    max_new_tokens = max_new_tokens

    generator.warmup()
    output = generator.generate_simple(f"<s>[INST] {question} [/INST]", settings, max_new_tokens, seed = 1234)
    return output



In [35]:
abstract = new_articles.loc[0,'abstract']
topic_classification_prompt = """
Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, stable diffusion, computer vision, or other.
Only classify the abstract as 'other' if you're sure it doesn't fit into any of the other categories.

Provide your answer as follows:


Answer: category

Now here is the abstract: {abstract}

Output:::"""
#topic_classification_prompt = 
call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)

"<s>[INST] \nYour task is to take an arXiv abstract and classify it into one of the following categories: LLMs, stable diffusion, computer vision, or other.\nOnly classify the abstract as 'other' if you're sure it doesn't fit into any of the other categories.\n\nProvide your answer as follows:\n\n\nAnswer: category\n\nNow here is the abstract: {abstract}\n\nOutput::: [/INST] I'm sorry, but I need the abstract to classify it into a category. Please provide the abstract."

In [36]:
abstract

'Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these general-domain MLLMs often fall short in their ability to comprehend and interact effectively with user interface (UI) screens. In this paper, we present Ferret-UI, a new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities. Given that UI screens typically exhibit a more elongated aspect ratio and contain smaller objects of interest (e.g., icons, texts) than natural images, we incorporate "any resolution" on top of Ferret to magnify details and leverage enhanced visual features. Specifically, each screen is divided into 2 sub-images based on the original aspect ratio (i.e., horizontal division for portrait screens and vertical division for landscape screens). Both sub-images are encoded separately before being sent to LLMs. We meticulously gather training samples from an extensive range of elementary UI tasks, such as

In [37]:
import re

In [38]:
# abstract = new_articles.loc[0,'abstract']
# topic_classification_prompt = f"""
# Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, or other.
# Only classify the abstract as 'other' if you're sure it doesn't fit into any of the other categories.

# Provide your answer as follows:


# Answer: category

# Now here is the abstract: {abstract}

# Output::: """
# #topic_classification_prompt = 
# ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=48)[len(topic_classification_prompt):]

# ans
# pattern = r'(Category|Answer):\s*(\w+)'
# match = re.search(pattern, ans)
# match.group(2)

In [39]:
# #abstract = r"""JWST is discovering a large population of z>4 supermassive black holes (SMBHs) that are overmassive with respect to the stellar content of their hosts. A previous study developed a physical model to interpret this overmassive population as the result of quasar feedback acting on a compact host galaxy. In this Note, we apply this model to JADES GN 1146115, a dormant supermassive black hole at z=6.7 whose mass is ∼40% of the host's mass in stars and accreting at ∼2% of the Eddington limit. The host has been forming stars at the low rate of ∼1M⊙yr−1 for the past ∼100 Myr. Our model suggests that this galactic system is on the verge of a resurgence of global star formation activity. This transition comes after a period of domination by the effect of its overmassive black hole, whose duration is comparable to typical quasar lifetimes. """
# #abstract = r"""Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives. Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation. We present an automated text animation scheme, termed "Dynamic Typography", which combines two challenging tasks. It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts. Our technique harnesses vector graphics representations and an end-to-end optimization-based framework. This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept. Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process. We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks. Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability. Our code is available at: https://animate-your-word.github.io/demo/."""
# #abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""
# abstract = r"""Despite the impressive capabilities of Large Language Models (LLMs) on various tasks, they still struggle with scenarios that involves complex reasoning and planning. Recent work proposed advanced prompting techniques and the necessity of fine-tuning with high-quality data to augment LLMs' reasoning abilities. However, these approaches are inherently constrained by data availability and quality. In light of this, self-correction and self-learning emerge as viable solutions, employing strategies that allow LLMs to refine their outputs and learn from self-assessed rewards. Yet, the efficacy of LLMs in self-refining its response, particularly in complex reasoning and planning task, remains dubious. In this paper, we introduce AlphaLLM for the self-improvements of LLMs, which integrates Monte Carlo Tree Search (MCTS) with LLMs to establish a self-improving loop, thereby enhancing the capabilities of LLMs without additional annotations. Drawing inspiration from the success of AlphaGo, AlphaLLM addresses the unique challenges of combining MCTS with LLM for self-improvement, including data scarcity, the vastness search spaces of language tasks, and the subjective nature of feedback in language tasks. AlphaLLM is comprised of prompt synthesis component, an efficient MCTS approach tailored for language tasks, and a trio of critic models for precise feedback. Our experimental results in mathematical reasoning tasks demonstrate that AlphaLLM significantly enhances the performance of LLMs without additional annotations, showing the potential for self-improvement in LLMs."""

# topic_classification_prompt = f"""
# Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, multimodal.
# If you see multimodal in abstract, classify as multimodal.  If you see diffusion in the abstract, classify as diffusion.
# If you see LLMs or language models in the abstract, classify as LLMs.  If you see computer vision in the abstract, classify as computer vision.
# Now here is the abstract: {abstract}
# Check your answer before returning it and provide a brief explanation.


# Category:
# """
# #topic_classification_prompt = 
# ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
# #pattern = r'(Category|Answer):\s*(\w+)'
# pattern = r'(LLMs|diffusion|computer vision|multimodal)'
# match = re.search(pattern, ans,re.IGNORECASE)
# response =  match.group(1) if match else 'other'
# response

In [40]:
#abstract = r"""JWST is discovering a large population of z>4 supermassive black holes (SMBHs) that are overmassive with respect to the stellar content of their hosts. A previous study developed a physical model to interpret this overmassive population as the result of quasar feedback acting on a compact host galaxy. In this Note, we apply this model to JADES GN 1146115, a dormant supermassive black hole at z=6.7 whose mass is ∼40% of the host's mass in stars and accreting at ∼2% of the Eddington limit. The host has been forming stars at the low rate of ∼1M⊙yr−1 for the past ∼100 Myr. Our model suggests that this galactic system is on the verge of a resurgence of global star formation activity. This transition comes after a period of domination by the effect of its overmassive black hole, whose duration is comparable to typical quasar lifetimes. """
#abstract = r"""Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives. Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation. We present an automated text animation scheme, termed "Dynamic Typography", which combines two challenging tasks. It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts. Our technique harnesses vector graphics representations and an end-to-end optimization-based framework. This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept. Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process. We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks. Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability. Our code is available at: https://animate-your-word.github.io/demo/."""
#abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""
#abstract = r"""Despite the impressive capabilities of Large Language Models (LLMs) on various tasks, they still struggle with scenarios that involves complex reasoning and planning. Recent work proposed advanced prompting techniques and the necessity of fine-tuning with high-quality data to augment LLMs' reasoning abilities. However, these approaches are inherently constrained by data availability and quality. In light of this, self-correction and self-learning emerge as viable solutions, employing strategies that allow LLMs to refine their outputs and learn from self-assessed rewards. Yet, the efficacy of LLMs in self-refining its response, particularly in complex reasoning and planning task, remains dubious. In this paper, we introduce AlphaLLM for the self-improvements of LLMs, which integrates Monte Carlo Tree Search (MCTS) with LLMs to establish a self-improving loop, thereby enhancing the capabilities of LLMs without additional annotations. Drawing inspiration from the success of AlphaGo, AlphaLLM addresses the unique challenges of combining MCTS with LLM for self-improvement, including data scarcity, the vastness search spaces of language tasks, and the subjective nature of feedback in language tasks. AlphaLLM is comprised of prompt synthesis component, an efficient MCTS approach tailored for language tasks, and a trio of critic models for precise feedback. Our experimental results in mathematical reasoning tasks demonstrate that AlphaLLM significantly enhances the performance of LLMs without additional annotations, showing the potential for self-improvement in LLMs."""
#abstract = r"""We introduce Blink, a new benchmark for multimodal language models (LLMs) that focuses on core visual perception abilities not found in other evaluations. Most of the Blink tasks can be solved by humans "within a blink" (e.g., relative depth estimation, visual correspondence, forensics detection, and multi-view reasoning). However, we find these perception-demanding tasks cast significant challenges for current multimodal LLMs because they resist mediation through natural language. Blink reformats 14 classic computer vision tasks into 3,807 multiple-choice questions, paired with single or multiple images and visual prompting. While humans get 95.70% accuracy on average, Blink is surprisingly challenging for existing multimodal LLMs: even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also highlights that specialist CV models could solve these problems much better, suggesting potential pathways for future improvements. We believe Blink will stimulate the community to help multimodal LLMs catch up with human-level visual perception."""
#abstract = r"""We propose MeshLRM, a novel LRM-based approach that can reconstruct a high-quality mesh from merely four input images in less than one second. Different from previous large reconstruction models (LRMs) that focus on NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction and rendering within the LRM framework. This allows for end-to-end mesh reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. Moreover, we improve the LRM architecture by simplifying several complex designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained with low- and high-resolution images; this new LRM training strategy enables significantly faster convergence and thereby leads to better quality with less compute. Our approach achieves state-of-the-art mesh reconstruction from sparse-view inputs and also allows for many downstream applications, including text-to-3D and single-image-to-3D generation. Project page: https://sarahweiii.github.io/meshlrm/"""
#abstract = r"""With large language models (LLMs) widely deployed in long content generation recently, there has emerged an increasing demand for efficient long-sequence inference support. However, key-value (KV) cache, which is stored to avoid re-computation, has emerged as a critical bottleneck by growing linearly in size with the sequence length. Due to the auto-regressive nature of LLMs, the entire KV cache will be loaded for every generated token, resulting in low utilization of computational cores and high latency. While various compression methods for KV cache have been proposed to alleviate this issue, they suffer from degradation in generation quality. We introduce TriForce, a hierarchical speculative decoding system that is scalable to long sequence generation. This approach leverages the original model weights and dynamic sparse KV cache via retrieval as a draft model, which serves as an intermediate layer in the hierarchy and is further speculated by a smaller model to reduce its drafting latency. TriForce not only facilitates impressive speedups for Llama2-7B-128K, achieving up to 2.31times on an A100 GPU but also showcases scalability in handling even longer contexts. For the offloading setting on two RTX 4090 GPUs, TriForce achieves 0.108s/tokenx2014only half as slow as the auto-regressive baseline on an A100, which attains 7.78times on our optimized offloading system. Additionally, TriForce performs 4.86times than DeepSpeed-Zero-Inference on a single RTX 4090 GPU. TriForce's robustness is highlighted by its consistently outstanding performance across various temperatures. The code is available at https://github.com/Infini-AI-Lab/TriForce."""
abstract = r"""The intensive computational burden of Stable Diffusion (SD) for text-to-image generation poses a significant hurdle for its practical application. To tackle this challenge, recent research focuses on methods to reduce sampling steps, such as Latent Consistency Model (LCM), and on employing architectural optimizations, including pruning and knowledge distillation. Diverging from existing approaches, we uniquely start with a compact SD variant, BK-SDM. We observe that directly applying LCM to BK-SDM with commonly used crawled datasets yields unsatisfactory results. It leads us to develop two strategies: (1) leveraging high-quality image-text pairs from leading generative models and (2) designing an advanced distillation process tailored for LCM. Through our thorough exploration of quantization, profiling, and on-device deployment, we achieve rapid generation of photo-realistic, text-aligned images in just two steps, with latency under one second on resource-limited edge devices."""
topic_classification_prompt = f"""
Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, multimodal.
If you see LLMs or language models in the abstract, classify as LLMs.  If you see computer vision in the abstract, classify as computer vision.
If you see multimodal in abstract, classify as multimodal.  If you see diffusion in the abstract, classify as diffusion.
Now here is the abstract: {abstract}
Think step-by-step about your answer and check your answer before returning it and provide a brief explanation.

Category:
"""
#topic_classification_prompt = 
ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
#pattern = r'(Category|Answer):\s*(\w+)'
pattern = r'(LLMs|diffusion|computer vision|multimodal)'
match = re.search(pattern, ans,re.IGNORECASE)
response =  match.group(1) if match else 'other'
response

'diffusion'

In [None]:
#abstract = r"""JWST is discovering a large population of z>4 supermassive black holes (SMBHs) that are overmassive with respect to the stellar content of their hosts. A previous study developed a physical model to interpret this overmassive population as the result of quasar feedback acting on a compact host galaxy. In this Note, we apply this model to JADES GN 1146115, a dormant supermassive black hole at z=6.7 whose mass is ∼40% of the host's mass in stars and accreting at ∼2% of the Eddington limit. The host has been forming stars at the low rate of ∼1M⊙yr−1 for the past ∼100 Myr. Our model suggests that this galactic system is on the verge of a resurgence of global star formation activity. This transition comes after a period of domination by the effect of its overmassive black hole, whose duration is comparable to typical quasar lifetimes. """
#abstract = r"""Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives. Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation. We present an automated text animation scheme, termed "Dynamic Typography", which combines two challenging tasks. It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts. Our technique harnesses vector graphics representations and an end-to-end optimization-based framework. This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept. Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process. We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks. Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability. Our code is available at: https://animate-your-word.github.io/demo/."""
#abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""
#abstract = r"""Despite the impressive capabilities of Large Language Models (LLMs) on various tasks, they still struggle with scenarios that involves complex reasoning and planning. Recent work proposed advanced prompting techniques and the necessity of fine-tuning with high-quality data to augment LLMs' reasoning abilities. However, these approaches are inherently constrained by data availability and quality. In light of this, self-correction and self-learning emerge as viable solutions, employing strategies that allow LLMs to refine their outputs and learn from self-assessed rewards. Yet, the efficacy of LLMs in self-refining its response, particularly in complex reasoning and planning task, remains dubious. In this paper, we introduce AlphaLLM for the self-improvements of LLMs, which integrates Monte Carlo Tree Search (MCTS) with LLMs to establish a self-improving loop, thereby enhancing the capabilities of LLMs without additional annotations. Drawing inspiration from the success of AlphaGo, AlphaLLM addresses the unique challenges of combining MCTS with LLM for self-improvement, including data scarcity, the vastness search spaces of language tasks, and the subjective nature of feedback in language tasks. AlphaLLM is comprised of prompt synthesis component, an efficient MCTS approach tailored for language tasks, and a trio of critic models for precise feedback. Our experimental results in mathematical reasoning tasks demonstrate that AlphaLLM significantly enhances the performance of LLMs without additional annotations, showing the potential for self-improvement in LLMs."""
#abstract = r"""We introduce Blink, a new benchmark for multimodal language models (LLMs) that focuses on core visual perception abilities not found in other evaluations. Most of the Blink tasks can be solved by humans "within a blink" (e.g., relative depth estimation, visual correspondence, forensics detection, and multi-view reasoning). However, we find these perception-demanding tasks cast significant challenges for current multimodal LLMs because they resist mediation through natural language. Blink reformats 14 classic computer vision tasks into 3,807 multiple-choice questions, paired with single or multiple images and visual prompting. While humans get 95.70% accuracy on average, Blink is surprisingly challenging for existing multimodal LLMs: even the best-performing GPT-4V and Gemini achieve accuracies of 51.26% and 45.72%, only 13.17% and 7.63% higher than random guessing, indicating that such perception abilities have not "emerged" yet in recent multimodal LLMs. Our analysis also highlights that specialist CV models could solve these problems much better, suggesting potential pathways for future improvements. We believe Blink will stimulate the community to help multimodal LLMs catch up with human-level visual perception."""
#abstract = r"""We propose MeshLRM, a novel LRM-based approach that can reconstruct a high-quality mesh from merely four input images in less than one second. Different from previous large reconstruction models (LRMs) that focus on NeRF-based reconstruction, MeshLRM incorporates differentiable mesh extraction and rendering within the LRM framework. This allows for end-to-end mesh reconstruction by fine-tuning a pre-trained NeRF LRM with mesh rendering. Moreover, we improve the LRM architecture by simplifying several complex designs in previous LRMs. MeshLRM's NeRF initialization is sequentially trained with low- and high-resolution images; this new LRM training strategy enables significantly faster convergence and thereby leads to better quality with less compute. Our approach achieves state-of-the-art mesh reconstruction from sparse-view inputs and also allows for many downstream applications, including text-to-3D and single-image-to-3D generation. Project page: https://sarahweiii.github.io/meshlrm/"""
#abstract = r"""With large language models (LLMs) widely deployed in long content generation recently, there has emerged an increasing demand for efficient long-sequence inference support. However, key-value (KV) cache, which is stored to avoid re-computation, has emerged as a critical bottleneck by growing linearly in size with the sequence length. Due to the auto-regressive nature of LLMs, the entire KV cache will be loaded for every generated token, resulting in low utilization of computational cores and high latency. While various compression methods for KV cache have been proposed to alleviate this issue, they suffer from degradation in generation quality. We introduce TriForce, a hierarchical speculative decoding system that is scalable to long sequence generation. This approach leverages the original model weights and dynamic sparse KV cache via retrieval as a draft model, which serves as an intermediate layer in the hierarchy and is further speculated by a smaller model to reduce its drafting latency. TriForce not only facilitates impressive speedups for Llama2-7B-128K, achieving up to 2.31times on an A100 GPU but also showcases scalability in handling even longer contexts. For the offloading setting on two RTX 4090 GPUs, TriForce achieves 0.108s/tokenx2014only half as slow as the auto-regressive baseline on an A100, which attains 7.78times on our optimized offloading system. Additionally, TriForce performs 4.86times than DeepSpeed-Zero-Inference on a single RTX 4090 GPU. TriForce's robustness is highlighted by its consistently outstanding performance across various temperatures. The code is available at https://github.com/Infini-AI-Lab/TriForce."""
abstract = r"""The intensive computational burden of Stable Diffusion (SD) for text-to-image generation poses a significant hurdle for its practical application. To tackle this challenge, recent research focuses on methods to reduce sampling steps, such as Latent Consistency Model (LCM), and on employing architectural optimizations, including pruning and knowledge distillation. Diverging from existing approaches, we uniquely start with a compact SD variant, BK-SDM. We observe that directly applying LCM to BK-SDM with commonly used crawled datasets yields unsatisfactory results. It leads us to develop two strategies: (1) leveraging high-quality image-text pairs from leading generative models and (2) designing an advanced distillation process tailored for LCM. Through our thorough exploration of quantization, profiling, and on-device deployment, we achieve rapid generation of photo-realistic, text-aligned images in just two steps, with latency under one second on resource-limited edge devices."""
topic_classification_prompt = f"""
Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, multimodal.
If you see LLMs or language models in the abstract, classify as LLMs.  If you see computer vision in the abstract, classify as computer vision.
If you see multimodal in abstract, classify as multimodal.  If you see diffusion in the abstract, classify as diffusion.
Now here is the abstract: {abstract}
Think step-by-step about your answer and check your answer before returning it and provide a brief explanation.

Category:
"""
#topic_classification_prompt = 
ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
#pattern = r'(Category|Answer):\s*(\w+)'
pattern = r'(LLMs|diffusion|computer vision|multimodal)'
match = re.search(pattern, ans,re.IGNORECASE)
response =  match.group(1) if match else 'other'
response

'LLMs'

In [41]:
ans

'Category:\n [/INST] I would classify this abstract as "diffusion". The abstract mentions "Stable Diffusion (SD)" as'

In [113]:
def classify_topic(abstract):
    topic_classification_prompt = f"""
    Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, multimodal.
    If you see LLMs or language models in the abstract, classify as LLMs. 
    If you see multimodal in abstract, classify as multimodal.  If you see diffusion in the abstract, classify as diffusion. If you see video, vision, or visual in the abstract, classify as computer vision. 
    Now here is the abstract: {abstract}
    Think step-by-step about your answer and check your answer before returning it and provide a brief explanation.

    Category:
    """
    ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
    pattern = r'(LLMs|diffusion|computer vision|multimodal)'
    match = re.search(pattern, ans,re.IGNORECASE)
    response =  match.group(1) if match else 'other'
    return response.lower()

In [64]:
import re
def classify_topic(abstract):
    topic_classification_prompt = f"""
    Your task is to take an arXiv abstract and classify it into one of the following categories: LLMs, diffusion, computer vision, multimodal, 3D, or other.
    Only if you see LLMs or language models in the abstract, classify as LLMs, else classify as one of of the other categories.
    Now here is the abstract: {abstract}
    Think step-by-step about your answer and check your answer before returning it and provide a brief explanation.

    Category:
    """
    ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
    pattern = r'(LLMs|diffusion|computer vision|multimodal)'
    match = re.search(pattern, ans,re.IGNORECASE)
    response =  match.group(1) if match else 'other'
    return response.lower()

### Try re approach

In [87]:
import re

# The text
text = """Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these general-domain MLLMs often fall short in their ability to comprehend and interact effectively with user interface (UI) screens. In this paper, we present Ferret-UI, a new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities. Given that UI screens typically exhibit a more elongated aspect ratio and contain smaller objects of interest (e.g., icons, texts) than natural images, we incorporate "any resolution" on top of Ferret to magnify details and leverage enhanced visual features. Specifically, each screen is divided into 2 sub-images based on the original aspect ratio (i.e., horizontal division for portrait screens and vertical division for landscape screens). Both sub-images are encoded separately before being sent to LLMs. We meticulously gather training samples from an extensive range of elementary UI tasks, such as icon recognition, find text, and widget listing. These samples are formatted for instruction-following with region annotations to facilitate precise referring and grounding. To augment the model's reasoning ability, we further compile a dataset for advanced tasks, including detailed description, perception/interaction conversations, and function inference. After training on the curated datasets, Ferret-UI exhibits outstanding comprehension of UI screens and the capability to execute open-ended instructions. For model evaluation, we establish a comprehensive benchmark encompassing all the aforementioned tasks. Ferret-UI excels not only beyond most open-source UI MLLMs, but also surpasses GPT-4V on all the elementary UI tasks."""
text ="""Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instrcution adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative image editing products, including Adobe, Canva, and MeiTu, in both generation quality and consistency. ByteEdit-Outpainting exhibits a remarkable enhancement of 388% and 135% in quality and consistency, respectively, when compared to the baseline model. Experiments also verfied that our acceleration models maintains excellent performance results in terms of quality and consistency."""
text ="""Recovering dense and long-range pixel motion in videos is a challenging problem. Part of the difficulty arises from the 3D-to-2D projection process, leading to occlusions and discontinuities in the 2D motion domain. While 2D motion can be intricate, we posit that the underlying 3D motion can often be simple and low-dimensional. In this work, we propose to estimate point trajectories in 3D space to mitigate the issues caused by image projection. Our method, named SpatialTracker, lifts 2D pixels to 3D using monocular depth estimators, represents the 3D content of each frame efficiently using a triplane representation, and performs iterative updates using a transformer to estimate 3D trajectories. Tracking in 3D allows us to leverage as-rigid-as-possible (ARAP) constraints while simultaneously learning a rigidity embedding that clusters pixels into different rigid parts. Extensive evaluation shows that our approach achieves state-of-the-art tracking performance both qualitatively and quantitatively, particularly in challenging scenarios such as out-of-plane rotation."""
text="""Effective editing of personal content holds a pivotal role in enabling individuals to express their creativity, weaving captivating narratives within their visual stories, and elevate the overall quality and impact of their visual content. Therefore, in this work, we introduce SwapAnything, a novel framework that can swap any objects in an image with personalized concepts given by the reference, while keeping the context unchanged. Compared with existing methods for personalized subject swapping, SwapAnything has three unique advantages: (1) precise control of arbitrary objects and parts rather than the main subject, (2) more faithful preservation of context pixels, (3) better adaptation of the personalized concept to the image. First, we propose targeted variable swapping to apply region control over latent feature maps and swap masked variables for faithful context preservation and initial semantic concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt the semantic concept into the original image in terms of target location, shape, style, and content during the image generation process. Extensive results on both human and automatic evaluation demonstrate significant improvements of our approach over baseline methods on personalized swapping. Furthermore, SwapAnything shows its precise and faithful swapping abilities across single object, multiple objects, partial object, and cross-domain swapping tasks. SwapAnything also achieves great performance on text-based swapping and tasks beyond swapping such as object insertion."""
text="""Generating higher-resolution human-centric scenes with details and controls remains a challenge for existing text-to-image diffusion models. This challenge stems from limited training image size, text encoder capacity (limited tokens), and the inherent difficulty of generating complex scenes involving multiple humans. While current methods attempted to address training size limit only, they often yielded human-centric scenes with severe artifacts. We propose BeyondScene, a novel framework that overcomes prior limitations, generating exquisite higher-resolution (over 8K) human-centric scenes with exceptional text-image correspondence and naturalness using existing pretrained diffusion models. BeyondScene employs a staged and hierarchical approach to initially generate a detailed base image focusing on crucial elements in instance creation for multiple humans and detailed descriptions beyond token limit of diffusion model, and then to seamlessly convert the base image to a higher-resolution output, exceeding training image size and incorporating details aware of text and instances via our novel instance-aware hierarchical enlargement process that consists of our proposed high-frequency injected forward diffusion and adaptive joint diffusion. BeyondScene surpasses existing methods in terms of correspondence with detailed text descriptions and naturalness, paving the way for advanced applications in higher-resolution human-centric scene creation beyond the capacity of pretrained diffusion models without costly retraining. Project page: https://janeyeon.github.io/beyond-scene."""
text="""With the success of large language models (LLMs), integrating the vision model into LLMs to build vision-language foundation models has gained much more interest recently. However, existing LLM-based large multimodal models (e.g., Video-LLaMA, VideoChat) can only take in a limited number of frames for short video understanding. In this study, we mainly focus on designing an efficient and effective model for long-term video understanding. Instead of trying to process more frames simultaneously like most existing work, we propose to process videos in an online manner and store past video information in a memory bank. This allows our model to reference historical video content for long-term analysis without exceeding LLMs' context length constraints or GPU memory limits. Our memory bank can be seamlessly integrated into current multimodal LLMs in an off-the-shelf manner. We conduct extensive experiments on various video understanding tasks, such as long-video understanding, video question answering, and video captioning, and our model can achieve state-of-the-art performances across multiple datasets. Code available at https://boheumd.github.io/MA-LMM/."""
text="""Inference with Multimodal Large Language Models (MLLMs) is slow due to their large-language-model backbone which suffers from memory bandwidth bottleneck and generates tokens auto-regressively. In this paper, we explore the application of speculative decoding to enhance the inference efficiency of MLLMs, specifically the LLaVA 7B model. We show that a language-only model can serve as a good draft model for speculative decoding with LLaVA 7B, bypassing the need for image tokens and their associated processing components from the draft model. Our experiments across three different tasks show that speculative decoding can achieve a memory-bound speedup of up to 2.37times using a 115M parameter language model that we trained from scratch. Additionally, we introduce a compact LLaVA draft model incorporating an image adapter, which shows marginal performance gains in image captioning while maintaining comparable results in other tasks.	2404.08856"""
# The regex pattern
pattern = r'^(?!.*\b(diffusion|3D|computer vision)\b).*\b(language model\(s\)|LLMs)\b.*$'

# Find the match
match = re.search(pattern, text, re.IGNORECASE)

# Print the result
if match:
    print('Match found')
else:
    print('No match found')

No match found


In [195]:
import re
def classify_topic_re(example):
    """Classify abstract based on regex pattern"""
    # pattern = r'^(?!.*\b(diffusion|3D|computer vision)\b).*\b(lms|multimodal|attention|language model\(s\)|LLMs)\b.*$'

    # match = re.search(pattern, example['abstract'], re.IGNORECASE)
    #pattern = r'^(?!.*\b(diffusion|3D|computer vision)\b).*\b(lms|multimodal|language model\(s\)|LLMs|context LMs|Large Language Model|synthetic data)\b.*$'
    pattern = r'^(?!.*\b(diffusion|3D|computer vision|image|video|resnet|cnn|vit)\b).*\b(lms|attention|language model(s)?|LLMs|context LMs|synthetic data|GPT|RLHF|DPO|KTO|ORPO|.*RNN.*|llama|mamba).*'
    match = re.findall(pattern, example['abstract'], re.IGNORECASE)
    #pattern = r'(?!.*\b(diffusion|3D|computer vision|image|video)\b).*\b(lms|attention|(?<!Vision-)language model(s)?|LLMs|context LMs|synthetic data|GPT(?!-4V)|RLHF|DPO|KTO|ORPO|RNN|RNN(s)?).*'
    #matches = re.findall(pattern, example['abstract'], re.IGNORECASE)
    return 'llms' if match else 'other'


In [196]:
new_articles['topic_re']=new_articles.apply(classify_topic_re,axis=1)

In [197]:
new_articles.topic_re.value_counts()

topic_re
other    44
llms     25
Name: count, dtype: int64

In [198]:
pd.set_option('display.max_rows',None)
new_articles.head(70)

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,topic_re
0,Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs,"Authors:Keen You,Haotian Zhang,Eldon Schoop,Floris Weers,Amanda Swearngin,Jeffrey Nichols,Yinfei Yang,Zhe Gan","Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these general-domain MLLMs often fall short in their ability to comprehend and interact effectively with user interface (UI) screens. In this paper, we present Ferret-UI, a new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities. Given that UI screens typically exhibit a more elongated aspect ratio and contain smaller objects of interest (e.g., icons, texts) than natural images, we incorporate ""any resolution"" on top of Ferret to magnify details and leverage enhanced visual features. Specifically, each screen is divided into 2 sub-images based on the original aspect ratio (i.e., horizontal division for portrait screens and vertical division for landscape screens). Both sub-images are encoded separately before being sent to LLMs. We meticulously gather training samples from an extensive range of elementary UI tasks, such as icon recognition, find text, and widget listing. These samples are formatted for instruction-following with region annotations to facilitate precise referring and grounding. To augment the model's reasoning ability, we further compile a dataset for advanced tasks, including detailed description, perception/interaction conversations, and function inference. After training on the curated datasets, Ferret-UI exhibits outstanding comprehension of UI screens and the capability to execute open-ended instructions. For model evaluation, we establish a comprehensive benchmark encompassing all the aforementioned tasks. Ferret-UI excels not only beyond most open-source UI MLLMs, but also surpasses GPT-4V on all the elementary UI tasks.",2404.05719,llms,llms
1,"ByteEdit: Boost, Comply and Accelerate Generative Image Editing","Authors:Yuxi Ren,Jie Wu,Yanzuo Lu,Huafeng Kuang,Xin Xia,Xionghui Wang,Qianqian Wang,Yixing Zhu,Pan Xie,Shiyin Wang,Xuefeng Xiao,Yitong Wang,Min Zheng,Lean Fu","Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instrcution adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative image editing products, including Adobe, Canva, and MeiTu, in both generation quality and consistency. ByteEdit-Outpainting exhibits a remarkable enhancement of 388% and 135% in quality and consistency, respectively, when compared to the baseline model. Experiments also verfied that our acceleration models maintains excellent performance results in terms of quality and consistency.",2404.0486,diffusion,other
2,SpatialTracker: Tracking Any 2D Pixels in 3D Space,"Authors:Yuxi Xiao,Qianqian Wang,Shangzhan Zhang,Nan Xue,Sida Peng,Yujun Shen,Xiaowei Zhou","Recovering dense and long-range pixel motion in videos is a challenging problem. Part of the difficulty arises from the 3D-to-2D projection process, leading to occlusions and discontinuities in the 2D motion domain. While 2D motion can be intricate, we posit that the underlying 3D motion can often be simple and low-dimensional. In this work, we propose to estimate point trajectories in 3D space to mitigate the issues caused by image projection. Our method, named SpatialTracker, lifts 2D pixels to 3D using monocular depth estimators, represents the 3D content of each frame efficiently using a triplane representation, and performs iterative updates using a transformer to estimate 3D trajectories. Tracking in 3D allows us to leverage as-rigid-as-possible (ARAP) constraints while simultaneously learning a rigidity embedding that clusters pixels into different rigid parts. Extensive evaluation shows that our approach achieves state-of-the-art tracking performance both qualitatively and quantitatively, particularly in challenging scenarios such as out-of-plane rotation.",2404.04319,llms,other
3,SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual Editing,"Authors:Jing Gu,Yilin Wang,Nanxuan Zhao,Wei Xiong,Qing Liu,Zhifei Zhang,He Zhang,Jianming Zhang,HyunJoon Jung,Xin Eric Wang","Effective editing of personal content holds a pivotal role in enabling individuals to express their creativity, weaving captivating narratives within their visual stories, and elevate the overall quality and impact of their visual content. Therefore, in this work, we introduce SwapAnything, a novel framework that can swap any objects in an image with personalized concepts given by the reference, while keeping the context unchanged. Compared with existing methods for personalized subject swapping, SwapAnything has three unique advantages: (1) precise control of arbitrary objects and parts rather than the main subject, (2) more faithful preservation of context pixels, (3) better adaptation of the personalized concept to the image. First, we propose targeted variable swapping to apply region control over latent feature maps and swap masked variables for faithful context preservation and initial semantic concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt the semantic concept into the original image in terms of target location, shape, style, and content during the image generation process. Extensive results on both human and automatic evaluation demonstrate significant improvements of our approach over baseline methods on personalized swapping. Furthermore, SwapAnything shows its precise and faithful swapping abilities across single object, multiple objects, partial object, and cross-domain swapping tasks. SwapAnything also achieves great performance on text-based swapping and tasks beyond swapping such as object insertion.",2404.05717,llms,other
4,BeyondScene: Higher-Resolution Human-Centric Scene Generation With Pretrained Diffusion,"Authors:Gwanghyun Kim,Hayeon Kim,Hoigi Seo,Dong Un Kang,Se Young Chun","Generating higher-resolution human-centric scenes with details and controls remains a challenge for existing text-to-image diffusion models. This challenge stems from limited training image size, text encoder capacity (limited tokens), and the inherent difficulty of generating complex scenes involving multiple humans. While current methods attempted to address training size limit only, they often yielded human-centric scenes with severe artifacts. We propose BeyondScene, a novel framework that overcomes prior limitations, generating exquisite higher-resolution (over 8K) human-centric scenes with exceptional text-image correspondence and naturalness using existing pretrained diffusion models. BeyondScene employs a staged and hierarchical approach to initially generate a detailed base image focusing on crucial elements in instance creation for multiple humans and detailed descriptions beyond token limit of diffusion model, and then to seamlessly convert the base image to a higher-resolution output, exceeding training image size and incorporating details aware of text and instances via our novel instance-aware hierarchical enlargement process that consists of our proposed high-frequency injected forward diffusion and adaptive joint diffusion. BeyondScene surpasses existing methods in terms of correspondence with detailed text descriptions and naturalness, paving the way for advanced applications in higher-resolution human-centric scene creation beyond the capacity of pretrained diffusion models without costly retraining. Project page: https://janeyeon.github.io/beyond-scene.",2404.04544,diffusion,other
5,UniFL: Improve Stable Diffusion via Unified Feedback Learning,"Authors:Jiacheng Zhang,Jie Wu,Yuxi Ren,Xin Xia,Huafeng Kuang,Pan Xie,Jiashi Li,Xuefeng Xiao,Weilin Huang,Min Zheng,Lean Fu,Guanbin Li","Diffusion models have revolutionized the field of image generation, leading to the proliferation of high-quality models and diverse downstream applications. However, despite these significant advancements, the current competitive solutions still suffer from several limitations, including inferior visual quality, a lack of aesthetic appeal, and inefficient inference, without a comprehensive solution in sight. To address these challenges, we present UniFL, a unified framework that leverages feedback learning to enhance diffusion models comprehensively. UniFL stands out as a universal, effective, and generalizable solution applicable to various diffusion models, such as SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual feedback learning, which enhances visual quality; decoupled feedback learning, which improves aesthetic appeal; and adversarial feedback learning, which optimizes inference speed. In-depth experiments and extensive user studies validate the superior performance of our proposed method in enhancing both the quality of generated models and their acceleration. For instance, UniFL surpasses ImageReward by 17% user preference in terms of generation quality and outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we have verified the efficacy of our approach in downstream tasks, including Lora, ControlNet, and AnimateDiff.",2404.05595,llms,other
6,MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators,"Authors:Shenghai Yuan,Jinfa Huang,Yujun Shi,Yongqi Xu,Ruijie Zhu,Bin Lin,Xinhua Cheng,Li Yuan,Jiebo Luo","Recent advances in Text-to-Video generation (T2V) have achieved remarkable success in synthesizing high-quality general videos from textual descriptions. A largely overlooked problem in T2V is that existing models have not adequately encoded physical knowledge of the real world, thus generated videos tend to have limited motion and poor variations. In this paper, we propose MagicTime, a metamorphic time-lapse video generation model, which learns real-world physics knowledge from time-lapse videos and implements metamorphic generation. First, we design a MagicAdapter scheme to decouple spatial and temporal training, encode more physical knowledge from metamorphic videos, and transform pre-trained T2V models to generate metamorphic videos. Second, we introduce a Dynamic Frames Extraction strategy to adapt to metamorphic time-lapse videos, which have a wider variation range and cover dramatic object metamorphic processes, thus embodying more physical knowledge than general videos. Finally, we introduce a Magic Text-Encoder to improve the understanding of metamorphic video prompts. Furthermore, we create a time-lapse video-text dataset called ChronoMagic, specifically curated to unlock the metamorphic video generation ability. Extensive experiments demonstrate the superiority and effectiveness of MagicTime for generating high-quality and dynamic metamorphic videos, suggesting time-lapse video generation is a promising path toward building metamorphic simulators of the physical world.",2404.05014,llms,other
7,MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding,"Authors:Bo He,Hengduo Li,Young Kyun Jang,Menglin Jia,Xuefei Cao,Ashish Shah,Abhinav Shrivastava,Ser-Nam Lim","With the success of large language models (LLMs), integrating the vision model into LLMs to build vision-language foundation models has gained much more interest recently. However, existing LLM-based large multimodal models (e.g., Video-LLaMA, VideoChat) can only take in a limited number of frames for short video understanding. In this study, we mainly focus on designing an efficient and effective model for long-term video understanding. Instead of trying to process more frames simultaneously like most existing work, we propose to process videos in an online manner and store past video information in a memory bank. This allows our model to reference historical video content for long-term analysis without exceeding LLMs' context length constraints or GPU memory limits. Our memory bank can be seamlessly integrated into current multimodal LLMs in an off-the-shelf manner. We conduct extensive experiments on various video understanding tasks, such as long-video understanding, video question answering, and video captioning, and our model can achieve state-of-the-art performances across multiple datasets. Code available at https://boheumd.github.io/MA-LMM/.",2404.05726,llms,other
8,PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual Observations,"Authors:Yang Zheng,Qingqing Zhao,Guandao Yang,Wang Yifan,Donglai Xiang,Florian Dubost,Dmitry Lagun,Thabo Beeler,Federico Tombari,Leonidas Guibas,Gordon Wetzstein","Modeling and rendering photorealistic avatars is of crucial importance in many applications. Existing methods that build a 3D avatar from visual observations, however, struggle to reconstruct clothed humans. We introduce PhysAvatar, a novel framework that combines inverse rendering with inverse physics to automatically estimate the shape and appearance of a human from multi-view video data along with the physical parameters of the fabric of their clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for spatio-temporal mesh tracking as well as a physically based inverse renderer to estimate the intrinsic material properties. PhysAvatar integrates a physics simulator to estimate the physical parameters of the garments using gradient-based optimization in a principled manner. These novel capabilities enable PhysAvatar to create high-quality novel-view renderings of avatars dressed in loose-fitting clothes under motions and lighting conditions not seen in the training data. This marks a significant advancement towards modeling photorealistic digital humans using physically based inverse rendering with physics in the loop. Our project website is at: https://qingqing-zhao.github.io/PhysAvatar",2404.04421,other,other
9,YaART: Yet Another ART Rendering Technology,"Authors:Sergey Kastryulin,Artem Konev,Alexander Shishenya,Eugene Lyapustin,Artem Khurshudov,Alexander Tselousov,Nikita Vinokurov,Denis Kuznedelev,Alexander Markovich,Grigoriy Livshits,Alexey Kirillov,Anastasiia Tabisheva,Liubov Chubarova,Marina Kaminskaia,Alexander Ustyuzhanin,Artemii Shvetsov,Daniil Shlenskii,Valerii Startsev,Dmitrii Kornilov,Mikhail Romanov,Artem Babenko,Sergei Ovcharenko+1 authors","In the rapidly progressing field of generative models, the development of efficient and high-fidelity text-to-image diffusion systems represents a significant frontier. This study introduces YaART, a novel production-grade text-to-image cascaded diffusion model aligned to human preferences using Reinforcement Learning from Human Feedback (RLHF). During the development of YaART, we especially focus on the choices of the model and training dataset sizes, the aspects that were not systematically investigated for text-to-image cascaded diffusion models before. In particular, we comprehensively analyze how these choices affect both the efficiency of the training process and the quality of the generated images, which are highly important in practice. Furthermore, we demonstrate that models trained on smaller datasets of higher-quality images can successfully compete with those trained on larger datasets, establishing a more efficient scenario of diffusion models training. From the quality perspective, YaART is consistently preferred by users over many existing state-of-the-art models.",2404.05666,computer vision,other


- OK, looks like the regex pattern is more robust, if not perfect (RNN still a bit of a problem)

### LLM classification approach

In [55]:
# abstract = r"""The intensive computational burden of Stable Diffusion (SD) for text-to-image generation poses a significant hurdle for its practical application. To tackle this challenge, recent research focuses on methods to reduce sampling steps, such as Latent Consistency Model (LCM), and on employing architectural optimizations, including pruning and knowledge distillation. Diverging from existing approaches, we uniquely start with a compact SD variant, BK-SDM. We observe that directly applying LCM to BK-SDM with commonly used crawled datasets yields unsatisfactory results. It leads us to develop two strategies: (1) leveraging high-quality image-text pairs from leading generative models and (2) designing an advanced distillation process tailored for LCM. Through our thorough exploration of quantization, profiling, and on-device deployment, we achieve rapid generation of photo-realistic, text-aligned images in just two steps, with latency under one second on resource-limited edge devices."""
# classify_topic(abstract)
# pd.set_option('display.max_colwidth', 300)
# new_articles.loc[10:20,'abstract']
# new_articles.loc[10:20,'abstract'].map(classify_topic)

In [200]:
new_articles['topic']=new_articles.apply(classify_topic_re,axis=1)

In [201]:
new_articles.shape

(69, 6)

In [202]:
new_articles.head(10)

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,topic_re
0,Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs,"Authors:Keen You,Haotian Zhang,Eldon Schoop,Floris Weers,Amanda Swearngin,Jeffrey Nichols,Yinfei Yang,Zhe Gan","Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these general-domain MLLMs often fall short in their ability to comprehend and interact effectively with user interface (UI) screens. In this paper, we present Ferret-UI, a new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities. Given that UI screens typically exhibit a more elongated aspect ratio and contain smaller objects of interest (e.g., icons, texts) than natural images, we incorporate ""any resolution"" on top of Ferret to magnify details and leverage enhanced visual features. Specifically, each screen is divided into 2 sub-images based on the original aspect ratio (i.e., horizontal division for portrait screens and vertical division for landscape screens). Both sub-images are encoded separately before being sent to LLMs. We meticulously gather training samples from an extensive range of elementary UI tasks, such as icon recognition, find text, and widget listing. These samples are formatted for instruction-following with region annotations to facilitate precise referring and grounding. To augment the model's reasoning ability, we further compile a dataset for advanced tasks, including detailed description, perception/interaction conversations, and function inference. After training on the curated datasets, Ferret-UI exhibits outstanding comprehension of UI screens and the capability to execute open-ended instructions. For model evaluation, we establish a comprehensive benchmark encompassing all the aforementioned tasks. Ferret-UI excels not only beyond most open-source UI MLLMs, but also surpasses GPT-4V on all the elementary UI tasks.",2404.05719,llms,llms
1,"ByteEdit: Boost, Comply and Accelerate Generative Image Editing","Authors:Yuxi Ren,Jie Wu,Yanzuo Lu,Huafeng Kuang,Xin Xia,Xionghui Wang,Qianqian Wang,Yixing Zhu,Pan Xie,Shiyin Wang,Xuefeng Xiao,Yitong Wang,Min Zheng,Lean Fu","Recent advancements in diffusion-based generative image editing have sparked a profound revolution, reshaping the landscape of image outpainting and inpainting tasks. Despite these strides, the field grapples with inherent challenges, including: i) inferior quality; ii) poor consistency; iii) insufficient instrcution adherence; iv) suboptimal generation efficiency. To address these obstacles, we present ByteEdit, an innovative feedback learning framework meticulously designed to Boost, Comply, and Accelerate Generative Image Editing tasks. ByteEdit seamlessly integrates image reward models dedicated to enhancing aesthetics and image-text alignment, while also introducing a dense, pixel-level reward model tailored to foster coherence in the output. Furthermore, we propose a pioneering adversarial and progressive feedback learning strategy to expedite the model's inference speed. Through extensive large-scale user evaluations, we demonstrate that ByteEdit surpasses leading generative image editing products, including Adobe, Canva, and MeiTu, in both generation quality and consistency. ByteEdit-Outpainting exhibits a remarkable enhancement of 388% and 135% in quality and consistency, respectively, when compared to the baseline model. Experiments also verfied that our acceleration models maintains excellent performance results in terms of quality and consistency.",2404.0486,other,other
2,SpatialTracker: Tracking Any 2D Pixels in 3D Space,"Authors:Yuxi Xiao,Qianqian Wang,Shangzhan Zhang,Nan Xue,Sida Peng,Yujun Shen,Xiaowei Zhou","Recovering dense and long-range pixel motion in videos is a challenging problem. Part of the difficulty arises from the 3D-to-2D projection process, leading to occlusions and discontinuities in the 2D motion domain. While 2D motion can be intricate, we posit that the underlying 3D motion can often be simple and low-dimensional. In this work, we propose to estimate point trajectories in 3D space to mitigate the issues caused by image projection. Our method, named SpatialTracker, lifts 2D pixels to 3D using monocular depth estimators, represents the 3D content of each frame efficiently using a triplane representation, and performs iterative updates using a transformer to estimate 3D trajectories. Tracking in 3D allows us to leverage as-rigid-as-possible (ARAP) constraints while simultaneously learning a rigidity embedding that clusters pixels into different rigid parts. Extensive evaluation shows that our approach achieves state-of-the-art tracking performance both qualitatively and quantitatively, particularly in challenging scenarios such as out-of-plane rotation.",2404.04319,other,other
3,SwapAnything: Enabling Arbitrary Object Swapping in Personalized Visual Editing,"Authors:Jing Gu,Yilin Wang,Nanxuan Zhao,Wei Xiong,Qing Liu,Zhifei Zhang,He Zhang,Jianming Zhang,HyunJoon Jung,Xin Eric Wang","Effective editing of personal content holds a pivotal role in enabling individuals to express their creativity, weaving captivating narratives within their visual stories, and elevate the overall quality and impact of their visual content. Therefore, in this work, we introduce SwapAnything, a novel framework that can swap any objects in an image with personalized concepts given by the reference, while keeping the context unchanged. Compared with existing methods for personalized subject swapping, SwapAnything has three unique advantages: (1) precise control of arbitrary objects and parts rather than the main subject, (2) more faithful preservation of context pixels, (3) better adaptation of the personalized concept to the image. First, we propose targeted variable swapping to apply region control over latent feature maps and swap masked variables for faithful context preservation and initial semantic concept swapping. Then, we introduce appearance adaptation, to seamlessly adapt the semantic concept into the original image in terms of target location, shape, style, and content during the image generation process. Extensive results on both human and automatic evaluation demonstrate significant improvements of our approach over baseline methods on personalized swapping. Furthermore, SwapAnything shows its precise and faithful swapping abilities across single object, multiple objects, partial object, and cross-domain swapping tasks. SwapAnything also achieves great performance on text-based swapping and tasks beyond swapping such as object insertion.",2404.05717,other,other
4,BeyondScene: Higher-Resolution Human-Centric Scene Generation With Pretrained Diffusion,"Authors:Gwanghyun Kim,Hayeon Kim,Hoigi Seo,Dong Un Kang,Se Young Chun","Generating higher-resolution human-centric scenes with details and controls remains a challenge for existing text-to-image diffusion models. This challenge stems from limited training image size, text encoder capacity (limited tokens), and the inherent difficulty of generating complex scenes involving multiple humans. While current methods attempted to address training size limit only, they often yielded human-centric scenes with severe artifacts. We propose BeyondScene, a novel framework that overcomes prior limitations, generating exquisite higher-resolution (over 8K) human-centric scenes with exceptional text-image correspondence and naturalness using existing pretrained diffusion models. BeyondScene employs a staged and hierarchical approach to initially generate a detailed base image focusing on crucial elements in instance creation for multiple humans and detailed descriptions beyond token limit of diffusion model, and then to seamlessly convert the base image to a higher-resolution output, exceeding training image size and incorporating details aware of text and instances via our novel instance-aware hierarchical enlargement process that consists of our proposed high-frequency injected forward diffusion and adaptive joint diffusion. BeyondScene surpasses existing methods in terms of correspondence with detailed text descriptions and naturalness, paving the way for advanced applications in higher-resolution human-centric scene creation beyond the capacity of pretrained diffusion models without costly retraining. Project page: https://janeyeon.github.io/beyond-scene.",2404.04544,other,other
5,UniFL: Improve Stable Diffusion via Unified Feedback Learning,"Authors:Jiacheng Zhang,Jie Wu,Yuxi Ren,Xin Xia,Huafeng Kuang,Pan Xie,Jiashi Li,Xuefeng Xiao,Weilin Huang,Min Zheng,Lean Fu,Guanbin Li","Diffusion models have revolutionized the field of image generation, leading to the proliferation of high-quality models and diverse downstream applications. However, despite these significant advancements, the current competitive solutions still suffer from several limitations, including inferior visual quality, a lack of aesthetic appeal, and inefficient inference, without a comprehensive solution in sight. To address these challenges, we present UniFL, a unified framework that leverages feedback learning to enhance diffusion models comprehensively. UniFL stands out as a universal, effective, and generalizable solution applicable to various diffusion models, such as SD1.5 and SDXL. Notably, UniFL incorporates three key components: perceptual feedback learning, which enhances visual quality; decoupled feedback learning, which improves aesthetic appeal; and adversarial feedback learning, which optimizes inference speed. In-depth experiments and extensive user studies validate the superior performance of our proposed method in enhancing both the quality of generated models and their acceleration. For instance, UniFL surpasses ImageReward by 17% user preference in terms of generation quality and outperforms LCM and SDXL Turbo by 57% and 20% in 4-step inference. Moreover, we have verified the efficacy of our approach in downstream tasks, including Lora, ControlNet, and AnimateDiff.",2404.05595,other,other
6,MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators,"Authors:Shenghai Yuan,Jinfa Huang,Yujun Shi,Yongqi Xu,Ruijie Zhu,Bin Lin,Xinhua Cheng,Li Yuan,Jiebo Luo","Recent advances in Text-to-Video generation (T2V) have achieved remarkable success in synthesizing high-quality general videos from textual descriptions. A largely overlooked problem in T2V is that existing models have not adequately encoded physical knowledge of the real world, thus generated videos tend to have limited motion and poor variations. In this paper, we propose MagicTime, a metamorphic time-lapse video generation model, which learns real-world physics knowledge from time-lapse videos and implements metamorphic generation. First, we design a MagicAdapter scheme to decouple spatial and temporal training, encode more physical knowledge from metamorphic videos, and transform pre-trained T2V models to generate metamorphic videos. Second, we introduce a Dynamic Frames Extraction strategy to adapt to metamorphic time-lapse videos, which have a wider variation range and cover dramatic object metamorphic processes, thus embodying more physical knowledge than general videos. Finally, we introduce a Magic Text-Encoder to improve the understanding of metamorphic video prompts. Furthermore, we create a time-lapse video-text dataset called ChronoMagic, specifically curated to unlock the metamorphic video generation ability. Extensive experiments demonstrate the superiority and effectiveness of MagicTime for generating high-quality and dynamic metamorphic videos, suggesting time-lapse video generation is a promising path toward building metamorphic simulators of the physical world.",2404.05014,other,other
7,MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding,"Authors:Bo He,Hengduo Li,Young Kyun Jang,Menglin Jia,Xuefei Cao,Ashish Shah,Abhinav Shrivastava,Ser-Nam Lim","With the success of large language models (LLMs), integrating the vision model into LLMs to build vision-language foundation models has gained much more interest recently. However, existing LLM-based large multimodal models (e.g., Video-LLaMA, VideoChat) can only take in a limited number of frames for short video understanding. In this study, we mainly focus on designing an efficient and effective model for long-term video understanding. Instead of trying to process more frames simultaneously like most existing work, we propose to process videos in an online manner and store past video information in a memory bank. This allows our model to reference historical video content for long-term analysis without exceeding LLMs' context length constraints or GPU memory limits. Our memory bank can be seamlessly integrated into current multimodal LLMs in an off-the-shelf manner. We conduct extensive experiments on various video understanding tasks, such as long-video understanding, video question answering, and video captioning, and our model can achieve state-of-the-art performances across multiple datasets. Code available at https://boheumd.github.io/MA-LMM/.",2404.05726,other,other
8,PhysAvatar: Learning the Physics of Dressed 3D Avatars from Visual Observations,"Authors:Yang Zheng,Qingqing Zhao,Guandao Yang,Wang Yifan,Donglai Xiang,Florian Dubost,Dmitry Lagun,Thabo Beeler,Federico Tombari,Leonidas Guibas,Gordon Wetzstein","Modeling and rendering photorealistic avatars is of crucial importance in many applications. Existing methods that build a 3D avatar from visual observations, however, struggle to reconstruct clothed humans. We introduce PhysAvatar, a novel framework that combines inverse rendering with inverse physics to automatically estimate the shape and appearance of a human from multi-view video data along with the physical parameters of the fabric of their clothes. For this purpose, we adopt a mesh-aligned 4D Gaussian technique for spatio-temporal mesh tracking as well as a physically based inverse renderer to estimate the intrinsic material properties. PhysAvatar integrates a physics simulator to estimate the physical parameters of the garments using gradient-based optimization in a principled manner. These novel capabilities enable PhysAvatar to create high-quality novel-view renderings of avatars dressed in loose-fitting clothes under motions and lighting conditions not seen in the training data. This marks a significant advancement towards modeling photorealistic digital humans using physically based inverse rendering with physics in the loop. Our project website is at: https://qingqing-zhao.github.io/PhysAvatar",2404.04421,other,other
9,YaART: Yet Another ART Rendering Technology,"Authors:Sergey Kastryulin,Artem Konev,Alexander Shishenya,Eugene Lyapustin,Artem Khurshudov,Alexander Tselousov,Nikita Vinokurov,Denis Kuznedelev,Alexander Markovich,Grigoriy Livshits,Alexey Kirillov,Anastasiia Tabisheva,Liubov Chubarova,Marina Kaminskaia,Alexander Ustyuzhanin,Artemii Shvetsov,Daniil Shlenskii,Valerii Startsev,Dmitrii Kornilov,Mikhail Romanov,Artem Babenko,Sergei Ovcharenko+1 authors","In the rapidly progressing field of generative models, the development of efficient and high-fidelity text-to-image diffusion systems represents a significant frontier. This study introduces YaART, a novel production-grade text-to-image cascaded diffusion model aligned to human preferences using Reinforcement Learning from Human Feedback (RLHF). During the development of YaART, we especially focus on the choices of the model and training dataset sizes, the aspects that were not systematically investigated for text-to-image cascaded diffusion models before. In particular, we comprehensively analyze how these choices affect both the efficiency of the training process and the quality of the generated images, which are highly important in practice. Furthermore, we demonstrate that models trained on smaller datasets of higher-quality images can successfully compete with those trained on larger datasets, establishing a more efficient scenario of diffusion models training. From the quality perspective, YaART is consistently preferred by users over many existing state-of-the-art models.",2404.05666,other,other


In [203]:
new_articles['topic'].value_counts()

topic
other    44
llms     25
Name: count, dtype: int64

In [204]:
new_articles[new_articles['topic']=='llms'].shape

(25, 6)

In [205]:
new_articles.columns

Index(['title', 'authors', 'abstract', 'arxiv_abbrev', 'topic', 'topic_re'], dtype='object')

In [207]:
llm_articles = new_articles[new_articles['topic']=='llms'].loc[:,['title', 'authors', 'abstract', 'arxiv_abbrev', 'topic_re']]
llm_articles.columns=['title', 'authors', 'abstract', 'arxiv_abbrev',  'topic']

In [208]:
llm_articles.tail()

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic
55,Dataset Reset Policy Optimization for RLHF,"Authors:Jonathan D. Chang,Wenhao Shan,Owen Oertell,Kianté Brantley,Dipendra Misra,Jason D. Lee,Wen Sun","Reinforcement Learning (RL) from Human Preference-based feedback is a popular paradigm for fine-tuning generative models, which has produced impressive models such as GPT-4 and Claude3 Opus. This framework often consists of two steps: learning a reward model from an offline preference dataset followed by running online RL to optimize the learned reward model. In this work, leveraging the idea of reset, we propose a new RLHF algorithm with provable guarantees. Motivated by the fact that offline preference dataset provides informative states (i.e., data that is preferred by the labelers), our new algorithm, Dataset Reset Policy Optimization (DR-PO), integrates the existing offline preference dataset into the online policy training procedure via dataset reset: it directly resets the policy optimizer to the states in the offline dataset, instead of always starting from the initial state distribution. In theory, we show that DR-PO learns to perform at least as good as any policy that is covered by the offline dataset under general function approximation with finite sample complexity. In experiments, we demonstrate that on both the TL;DR summarization and the Anthropic Helpful Harmful (HH) dataset, the generation from DR-PO is better than that from Proximal Policy Optimization (PPO) and Direction Preference Optimization (DPO), under the metric of GPT4 win-rate. Code for this work can be found at https://github.com/Cornell-RL/drpo.",2404.08495,llms
57,Learn Your Reference Model for Real Good Alignment,"Authors:Alexey Gorbatovski,Boris Shaposhnikov,Alexey Malakhov,Nikita Surnachev,Yaroslav Aksenov,Ian Maksimov,Nikita Balagansky,Daniil Gavrilov","The complexity of the alignment problem stems from the fact that existing methods are unstable. Researchers continuously invent various tricks to address this shortcoming. For instance, in the fundamental Reinforcement Learning From Human Feedback (RLHF) technique of Language Model alignment, in addition to reward maximization, the Kullback-Leibler divergence between the trainable policy and the SFT policy is minimized. This addition prevents the model from being overfitted to the Reward Model (RM) and generating texts that are out-of-domain for the RM. The Direct Preference Optimization (DPO) method reformulates the optimization task of RLHF and eliminates the Reward Model while tacitly maintaining the requirement for the policy to be close to the SFT policy. In our paper, we argue that this implicit limitation in the DPO method leads to sub-optimal results. We propose a new method called Trust Region DPO (TR-DPO), which updates the reference policy during training. With such a straightforward update, we demonstrate the effectiveness of TR-DPO against DPO on the Anthropic HH and TLDR datasets. We show that TR-DPO outperforms DPO by up to 19%, measured by automatic evaluation with GPT-4. The new alignment approach that we propose allows us to improve the quality of models across several parameters at once, such as coherence, correctness, level of detail, helpfulness, and harmlessness.",2404.09656,llms
58,Megalodon: Efficient LLM Pretraining and Inference with Unlimited Context Length,"Authors:Xuezhe Ma,Xiaomeng Yang,Wenhan Xiong,Beidi Chen,Lili Yu,Hao Zhang,Jonathan May,Luke Zettlemoyer,Omer Levy,Chunting Zhou","The quadratic complexity and weak length extrapolation of Transformers limits their ability to scale to long sequences, and while sub-quadratic solutions like linear attention and state space models exist, they empirically underperform Transformers in pretraining efficiency and downstream task accuracy. We introduce Megalodon, a neural architecture for efficient sequence modeling with unlimited context length. Megalodon inherits the architecture of Mega (exponential moving average with gated attention), and further introduces multiple technical components to improve its capability and stability, including complex exponential moving average (CEMA), timestep normalization layer, normalized attention mechanism and pre-norm with two-hop residual configuration. In a controlled head-to-head comparison with Llama2, Megalodon achieves better efficiency than Transformer in the scale of 7 billion parameters and 2 trillion training tokens. Megalodon reaches a training loss of 1.70, landing mid-way between Llama2-7B (1.75) and 13B (1.67). Code: https://github.com/XuezheMax/megalodon",2404.08801,llms
59,TransformerFAM: Feedback attention is working memory,"Authors:Dongseong Hwang,Weiran Wang,Zhuoyuan Huo,Khe Chai Sim,Pedro Moreno Mengibar","While Transformers have revolutionized deep learning, their quadratic attention complexity hinders their ability to process infinitely long inputs. We propose Feedback Attention Memory (FAM), a novel Transformer architecture that leverages a feedback loop to enable the network to attend to its own latent representations. This design fosters the emergence of working memory within the Transformer, allowing it to process indefinitely long sequences. TransformerFAM requires no additional weights, enabling seamless integration with pre-trained models. Our experiments show that TransformerFAM significantly improves Transformer performance on long-context tasks across various model sizes (1B, 8B, and 24B). These results showcase the potential to empower Large Language Models (LLMs) to process sequences of unlimited length.",2404.09173,llms
60,Compression Represents Intelligence Linearly,"Authors:Yuzhen Huang,Jinghan Zhang,Zifei Shan,Junxian He","There is a belief that learning to compress well will lead to intelligence. Recently, language modeling has been shown to be equivalent to compression, which offers a compelling rationale for the success of large language models (LLMs): the development of more advanced language models is essentially enhancing compression which facilitates intelligence. Despite such appealing discussions, little empirical evidence is present for the interplay between compression and intelligence. In this work, we examine their relationship in the context of LLMs, treating LLMs as data compressors. Given the abstract concept of ""intelligence"", we adopt the average downstream benchmark scores as a surrogate, specifically targeting intelligence related to knowledge and commonsense, coding, and mathematical reasoning. Across 12 benchmarks, our study brings together 30 public LLMs that originate from diverse organizations. Remarkably, we find that LLMs' intelligence -- reflected by average benchmark scores -- almost linearly correlates with their ability to compress external text corpora. These results provide concrete evidence supporting the belief that superior compression indicates greater intelligence. Furthermore, our findings suggest that compression efficiency, as an unsupervised metric derived from raw text corpora, serves as a reliable evaluation measure that is linearly associated with the model capabilities. We open-source our compression datasets as well as our data collection pipelines to facilitate future researchers to assess compression properly.",2404.09937,llms


In [209]:
llm_articles.to_csv('/home/mainuser/Desktop/LLMs/RagOverArXiv/data/llm_articles_up_to_2024-04-16.csv',index=False)

In [210]:
import pandas as pd
llm_articles=pd.read_csv('/home/mainuser/Desktop/LLMs/RagOverArXiv/data/llm_articles_up_to_2024-04-16.csv')

In [212]:
llm_articles.shape

(25, 5)

In [213]:
import numpy as np

In [214]:
docs[0].metadata

{'title': 'RAVEN: In-Context Learning with Retrieval-Augmented Encoder-Decoder  Language Models',
 'abbrev': '2308.07922',
 'vs_index': 0}

In [215]:

def get_embedding_distances(query,vector_store,docs,k=10):
    embedding_vector = core_embeddings_model.embed_query(query)
    docs = vector_store.similarity_search_by_vector(embedding_vector, k = k)
    vs_indices = [doc.metadata['vs_index'] for doc in docs]
  
    similar_embedding_vectors = np.array([vector_store.index.reconstruct_n(index_id, 1)[0] for index_id in vs_indices])

    distances = np.linalg.norm(similar_embedding_vectors-np.array(embedding_vector), axis=1)
    average_distance = np.mean(distances)
    return distances, average_distance



In [216]:
from functools import partial
get_dists = partial(get_embedding_distances,vector_store=vector_store,docs=docs,k=10) 
llm_articles['emb_dists'],llm_articles['avg_emb_dist']=zip(*llm_articles['abstract'].apply(lambda x: get_dists(x)))

In [217]:
pd.set_option('display.max_colwidth', 100)

In [218]:
llm_articles.head()

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,emb_dists,avg_emb_dist
0,Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs,"Authors:Keen You,Haotian Zhang,Eldon Schoop,Floris Weers,Amanda Swearngin,Jeffrey Nichols,Yinfei...","Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these...",2404.05719,llms,"[12.201016663073478, 12.421127832361341, 12.575758737000204, 12.700794308564443, 12.723025376277...",12.702875
1,LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders,"Authors:Parishad BehnamGhader,Vaibhav Adlakha,Marius Mosbach,Dzmitry Bahdanau,Nicolas Chapados,S...",Large decoder-only language models (LLMs) are the state-of-the-art models on most of today's NLP...,2404.05961,llms,"[11.03322129411231, 11.095819857319206, 11.380552810532274, 11.393827158066271, 11.4246071543606...",11.363708
2,Eagle and Finch: RWKV with Matrix-Valued States and Dynamic Recurrence,"Authors:Bo Peng,Daniel Goldstein,Quentin Anthony,Alon Albalak,Eric Alcaide,Stella Biderman,Eugen...","We present Eagle (RWKV-5) and Finch (RWKV-6), sequence models improving upon the RWKV (RWKV-4) a...",2404.05892,llms,"[11.345155257119119, 11.42845900639154, 11.589359122771011, 11.84329891427124, 11.85253527684826...",11.785847
3,MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies,"Authors:Shengding Hu,Yuge Tu,Xu Han,Chaoqun He,Ganqu Cui,Xiang Long,Zhi Zheng,Yewei Fang,Yuxiang...",The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameter...,2404.06395,llms,"[9.377453576628662, 9.488660525238432, 9.962195882307531, 10.013912430574708, 10.356499690987905...",10.225513
4,CodecLM: Aligning Language Models with Tailored Synthetic Data,"Authors:Zifeng Wang,Chun-Liang Li,Vincent Perot,Long T. Le,Jin Miao,Zizhao Zhang,Chen-Yu Lee,Tom...",Instruction tuning has emerged as the key in aligning large language models (LLMs) with specific...,2404.05875,llms,"[10.213641123462821, 10.663784239606466, 10.765694882610957, 10.858162046999393, 11.229226093562...",11.207188


In [219]:
llm_articles.avg_emb_dist.describe()

count    25.000000
mean     11.763598
std       1.051245
min      10.225513
25%      10.889350
50%      11.712150
75%      12.396669
max      13.670626
Name: avg_emb_dist, dtype: float64

In [220]:
llm_articles=llm_articles.sort_values('avg_emb_dist')

- Follow 80-20 heuristic for exploration/exploitation

In [225]:
eighty=llm_articles.iloc[:4,:]

In [231]:
twenty  = llm_articles[(llm_articles['avg_emb_dist']==llm_articles.avg_emb_dist.max())]

In [232]:
eighty_twenty= pd.concat([eighty,twenty],axis=0)

In [233]:
eighty_twenty

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,emb_dists,avg_emb_dist
3,MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies,"Authors:Shengding Hu,Yuge Tu,Xu Han,Chaoqun He,Ganqu Cui,Xiang Long,Zhi Zheng,Yewei Fang,Yuxiang...",The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameter...,2404.06395,llms,"[9.377453576628662, 9.488660525238432, 9.962195882307531, 10.013912430574708, 10.356499690987905...",10.225513
7,Elephants Never Forget: Memorization and Learning of Tabular Data in Large Language Models,"Authors:Sebastian Bordt,Harsha Nori,Vanessa Rodrigues,Besmira Nushi,Rich Caruana","While many have shown how Large Language Models (LLMs) can be applied to a diverse set of tasks,...",2404.06209,llms,"[8.700613066073558, 9.835016479979423, 10.060549197802118, 10.111659958813162, 10.21750851357972...",10.255892
19,Pre-training Small Base LMs with Fewer Tokens,"Authors:Sunny Sanyal,Sujay Sanghavi,Alexandros G. Dimakis",We study the effectiveness of a simple approach to develop a small base language model (LM) star...,2404.08634,llms,"[9.911934168192557, 9.990401982666498, 10.136721214046329, 10.315288327027822, 10.32599356209696...",10.308379
12,JetMoE: Reaching Llama2 Performance with 0.1M Dollars,"Authors:Yikang Shen,Zhen Guo,Tianle Cai,Zengyi Qin","Large Language Models (LLMs) have achieved remarkable results, but their increasing resource dem...",2404.07413,llms,"[9.239893119685544, 9.731755536464691, 9.959462180800749, 10.059882383775637, 10.286171968365627...",10.37727
20,Dataset Reset Policy Optimization for RLHF,"Authors:Jonathan D. Chang,Wenhao Shan,Owen Oertell,Kianté Brantley,Dipendra Misra,Jason D. Lee,W...",Reinforcement Learning (RL) from Human Preference-based feedback is a popular paradigm for fine-...,2404.08495,llms,"[10.601702051650602, 12.30378178252001, 12.862117251126232, 13.681003383608655, 14.1768027734208...",13.670626


In [222]:
llm_articles[(llm_articles['avg_emb_dist']==llm_articles.avg_emb_dist.min())
             |(llm_articles['avg_emb_dist']==llm_articles.avg_emb_dist.max())
             |(llm_articles['avg_emb_dist']==llm_articles.avg_emb_dist.median())]

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,emb_dists,avg_emb_dist
3,MiniCPM: Unveiling the Potential of Small Language Models with Scalable Training Strategies,"Authors:Shengding Hu,Yuge Tu,Xu Han,Chaoqun He,Ganqu Cui,Xiang Long,Zhi Zheng,Yewei Fang,Yuxiang...",The burgeoning interest in developing Large Language Models (LLMs) with up to trillion parameter...,2404.06395,llms,"[9.377453576628662, 9.488660525238432, 9.962195882307531, 10.013912430574708, 10.356499690987905...",10.225513
24,Compression Represents Intelligence Linearly,"Authors:Yuzhen Huang,Jinghan Zhang,Zifei Shan,Junxian He","There is a belief that learning to compress well will lead to intelligence. Recently, language m...",2404.09937,llms,"[10.502811529280036, 11.195357048912312, 11.317314878343582, 11.602831753990818, 11.900717587059...",11.71215
20,Dataset Reset Policy Optimization for RLHF,"Authors:Jonathan D. Chang,Wenhao Shan,Owen Oertell,Kianté Brantley,Dipendra Misra,Jason D. Lee,W...",Reinforcement Learning (RL) from Human Preference-based feedback is a popular paradigm for fine-...,2404.08495,llms,"[10.601702051650602, 12.30378178252001, 12.862117251126232, 13.681003383608655, 14.1768027734208...",13.670626


In [223]:
q25 = llm_articles.avg_emb_dist.quantile(0.25)
q80 = llm_articles.avg_emb_dist.quantile(0.80)
llm_articles[(abs(llm_articles['avg_emb_dist']-q80)<=0.3)
                ].sort_values('avg_emb_dist').iloc[0,:]

llm_articles[(llm_articles['avg_emb_dist']>=q80)
                ].sort_values('avg_emb_dist')

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,emb_dists,avg_emb_dist
0,Ferret-UI: Grounded Mobile UI Understanding with Multimodal LLMs,"Authors:Keen You,Haotian Zhang,Eldon Schoop,Floris Weers,Amanda Swearngin,Jeffrey Nichols,Yinfei...","Recent advancements in multimodal large language models (MLLMs) have been noteworthy, yet, these...",2404.05719,llms,"[12.201016663073478, 12.421127832361341, 12.575758737000204, 12.700794308564443, 12.723025376277...",12.702875
18,Transferable and Principled Efficiency for Open-Vocabulary Segmentation,"Authors:Jingxuan Xu,Wuyang Chen,Yao Zhao,Yunchao Wei",Recent success of pre-trained foundation vision-language models makes Open-Vocabulary Segmentati...,2404.07448,llms,"[12.573024188644053, 12.808905105192547, 12.833223308474425, 13.045494148331887, 13.072369410809...",13.177242
5,MuPT: A Generative Symbolic Music Pretrained Transformer,"Authors:Xingwei Qu,Yuelin Bai,Yinghao Ma,Ziya Zhou,Ka Man Lo,Jiaheng Liu,Ruibin Yuan,Lejun Min,X...","In this paper, we explore the application of Large Language Models (LLMs) to the pre-training of...",2404.06393,llms,"[12.939994273074173, 12.965719869471338, 12.998888916000036, 13.348547372236853, 13.364794536214...",13.385977
16,Audio Dialogues: Dialogues dataset for audio and music understanding,"Authors:Arushi Goel,Zhifeng Kong,Rafael Valle,Bryan Catanzaro",Existing datasets for audio understanding primarily focus on single-turn interactions (i.e. audi...,2404.07616,llms,"[13.309188048398507, 13.406342978533383, 13.540871335806123, 13.546039915070754, 13.551678477015...",13.54122
20,Dataset Reset Policy Optimization for RLHF,"Authors:Jonathan D. Chang,Wenhao Shan,Owen Oertell,Kianté Brantley,Dipendra Misra,Jason D. Lee,W...",Reinforcement Learning (RL) from Human Preference-based feedback is a popular paradigm for fine-...,2404.08495,llms,"[10.601702051650602, 12.30378178252001, 12.862117251126232, 13.681003383608655, 14.1768027734208...",13.670626


In [143]:
q25 = llm_articles.avg_emb_dist.quantile(0.25)
q75 = llm_articles.avg_emb_dist.quantile(0.75)
select_articles = llm_articles[(abs(llm_articles['avg_emb_dist']-q25)<=0.05)
                |(abs(llm_articles['avg_emb_dist']-q75)<=0.05)
                |(llm_articles['avg_emb_dist']==llm_articles.avg_emb_dist.min())].sort_values('avg_emb_dist')

- OK, seems like something of this nature would be what I want to send forward, seems like it aligns with papers I would want to look into from most similar to more novel

In [144]:
select_articles.to_csv('/home/mainuser/Desktop/LLMs/RagOverArXiv/data/chosen_llm_articles_up_to_2024-04-16.csv',index=False)

In [145]:
select_articles

Unnamed: 0,title,authors,abstract,arxiv_abbrev,topic,emb_dists,avg_emb_dist
27,Elephants Never Forget: Memorization and Learning of Tabular Data in Large Language Models,"Authors:Sebastian Bordt,Harsha Nori,Vanessa Rodrigues,Besmira Nushi,Rich Caruana","While many have shown how Large Language Models (LLMs) can be applied to a diverse set of tasks,...",2404.06209,llms,"[9.835016479979423, 10.060549197802118, 10.111659958813162, 10.21750851357972, 10.50181969852475...",10.621296
20,CodecLM: Aligning Language Models with Tailored Synthetic Data,"Authors:Zifeng Wang,Chun-Liang Li,Vincent Perot,Long T. Le,Jin Miao,Zizhao Zhang,Chen-Yu Lee,Tom...",Instruction tuning has emerged as the key in aligning large language models (LLMs) with specific...,2404.05875,llms,"[10.663784239606466, 10.858162046999393, 11.229226093562481, 11.415028708887158, 12.174792902537...",11.917534
16,LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders,"Authors:Parishad BehnamGhader,Vaibhav Adlakha,Marius Mosbach,Dzmitry Bahdanau,Nicolas Chapados,S...",Large decoder-only language models (LLMs) are the state-of-the-art models on most of today's NLP...,2404.05961,llms,"[11.03322129411231, 11.380552810532274, 11.393827158066271, 11.434131417362972, 11.4937417683813...",11.939184
23,SambaLingo: Teaching Large Language Models New Languages,"Authors:Zoltan Csaki,Bo Li,Jonathan Li,Qiantong Xu,Pian Pawakapan,Leon Zhang,Yun Du,Hengyu Zhao,...","Despite the widespread availability of LLMs, there remains a substantial gap in their capabiliti...",2404.05829,llms,"[10.871547833016765, 11.074968403235761, 11.632704566608224, 11.65514664340978, 12.0123631050087...",11.956805
17,InternLM-XComposer2-4KHD: A Pioneering Large Vision-Language Model Handling Resolutions from 336...,"Authors:Xiaoyi Dong,Pan Zhang,Yuhang Zang,Yuhang Cao,Bin Wang,Linke Ouyang,Songyang Zhang,Haodon...","The Large Vision-Language Model (LVLM) field has seen significant advancements, yet its progress...",2404.06512,llms,"[12.715346620159693, 13.665014824291674, 13.786537691953383, 14.305279699186768, 14.342509213306...",14.242317


In [252]:
# #abstract = r"""JWST is discovering a large population of z>4 supermassive black holes (SMBHs) that are overmassive with respect to the stellar content of their hosts. A previous study developed a physical model to interpret this overmassive population as the result of quasar feedback acting on a compact host galaxy. In this Note, we apply this model to JADES GN 1146115, a dormant supermassive black hole at z=6.7 whose mass is ∼40% of the host's mass in stars and accreting at ∼2% of the Eddington limit. The host has been forming stars at the low rate of ∼1M⊙yr−1 for the past ∼100 Myr. Our model suggests that this galactic system is on the verge of a resurgence of global star formation activity. This transition comes after a period of domination by the effect of its overmassive black hole, whose duration is comparable to typical quasar lifetimes. """
# abstract = r"""Text animation serves as an expressive medium, transforming static communication into dynamic experiences by infusing words with motion to evoke emotions, emphasize meanings, and construct compelling narratives. Crafting animations that are semantically aware poses significant challenges, demanding expertise in graphic design and animation. We present an automated text animation scheme, termed "Dynamic Typography", which combines two challenging tasks. It deforms letters to convey semantic meaning and infuses them with vibrant movements based on user prompts. Our technique harnesses vector graphics representations and an end-to-end optimization-based framework. This framework employs neural displacement fields to convert letters into base shapes and applies per-frame motion, encouraging coherence with the intended textual concept. Shape preservation techniques and perceptual loss regularization are employed to maintain legibility and structural integrity throughout the animation process. We demonstrate the generalizability of our approach across various text-to-video models and highlight the superiority of our end-to-end methodology over baseline methods, which might comprise separate tasks. Through quantitative and qualitative evaluations, we demonstrate the effectiveness of our framework in generating coherent text animations that faithfully interpret user prompts while maintaining readability. Our code is available at: https://animate-your-word.github.io/demo/."""
# #abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""
# #abstract = r"""Despite the impressive capabilities of Large Language Models (LLMs) on various tasks, they still struggle with scenarios that involves complex reasoning and planning. Recent work proposed advanced prompting techniques and the necessity of fine-tuning with high-quality data to augment LLMs' reasoning abilities. However, these approaches are inherently constrained by data availability and quality. In light of this, self-correction and self-learning emerge as viable solutions, employing strategies that allow LLMs to refine their outputs and learn from self-assessed rewards. Yet, the efficacy of LLMs in self-refining its response, particularly in complex reasoning and planning task, remains dubious. In this paper, we introduce AlphaLLM for the self-improvements of LLMs, which integrates Monte Carlo Tree Search (MCTS) with LLMs to establish a self-improving loop, thereby enhancing the capabilities of LLMs without additional annotations. Drawing inspiration from the success of AlphaGo, AlphaLLM addresses the unique challenges of combining MCTS with LLM for self-improvement, including data scarcity, the vastness search spaces of language tasks, and the subjective nature of feedback in language tasks. AlphaLLM is comprised of prompt synthesis component, an efficient MCTS approach tailored for language tasks, and a trio of critic models for precise feedback. Our experimental results in mathematical reasoning tasks demonstrate that AlphaLLM significantly enhances the performance of LLMs without additional annotations, showing the potential for self-improvement in LLMs."""

# topic_classification_prompt = f"""
# Your task is to take an arXiv abstract and classify the paper as LLMs (large langugage models) or other
# If you see 'LLMs' or 'language models' in the abstract, classify as 'LLMs'; else classify as 'other'
# Now here is the abstract: {abstract}
# Think step-by-step about your answer and check your answer before returning it and provide a brief explanation.


# Category:
# """
# #topic_classification_prompt = 
# ans = call_llm(question=topic_classification_prompt, generator=generator_llm,settings=generator_settings,max_new_tokens=24)[len(topic_classification_prompt):]
# #pattern = r'(Category|Answer):\s*(\w+)'
# pattern = r'(LLMs|diffusion|computer vision|multimodal)'
# match = re.search(pattern, ans,re.IGNORECASE)
# response =  match.group(1) if match else 'other'
# response

'LLMs'

In [285]:
# topic_classification_prompt
# ans
# abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""
# #pattern = r'(LLMs|diffusion|computer vision|multimodal)'
# pattern = r'(LLMs)'
# match = re.search(pattern, ans,re.IGNORECASE)
# response =  match.group(1) if match else 'other'
# response

# match

- Looks like can get Mistral exl2 to classify in .1-.2 seconds

- OK, trying old school route, not useful

In [168]:
# import gensim
# from gensim import corpora
# from gensim.models.ldamodel import LdaModel
# from nltk.tokenize import word_tokenize
# from nltk.corpus import stopwords
# import nltk

# # Download NLTK stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# # Sample abstract
# #abstract = "Your arXiv abstract text goes here."
# abstract = r"""We introduce Reka Core, Flash, and Edge, a series of powerful multimodal language models trained from scratch by Reka. Reka models are able to process and reason with text, images, video, and audio inputs. This technical report discusses details of training some of these models and provides comprehensive evaluation results. We show that Reka Edge and Reka Flash are not only state-of-the-art but also outperform many much larger models, delivering outsized values for their respective compute class. Meanwhile, our most capable and largest model, Reka Core, approaches the best frontier models on both automatic evaluations and blind human evaluations. On image question answering benchmarks (e.g. MMMU, VQAv2), Core performs competitively to GPT4-V. Meanwhile, on multimodal chat, Core ranks as the second most preferred model under a blind third-party human evaluation setup, outperforming other models such as Claude 3 Opus. On text benchmarks, Core not only performs competitively to other frontier models on a set of well-established benchmarks (e.g. MMLU, GSM8K) but also outperforms GPT4-0613 on human evaluation. On video question answering (Perception-Test), Core outperforms Gemini Ultra. Models are shipped in production at http://chat.reka.ai . A showcase of non cherry picked qualitative examples can also be found at http://showcase.reka.ai ."""

# # Preprocess the text
# def preprocess(text):
#     stop_words = set(stopwords.words('english'))
#     tokens = word_tokenize(text.lower())
#     return [token for token in tokens if token not in stop_words and token.isalpha()]

# # Tokenize the abstract
# tokenized_abstract = preprocess(abstract)

# # Create a dictionary representation of the documents
# dictionary = corpora.Dictionary([tokenized_abstract])

# # Create a bag-of-words representation of the documents
# corpus = [dictionary.doc2bow(tokenized_abstract)]

# # Train the LDA model
# lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=15)

# # Get the topic distribution for the abstract
# abstract_topics = lda_model.get_document_topics(corpus[0])

# # Print the topics with their probabilities
# for topic_id, prob in abstract_topics:
#     print(f"Topic ID: {topic_id} - Probability: {prob}")
#     topic = lda_model.print_topic(topic_id)
#     print(topic)

Topic ID: 9 - Probability: 0.9927418231964111
0.062*"models" + 0.046*"core" + 0.046*"reka" + 0.024*"also" + 0.024*"evaluation" + 0.024*"human" + 0.024*"benchmarks" + 0.016*"text" + 0.016*"model" + 0.016*"flash"


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mainuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mainuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# for i, doc in enumerate(docs):
#    #content = docs[index]
#    if i == 0:
#        vector_store = FAISS.from_documents(doc.__getattribute__('page_content'), embedder)
#    else:
#       vector_store_i = FAISS.from_documents(doc.__getattribute__('page_content'), embedder)
#       vector_store.merge_from(vector_store_i)

# vector_store

In [36]:
# for index, doc in enumerate(docs):
#    #content = docs[index]
#    if index == 0:
#        vector_store = FAISS.from_texts(doc.page_content, embedder)
#    else:
#       vector_store_i = FAISS.from_texts(doc.page_content, embedder)
#       vector_store.merge_from(vector_store_i)

# vector_store