In [1]:
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
def fetch_papers(search_query='ti:llama', start=0, max_results=70):
    """Fetch papers from the arXiv API and return them as a list of strings.

    Args:
        search_query: arXiv query expression sent as ``search_query``.
        start: Value sent as the ``start`` query parameter.
        max_results: Value sent as the ``max_results`` query parameter.

    Returns:
        One string per Atom ``entry`` element in the response, formatted as
        ``"Title: <title>\\nSummary: <summary>\\n"``. A missing or empty
        title/summary element contributes an empty string.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        xml.etree.ElementTree.ParseError: If the response is not well-formed XML.
    """
    atom = '{http://www.w3.org/2005/Atom}'

    # safe=':' keeps the field prefix (e.g. "ti:") unescaped, so the default
    # arguments build exactly the URL that used to be hard-coded here.
    params = urllib.parse.urlencode(
        {'search_query': search_query, 'start': start, 'max_results': max_results},
        safe=':',
    )
    url = f'http://export.arxiv.org/api/query?{params}'

    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    root = ET.fromstring(data)

    papers_list = []
    for entry in root.findall(f'{atom}entry'):
        # findtext() falls back to the default instead of raising
        # AttributeError when an entry lacks the element.
        title = entry.findtext(f'{atom}title', default='')
        summary = entry.findtext(f'{atom}summary', default='')
        papers_list.append(f"Title: {title}\nSummary: {summary}\n")

    return papers_list

In [3]:
# Reuse the cached arXiv results when the CSV exists so a re-run does not hit
# the API again; otherwise fetch once and write the cache.
PAPERS_CSV = 'df_papers_llama2.csv'
try:
    df = pd.read_csv(PAPERS_CSV)
except FileNotFoundError:
    df = pd.DataFrame(fetch_papers(), columns=['TitleAbstract'])
    # index=False keeps the row index out of the file, so reading it back
    # does not add another "Unnamed: 0" column.
    df.to_csv(PAPERS_CSV, index=False)
df.head()

Unnamed: 0.1,Unnamed: 0,TitleAbstract
0,0,Title: Lawyer LLaMA Technical Report\nSummary:...
1,1,Title: Label Supervised LLaMA Finetuning\nSumm...
2,2,Title: LLAMA: Leveraging Learning to Automatic...
3,3,Title: Challenges and opportunities integratin...
4,4,Title: LLaMA: Open and Efficient Foundation La...


In [4]:
# Length in characters of the longest Title+Summary string.
df['TitleAbstract'].map(len).max()

2058

In [5]:
import re
def remove_linebreaks(input_string):
    """Return ``input_string`` with every newline or tab replaced by one space."""
    whitespace_pattern = re.compile(r'[\n\t]')
    return whitespace_pattern.sub(' ', input_string)
    
# Flatten each Title+Summary string onto a single line.
df['TitleAbstract'] = df['TitleAbstract'].map(remove_linebreaks)

import spacy

# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per kernel rather than once per call.
_SPACY_MODELS = {}


def _get_spacy_model(name="en_core_web_sm"):
    """Load the named spaCy pipeline on first use and reuse it afterwards."""
    if name not in _SPACY_MODELS:
        _SPACY_MODELS[name] = spacy.load(name)
    return _SPACY_MODELS[name]


def extract_keywords(text, nlp=None):
    """Return the nouns and adjectives of ``text`` joined by single spaces.

    Args:
        text: The string to analyse.
        nlp: Optional callable that turns ``text`` into an iterable of tokens
            exposing ``.text`` and ``.pos_``. Defaults to the cached
            ``en_core_web_sm`` spaCy pipeline.

    Returns:
        A string with the text of every token whose ``pos_`` is ``"NOUN"``
        or ``"ADJ"``, in document order, separated by spaces.
    """
    if nlp is None:
        nlp = _get_spacy_model()

    doc = nlp(text)
    return " ".join(token.text for token in doc if token.pos_ in ("NOUN", "ADJ"))


# Keyword string (nouns and adjectives) for every paper.
df['KeyWords'] = df['TitleAbstract'].map(extract_keywords)

# Length in characters of the longest keyword string.
df['KeyWords'].map(len).max()

1131

In [10]:
from transformers import BertTokenizer, BertModel, AutoModel, AutoTokenizer
import torch

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# loaded once per kernel rather than once per call.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Load the BERT tokenizer and model on first use and reuse them afterwards."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def get_bert_embedding(input_string, tokenizer=None, model=None):
    """Tokenize a string and run BERT to obtain its [CLS] embedding.

    Args:
        input_string: The text to embed.
        tokenizer: Optional callable ``tokenizer(text, **kwargs)`` returning a
            mapping of model inputs. Defaults to the cached
            ``bert-base-uncased`` tokenizer.
        model: Optional callable ``model(**inputs)`` whose result exposes
            ``last_hidden_state``. Defaults to the cached
            ``bert-base-uncased`` model.

    Returns:
        A 1-D numpy array: the last hidden state at position 0 of the first
        (only) sequence in the batch.
    """
    if tokenizer is None or model is None:
        default_tokenizer, default_model = _load_bert()
        if tokenizer is None:
            tokenizer = default_tokenizer
        if model is None:
            model = default_model

    # Truncate to 512 tokens: longer inputs exceed BERT's position embeddings.
    tokens = tokenizer(input_string, return_tensors='pt', truncation=True, max_length=512)

    # Inference only, so skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**tokens)

    # Position 0 of the last hidden layer is the [CLS] token.
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()

    return embeddings[0]

def cosine_similarity(embedding1, embedding2):
    """Calculate the cosine similarity between two embeddings.

    Args:
        embedding1: First embedding (numpy array or array-like).
        embedding2: Second embedding, with the same shape as ``embedding1``.

    Returns:
        The dot product of the embeddings divided by the product of their
        norms, or ``0.0`` when either embedding has zero norm.

    Raises:
        ValueError: If the two embeddings do not have the same shape.
    """
    # Accept lists/tuples as well as arrays; arrays pass through unchanged.
    embedding1 = np.asarray(embedding1)
    embedding2 = np.asarray(embedding2)

    if embedding1.shape != embedding2.shape:
        raise ValueError("Embeddings must have the same shape for cosine similarity calculation.")

    denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
    # A zero vector has no direction; return 0.0 rather than dividing by
    # zero, which would yield nan and a RuntimeWarning.
    if denominator == 0:
        return 0.0

    return np.dot(embedding1, embedding2.T) / denominator


In [12]:
# One BERT [CLS] embedding per paper, computed from its keyword string.
df['Embedding'] = df['KeyWords'].map(get_bert_embedding)

In [227]:
# Preview the first five rows, now including KeyWords and Embedding.
df.head(n=5)

Unnamed: 0,TitleAbstract,KeyWords,Embedding
0,Title: Lawyer LLaMA Technical Report Summary: ...,Title Large remarkable performance various tas...,"[-0.21004462, 0.5068891, 0.39497513, 0.3047706..."
1,Title: Label Supervised LLaMA Finetuning Summa...,Title recent success Large significant attenti...,"[-0.62233067, 0.41749716, 0.07669323, 0.112751..."
2,Title: LLAMA: Leveraging Learning to Automatic...,Title portfolio selection approaches remarkabl...,"[-0.8397804, 0.3386485, 0.20808932, 0.18489337..."
3,Title: Challenges and opportunities integratin...,Title Challenges opportunities transport simul...,"[-0.5284588, 0.3198053, 0.34352037, 0.10175034..."
4,Title: LLaMA: Open and Efficient Foundation La...,Title collection foundation language models 7B...,"[-0.48944178, -0.0042269714, -0.015640128, 0.0..."


In [24]:
# Candidate questions; the last entry is the one actually asked.
candidate_questions = [
    "Name at least 5 domain-specific LLMs that have been created by fine-tuning Llama-2.",
    "For which tasks has Llama-2 already been used successfully? What are promising areas of application for Llama-2?",
]
question = candidate_questions[-1]

TOP_K = 10  # number of most similar papers placed into the prompt

# Rank every paper by the cosine similarity of its embedding to the question's.
embedding_question = get_bert_embedding(question)
similarities = df['Embedding'].apply(
    lambda embedding_paper: cosine_similarity(embedding_question, embedding_paper)
)
index_match = similarities.sort_values(ascending=False).head(TOP_K).index
print(index_match)

# .loc selects rows and column in one step instead of chained indexing.
relevant_papers = "\n".join(df.loc[index_match, 'TitleAbstract'])

# Built from adjacent literals so the prompt contains exactly the text shown
# (no stray quote character or trailing whitespace).
query = (
    "Context information is below.\n"
    "---------------\n"
    f"Context:\n{relevant_papers}\n"
    "---------------\n"
    "Given the context information and prior knowledge, answer the following query.\n"
    f"Query: {question}"
)

query

Index([12, 24, 8, 17, 54, 25, 26, 4, 47, 35], dtype='int64')


"Context information is below.\n---------------\nTitle: HuaTuo: Tuning LLaMA Model with Chinese Medical Knowledge Summary:   Large Language Models (LLMs), such as the LLaMA model, have demonstrated their effectiveness in various general-domain natural language processing (NLP) tasks. Nevertheless, LLMs have not yet performed optimally in biomedical domain tasks due to the need for medical expertise in the responses. In response to this challenge, we propose HuaTuo, a LLaMA-based model that has been supervised-fine-tuned with generated QA (Question-Answer) instances. The experimental results demonstrate that HuaTuo generates responses that possess more reliable medical knowledge. Our proposed HuaTuo model is accessible at https://github.com/SCIR-HI/Huatuo-Llama-Med-Chinese.  \nTitle: Tamil-Llama: A New Tamil Language Model Based on Llama 2 Summary:   Language modeling has witnessed remarkable advancements in recent years, with Large Language Models (LLMs) like ChatGPT setting unparallel

In [25]:
from openai import OpenAI
# NOTE(review): with api_key=None the client presumably falls back to the
# OPENAI_API_KEY environment variable — confirm against the openai client docs.
client = OpenAI(api_key=None)

# Ask the question together with the retrieved paper context.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": query}],
    model="gpt-3.5-turbo",
    max_tokens=200,
)

response.choices[0].message.content

"From the given context, it is not explicitly mentioned which tasks Llama-2 has been used successfully for. However, based on the information provided, we can infer some potential areas of application where Llama-2 shows promise:\n\n1. Biomedical domain tasks: Llama-2 has been proposed to improve the performance of Large Language Models (LLMs) in biomedical domain tasks by fine-tuning it with generated Question-Answer (QA) instances. This approach resulted in more reliable medical knowledge in the model's responses.\n\n2. Tamil language tasks: Llama-2 has been enhanced with the addition of 16,000 Tamil tokens, aiming to achieve superior text generation and comprehension in the Tamil language. This suggests that Llama-2 can be successfully applied to tasks involving Tamil language processing.\n\n3. Dialogue-based tasks: Llama-2, specifically the Llama 2-Chat models, have been fine-tuned and optimized for dialogue use cases. These models outperform open-source chat"

In [217]:
from openai import OpenAI
# NOTE(review): with api_key=None the client presumably falls back to the
# OPENAI_API_KEY environment variable — confirm against the openai client docs.
client = OpenAI(api_key=None)

# Baseline: send the bare question, without any retrieved context.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": question}],
    model="gpt-3.5-turbo",
    max_tokens=200,
)

response.choices[0].message.content

'Llama-2 has been used successfully for various natural language processing (NLP) tasks, including:\n\n1. Language Modeling: Llama-2 has been trained on a large corpus of text to generate coherent and contextually accurate language. It can be used for tasks like text generation, completion, and paraphrasing.\n\n2. Text Classification: Llama-2 has shown promising results in classifying text into predefined categories or labels. It can be used for sentiment analysis, spam detection, topic classification, etc.\n\n3. Named Entity Recognition (NER): Llama-2 can extract specific information from text by identifying and classifying named entities such as names, dates, organizations, locations, etc.\n\n4. Summarization: Llama-2 has been used for automatic text summarization, condensing large documents or articles into concise summaries while retaining the key points.\n\n5. Machine Translation: Llama-2 can be utilized for machine translation tasks, enabling the translation of text between diffe