In [None]:
# Tutorial for arXiv

In [1]:
# Read the API keys from the environment so that no secret is stored in the
# notebook itself. Export OPENAI_API_KEY and ANTHROPIC_API_KEY before starting
# the kernel; each variable below is None when its environment variable is unset.
import os

openai_api_key = os.environ.get("OPENAI_API_KEY")
claude_api_key = os.environ.get("ANTHROPIC_API_KEY")

In [2]:
import json
import sys
from IPython.display import display, Markdown, clear_output

import arxiv
import anthropic
from openai import OpenAI

##########################################################
## Functions to interact with ArXiv and cache papers    ##
##########################################################
def fetch_papers(query, category=None, n_papers=20):
    """Search arXiv and return lightweight metadata for the matching papers.

    Args:
        query: arXiv search expression, e.g. ``"au:blaiszik"``.
        category: Optional arXiv category. When truthy, `` AND cat:<category>``
            is appended to ``query``.
        n_papers: Maximum number of results requested from the API.

    Returns:
        A list with one dict per paper, holding the keys ``title``,
        ``abstract`` (newlines replaced by spaces), ``authors`` (list of
        author names) and ``link`` (the paper's ``entry_id``). Results are
        requested sorted by submission date.
    """
    api = arxiv.Client()

    # Restrict the search to one category only when a category was given.
    full_query = query + f" AND cat:{category}" if category else query

    request = arxiv.Search(
        query=full_query,
        max_results=n_papers,
        sort_by=arxiv.SortCriterion.SubmittedDate,
    )

    return [
        {
            'title': entry.title,
            'abstract': entry.summary.replace('\n', ' '),
            'authors': [person.name for person in entry.authors],
            'link': entry.entry_id,
        }
        for entry in api.results(request)
    ]

    
def write_papers(papers, filename="./arxiv_papers.jsonl"):
    """Serialize ``papers`` to ``filename`` as JSON Lines (one object per line).

    The file is opened in write mode, so any existing content is replaced.
    Swap this for a database write if you want something more interesting.
    """
    with open(filename, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in papers)


##########################################################
## Functions to get responses from OpenAI and Anthropic ##
##########################################################
def get_openai(prompt,
               model="gpt-3.5-turbo-0125",
               api_key=None,
               system_prompt="You are a helpful assistant.",
               max_tokens=2000,
               temperature=0.8):
    """Send a single-turn chat request to OpenAI and return the reply text.

    Args:
        prompt: Content of the user message.
        model: Model name, e.g. "gpt-4-turbo" or "gpt-3.5-turbo-0125".
        api_key: Key handed to the ``OpenAI`` client constructor.
        system_prompt: Content of the system message.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.

    Returns:
        ``message.content`` of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    request = dict(
        model=model,
        messages=conversation,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    completion = OpenAI(api_key=api_key).chat.completions.create(**request)
    return completion.choices[0].message.content

def get_anthropic(prompt, 
                  model="claude-3-haiku-20240307", 
                  api_key=None,
                  system_prompt="You are a helpful assistant.", 
                  max_tokens=2000, 
                  temperature=0):
    """Send a single-turn request to Anthropic and return the reply text.

    Args:
        prompt: Text of the user message.
        model: Model name. Other options: "claude-3-opus-20240229",
            "claude-3-sonnet-20240229", "claude-3-haiku-20240307".
        api_key: Key handed to the ``anthropic.Anthropic`` constructor.
        system_prompt: System prompt for the request.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.

    Returns:
        The ``text`` of the first content block of the response.
    """
    client = anthropic.Anthropic(
        api_key=api_key,
    )
    message = client.messages.create(
        model=model,
        # Forward the caller's settings; these were previously hardcoded to
        # 2000 and 0, so the arguments had no effect.
        max_tokens=max_tokens,
        temperature=temperature,
        system=f"{system_prompt}",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )
    return message.content[0].text

def get_streaming_anthropic(input_for_summary, 
                            model="claude-3-haiku-20240307", 
                            api_key=None,
                            system_prompt="You are a helpful assitant", 
                            max_tokens=2000, 
                            temperature=0):
    """Stream an Anthropic reply and render it live as Markdown.

    After each streamed text chunk the cell output is cleared and the text
    accumulated so far is re-rendered, so the answer appears to grow in place.

    Args:
        input_for_summary: Text of the user message.
        model: Model name.
        api_key: Key handed to the ``anthropic.Anthropic`` constructor.
        system_prompt: System prompt for the request.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.

    Returns:
        None. The reply is only displayed.
    """
    # Use the caller-supplied key; this previously read the notebook-level
    # ``claude_api_key`` global and silently ignored the ``api_key`` argument.
    client = anthropic.Anthropic(api_key=api_key)

    response = ""
    with client.messages.stream(
        max_tokens=max_tokens,
        temperature=temperature,
        system=f"{system_prompt}",
        messages=[{"role": "user", "content": input_for_summary}],
        model=model,
    ) as stream:
        for text in stream.text_stream:
            response += text
            # wait=True defers clearing until the next display call.
            clear_output(wait=True)
            display(Markdown(response))
            sys.stdout.flush()

In [3]:
!pip install anthropic
!pip install arxiv



In [4]:
# Fetch up to `n_papers` papers matching `query` and cache them on disk.
query = "au:blaiszik"
category = ""  # empty string: no category filter is added to the query
n_papers = 5
db_file = './arxiv_papers.jsonl'

papers = fetch_papers(query, category, n_papers)
print(f"== Fetched {len(papers)} papers from ArXiv. ==> ")

write_papers(papers, filename=db_file)
print(f"== Wrote papers to temporary cache at {db_file} ==>")

== Fetched 5 papers from ArXiv. ==> 
== Wrote papers to temporary cache at ./arxiv_papers.jsonl ==>


In [5]:
# Reload the cached records from disk: the file holds one JSON object per line.
papers = []
with open('./arxiv_papers.jsonl', 'r') as file:
    papers.extend(map(json.loads, file))

In [6]:
# List the title of every cached paper, each followed by a divider line.
for entry in papers:
    print(entry["title"], "------------------", sep="\n")

Machine Learning Materials Properties with Accurate Predictions, Uncertainty Estimates, Domain Guidance, and Persistent Online Accessibility
------------------
Twins in rotational spectroscopy: Does a rotational spectrum uniquely identify a molecule?
------------------
Trillion Parameter AI Serving Infrastructure for Scientific Discovery: A Survey and Vision
------------------
Accelerating Electronic Stopping Power Predictions by 10 Million Times with a Combination of Time-Dependent Density Functional Theory and Machine Learning
------------------
Towards a Modular Architecture for Science Factories
------------------


In [7]:
# Each paper is a dict with the keys: title, abstract, authors, link

In [8]:
# Instruction header of the summarization prompt; the individual papers are
# appended to this string in a later cell. The leading spaces on the
# continuation lines are inside the literal and are therefore part of the prompt.
input_for_summary = """Summarize the following arXiv papers at the level of an advanced Ph.D. student, making interconnections between the papers where possible.
                       First create a summary paragraph that includes the most important breakthroughs in the contained papers. 
                       Second, create a summary tweet thread describing the papers. 
                       Next, provide a section on interconnections where the information is grouped in a structured way that makes it easy to understand, rather than by each paper separately. 
                       Each paper should always be referenced by its link in markdown format. Be sure to include markdown style links to the papers and to references to the other papers. 
                       \n\n"""

In [9]:
# Append one <paper>...</paper> section per record to the prompt.
# This extends `input_for_summary` in place, so re-running the cell adds
# every paper a second time; re-run the cell that defines the prompt first.
for paper in papers:
    input_for_summary += (
        "<paper>\n"
        f"### {paper['title']}\n\n"
        f"**Abstract:** {paper['abstract']}\n\n"
        f"**Authors:** {', '.join(paper['authors'])}\n\n"
        f"[Link to paper]({paper['link']})\n\n"
        "</paper>\n\n"
    )

In [10]:
# Display the first cached record to inspect its fields.
papers[0] 

{'title': 'Machine Learning Materials Properties with Accurate Predictions, Uncertainty Estimates, Domain Guidance, and Persistent Online Accessibility',
 'abstract': 'One compelling vision of the future of materials discovery and design involves the use of machine learning (ML) models to predict materials properties and then rapidly find materials tailored for specific applications. However, realizing this vision requires both providing detailed uncertainty quantification (model prediction errors and domain of applicability) and making models readily usable. At present, it is common practice in the community to assess ML model performance only in terms of prediction accuracy (e.g., mean absolute error), while neglecting detailed uncertainty quantification and robust model accessibility and usability. Here, we demonstrate a practical method for realizing both uncertainty and accessibility features with a large set of models. We develop random forest ML models for 33 materials propertie

In [11]:
# Note: the instructions and all of the papers are combined into one single
# string, which is sent to the model as a single prompt.
print(input_for_summary)

Summarize the following arXiv papers at the level of an advanced Ph.D. student, making interconnections between the papers where possible.
                       First create a summary paragraph that includes the most important breakthroughs in the contained papers. 
                       Second, create a summary tweet thread describing the papers. 
                       Next, provide a section on interconnections where the information is grouped in a structured way that makes it easy to understand, rather than by each paper separately. 
                       Each paper should always be referenced by its link in markdown format. Be sure to include markdown style links to the papers and to references to the other papers. 
                       

<paper>
### Machine Learning Materials Properties with Accurate Predictions, Uncertainty Estimates, Domain Guidance, and Persistent Online Accessibility

**Abstract:** One compelling vision of the future of materials discovery and design inv

In [12]:
# The OpenAI API requires payment; here the call returns a 429 response, so it is left commented out.
# system_prompt = "You are modeling the mind of a researcher who has obtained a PhD in the field of study for the papers retrieved."

# # models "gpt-4-turbo", "gpt-3.5-turbo-0125"

# oai = get_openai(input_for_summary, 
#                  model="gpt-3.5-turbo-0125",
#                  system_prompt=system_prompt,
#                  max_tokens=2000,
#                  api_key=openai_api_key
#                 )
# display(Markdown(oai))

In [14]:
# Note: the Claude API only gives $5 of free credit; after that a purchase is needed.
# Other models: "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"
system_prompt = "You are modeling the mind of a researcher who has obtained a PhD in the field of study for the papers retrieved."

anth = get_anthropic(
    input_for_summary,
    model="claude-3-haiku-20240307",
    api_key=claude_api_key,
    system_prompt=system_prompt,
    max_tokens=2000,
    temperature=0,
)
display(Markdown(anth))

Summary Paragraph:

The papers presented here showcase significant breakthroughs in the field of materials science and scientific discovery. [Jacobs et al.](http://arxiv.org/abs/2406.15650v1) demonstrate a practical method for developing machine learning models that provide both accurate property predictions and detailed uncertainty quantification, making the models readily usable and accessible through the Garden-AI infrastructure. [Schwarting et al.](http://arxiv.org/abs/2404.04225v1) explore the inverse problem of determining molecular structures from rotational spectra, revealing the existence of "molecular twins" that have similar spectra but distinct structures, highlighting the need for increased accuracy in theoretical methods and experiments. [Hudson et al.](http://arxiv.org/abs/2402.03480v1) present a vision for a comprehensive software stack and interfaces to support the diverse and flexible requirements of researchers in the era of Trillion Parameter Models (TPMs). [Ward et al.](http://arxiv.org/abs/2311.00787v2) develop a method that combines time-dependent density functional theory and machine learning to dramatically accelerate the prediction of electronic stopping power, a crucial quantity for designing various technologies. Finally, [Vescovi et al.](http://arxiv.org/abs/2308.09793v2) propose a modular architecture for "science factories," large-scale, AI-enabled self-driving laboratories that can support a wide range of scientific applications and workflows.

Summary Tweet Thread:

1/ Exciting breakthroughs in materials science and scientific discovery! 🔬🧪
[Jacobs et al.](http://arxiv.org/abs/2406.15650v1) develop ML models with accurate predictions, uncertainty estimates, and easy accessibility through Garden-AI.

2/ [Schwarting et al.](http://arxiv.org/abs/2404.04225v1) explore the inverse problem of determining molecular structures from rotational spectra, revealing the existence of "molecular twins" with similar spectra but distinct structures.

3/ [Hudson et al.](http://arxiv.org/abs/2402.03480v1) present a vision for a comprehensive software stack to support the diverse needs of researchers in the era of Trillion Parameter Models (TPMs).

4/ [Ward et al.](http://arxiv.org/abs/2311.00787v2) develop a method combining TDDFT and ML to dramatically accelerate the prediction of electronic stopping power, a crucial quantity for various technologies.

5/ [Vescovi et al.](http://arxiv.org/abs/2308.09793v2) propose a modular architecture for "science factories" - large-scale, AI-enabled self-driving labs that can support a wide range of scientific applications and workflows.

Interconnections:

The papers presented here showcase a range of advancements in materials science and scientific discovery, with several interconnections and synergies.

**Uncertainty Quantification and Model Accessibility**:
The work by [Jacobs et al.](http://arxiv.org/abs/2406.15650v1) on developing machine learning models with calibrated uncertainty estimates and easy-to-use interfaces through the Garden-AI infrastructure is a crucial step towards making these powerful predictive tools more accessible and usable for researchers. This aligns with the vision presented by [Hudson et al.](http://arxiv.org/abs/2402.03480v1) for a comprehensive software stack to support the diverse needs of researchers working with large-scale AI models.

**Inverse Problems in Molecular Spectroscopy**:
The discovery of "molecular twins" by [Schwarting et al.](http://arxiv.org/abs/2404.04225v1) highlights the challenges in uniquely identifying molecular structures from rotational spectra, an inverse problem. This work underscores the need for increased accuracy in theoretical methods and experimental techniques, as mentioned by the authors. The ability to rapidly predict electronic stopping power using the combined TDDFT and ML approach developed by [Ward et al.](http://arxiv.org/abs/2311.00787v2) could potentially aid in the interpretation of molecular spectra and the identification of molecular structures.

**Towards Modular and Scalable Scientific Platforms**:
The modular architecture for "science factories" proposed by [Vescovi et al.](http://arxiv.org/abs/2308.09793v2) aligns with the broader vision of scalable, AI-enabled scientific platforms. The ability to reuse modules, workcells, and workflows across different applications, as demonstrated in their work, could facilitate the integration of the advanced predictive tools and models developed in the other papers, such as the machine learning models from [Jacobs et al.](http://arxiv.org/abs/2406.15650v1) and the electronic stopping power predictions from [Ward et al.](http://arxiv.org/abs/2311.00787v2).

Overall, these papers collectively highlight the significant progress being made in materials science and scientific discovery, with a focus on developing accurate predictive models, addressing inverse problems, and creating scalable, modular platforms to enable the next generation of scientific research and discovery.

In [15]:
# Same model as the previous call, but in streaming mode: the answer is
# rendered in real time while it is being generated.
# Other models: "claude-3-opus-20240229", "claude-3-sonnet-20240229", "claude-3-haiku-20240307"
get_streaming_anthropic(
    input_for_summary,
    api_key=claude_api_key,
    model="claude-3-haiku-20240307",
    system_prompt=system_prompt,
    temperature=0.5,
    max_tokens=750,
)

Summary Paragraph:

The presented papers showcase significant advancements in the field of machine learning (ML) for materials science and discovery. [Jacobs et al.](http://arxiv.org/abs/2406.15650v1) developed a practical method for providing detailed uncertainty quantification and robust accessibility for a large set of ML models predicting various materials properties, enabling more reliable and usable predictions. [Schwarting et al.](http://arxiv.org/abs/2404.04225v1) explored the inverse problem of determining molecular structures from rotational spectra, revealing the existence of "molecular twins" with similar spectra but distinct structures, highlighting the need for increased accuracy in theoretical methods or additional experiments. [Ward et al.](http://arxiv.org/abs/2311.00787v2) combined time-dependent density functional theory and machine learning to dramatically accelerate the prediction of electronic stopping power, a crucial quantity for designing nuclear reactors, medical treatments, and quantum materials. Finally, [Vescovi et al.](http://arxiv.org/abs/2308.09793v2) proposed a modular architecture for "science factories" - large, general-purpose, computation- and AI-enabled self-driving laboratories capable of supporting a wide range of scientific applications and workflows.

Summary Tweet Thread:

1/ Exciting breakthroughs in materials science and discovery! 🔬🤖
@RyanJacobs_UW et al. developed ML models with uncertainty quantification and accessibility for 33 materials properties. [http://arxiv.org/abs/2406.15650v1]

2/ @mschwarting et al. explored the inverse problem of determining molecular structures from rotational spectra, revealing the existence of "molecular twins" with similar spectra but distinct structures. [http://arxiv.org/abs/2404.04225v1]

3/ @LoganWard et al. combined TDDFT and ML to dramatically accelerate the prediction of electronic stopping power, a crucial quantity for nuclear, medical, and quantum materials design. [http://arxiv.org/abs/2311.00787v2]

4/ @RafaelVescovi et al. proposed a modular architecture for "science factories" - general-purpose, computation- and AI-enabled self-driving labs to support a wide range of scientific applications. [http://arxiv.org/abs/2308.09793v2]

Interconnections:

The presented papers showcase a range of advancements in the use of machine learning and computational techniques to accelerate scientific discovery and materials design.

**Uncertainty Quantification and Accessibility:**
[Jacobs et al.](http://arxiv.org/abs/2406.15650v1) developed a practical method for providing detailed uncertainty quantification and robust accessibility for a large set of ML models predicting various materials properties. This work is crucial for enabling reliable and usable ML predictions in materials science, which can then be leveraged by other researchers and applications.

**Inverse Problems in Molecular Structure Determination:**
[Schwarting et al.](http://arxiv.org/abs/2404.04225v1) explored the inverse problem of determining molecular structures from rotational spectra, revealing the existence of "molecular twins" with similar spectra but distinct structures. This highlights the need for increased accuracy in theoretical methods or additional experiments to uniqu