## Imports

In [1]:
from aux.aux_functions import *

In [2]:
import openai
from openai import OpenAI

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

from IPython.display import display

In [4]:
import os

openai.api_key = os.environ["OPENAI_API_KEY"]

In [9]:
import requests
import os
import tarfile

# List of arXiv IDs
arxiv_ids = ['2307.13614', '2304.01673', '2303.01548']  # Replace with actual arXiv IDs

# Directory to save the downloaded source files
save_dir = 'arxiv_sources'
os.makedirs(save_dir, exist_ok=True)

In [10]:
def extract_tex_files(tar_path, extract_to):
    """
    Extracts .tex files from a tar.gz archive.
    """
    with tarfile.open(tar_path, 'r:gz') as tar:
        # Filter for .tex files
        tex_files = [m for m in tar.getmembers() if m.name.endswith('.tex')]
        for member in tex_files:
            tar.extract(member, extract_to)

In [12]:
for arxiv_id in arxiv_ids:
    # Construct URL for the source file
    url = f'http://arxiv.org/e-print/{arxiv_id}'
    
    # Make the request and download the file
    response = requests.get(url)
    
    if response.status_code == 200:
        tar_path = os.path.join(save_dir, f'{arxiv_id}.tar.gz')
        
        with open(tar_path, 'wb') as file:
            file.write(response.content)
        print(f'Downloaded {arxiv_id}')
        
        # Extract .tex files from the downloaded tar.gz file
        extract_to = os.path.join(save_dir, arxiv_id)  # Create a subdirectory for the extracted files
        os.makedirs(extract_to, exist_ok=True)
        extract_tex_files(tar_path, extract_to)
        print(f'Extracted LaTeX sources for {arxiv_id}')

        # Delete the .tar.gz file
        os.remove(tar_path)
        print(f'Deleted archive {arxiv_id}.tar.gz')
    else:
        print(f'Failed to download {arxiv_id}. Status code: {response.status_code}')

Downloaded 2307.13614
Extracted LaTeX sources for 2307.13614
Deleted archive 2307.13614.tar.gz
Downloaded 2304.01673
Extracted LaTeX sources for 2304.01673
Deleted archive 2304.01673.tar.gz
Downloaded 2303.01548
Extracted LaTeX sources for 2303.01548
Deleted archive 2303.01548.tar.gz


### Upload file

In [139]:
#name_file = "1903.06239/Review.tex" # Palti's review
#name_file = "2008.10625/ls.tex" # Hebecker's review
#name_file = "2303.04819/Sections/Introduction.tex" # StringCosmo review. The sections are: Introduction, CosmoOverview, Moduli, Inflation, Postinflation, DarkEnergy, Alternatives, Outlook
#name_file = "2310.20559/ls.tex" # McAllister-Quevedo's review
name_file = "0701050/review.tex" # Denef-Douglas-Kachru's review

latex_file_path = 'Files/' + name_file

# Extract the LaTeX content
latex_content = extract_latex_content(latex_file_path, "\\startdocument", "\\enddocument")

len(latex_content)

76800

### Simplify the latex text using common replacements

In [140]:
replacements = {
        r'\\d': 'd',
        r'\\G': 'G',
        r'\\O': 'O',
        r'\\C': 'C',
        r'\\V': 'V',
        r'\\L': 'L',
        r'\\R': 'R',
        r'\\S': 'S',
        r'\\Mp': 'Mp',
        r'\\cN': 'N',
        r'\\cC': 'C',
        r'\\cP': 'P',
        r'\\cR': 'R',
        r'\\vo': 'V',
        r'\\Kahler': 'Kahler',
        r'\\flux': 'flux',
        r'\\KK': 'Kaluza-Klein',
        r'\\n': ' '
    }                                        # This can be improved adding more replacements

brace_commands = [r'\\emph', r'\\footnote', r'\\it', r'\\em', r'\\ref', r'\\eqref']               # This can be improved adding more replacements

simplified_text = replace_latex_commands(latex_content, replacements, brace_commands)
len(simplified_text)

76552

### Polish latex text removing standard latex commands

In [141]:
polished_text = clean_latex_content(simplified_text)         # This can be improved adding more commands
len(polished_text)

75561

### Split the text in chunks

In [143]:
no_char_in_chunk = 2000

chunks = chunk_text(polished_text, no_char_in_chunk)
chunks = [chunk.replace("%", "") for chunk in chunks]
print("The number of chunks is: " + str(len(chunks)) + ".")
chunks[23:25]

The number of chunks is: 45.


["The number of vacua $\\phi_*$ in an interval $I$ is given by \\begin{equation}\n\\label{ansatz}\nds^2 = g_{\\mu u} dx^{\\mu}dx^{ u} + R^2 \\tilde g_{mn}dy^m dy^n\n\\end{equation}6 where $\\theta(x):=1$ if $x>0$, $\\theta(x):=0$ if $x<0$. The integrand $delta(V') |V''|$ gives a contribution $+1$ for each critical point in $I$, while $\\theta(V'')$ restricts to actual minima. Now in the large $L$ limit, we can approximate the sum over $(N,M)$ by an integral, and write \\begin{equation}\n\\label{ansatz}\nds^2 = g_{\\mu u} dx^{\\mu}dx^{ u} + R^2 \\tilde g_{mn}dy^m dy^n\n\\end{equation}7 where $\\rho(\\phi)$ can be interpreted as a vacuum number density on moduli space. To evaluate the integral over $(N,M)$ at a given fixed $\\phi$, it is convenient to make the following linear change of variables $(N,M) \\to (v',v'')$: \\begin{equation}\n\\label{ansatz}\nds^2 = g_{\\mu u} dx^{\\mu}dx^{ u} + R^2 \\tilde g_{mn}dy^m dy^n\n\\end{equation}8 This change of variables has Jacobian $=1$, and the 

### Define the prompt to create the Q&A using the text from the papers

In [144]:
# Define the number of Q&A pairs you want to generate for each chunk
num_questions = 3

In [145]:
system_prompt_template = "You are an expert in theoretical physics. I will provide a text, and I'd like you to generate {num_questions} questions a researcher-level colleague might ask about the topic of the text. Then, provide detailed answers to these questions as an expeert in the field. Format your responses as follows: 'Question 1: [question here] Answer 1: [answer here]' and so on for all {num_questions} questions."

user_prompt_template = "Based on the following text, please generate the questions and answers as instructed. Text: {chunk}"

display(system_prompt_template)
display(user_prompt_template)


"You are an expert in theoretical physics. I will provide a text, and I'd like you to generate {num_questions} questions a researcher-level colleague might ask about the topic of the text. Then, provide detailed answers to these questions as an expeert in the field. Format your responses as follows: 'Question 1: [question here] Answer 1: [answer here]' and so on for all {num_questions} questions."

'Based on the following text, please generate the questions and answers as instructed. Text: {chunk}'

### Generate the Q&A using chat-gpt 3.5 turbo

In [146]:
client = OpenAI()

# Initialize a list to store chunk and response pairs
chunk_responses = []

# Assume chunks_test is a list of text chunks you want to generate Q&A for
for chunk in chunks:
    system_prompt, user_prompt = generate_prompt(chunk, num_questions, system_prompt_template, user_prompt_template)
    
    # Call the OpenAI API once per chunk
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[system_prompt, user_prompt]
    )
    
    # Append both the chunk and its response to the chunk_responses list
    chunk_responses.append({
        "chunk": chunk,
        "response": response.choices[-1].message.content if response.choices else "No response"
    })


In [147]:
# Initialize lists to hold the DataFrame data
chunk_list = []
question_list = []
answer_list = []
sub_index = []

# Process each chunk-response pair
for item in chunk_responses:
    chunk = item['chunk']
    response = item['response']
    
    # Split the response into Q&As
    qas = response.split('\n\n')  # Assuming two newlines separate each Q&A pair
    for i, qa in enumerate(qas):
        # Attempt to split the QA into question and answer
        parts = qa.split('\nAnswer ')
        if len(parts) == 2:
            question = parts[0].replace('Question {}: '.format(i+1), '').strip()
            answer = parts[1].strip()
            chunk_list.append(chunk)
            question_list.append(question)
            answer_list.append(answer)
            sub_index.append(i+1)

In [160]:
# Create a MultiIndex from the chunks and their corresponding sub_index
multi_index = pd.MultiIndex.from_arrays([chunk_list, sub_index], names=['Chunk', 'Q&A Index'])
answer_list = [re.sub(r'^\d+: ', '', answer) for answer in answer_list]

# Construct the DataFrame
df_multi = pd.DataFrame({'Question': question_list, 'Answer': answer_list}, index=multi_index)

# Display the DataFrame
#df_multi

### Visualize the Q&A ina  dataframe

In [149]:
qa_df = pd.DataFrame({
    'Question': question_list,
    'Answer': answer_list
})

qa_df = qa_df
qa_df

Unnamed: 0,Question,Answer
0,How did the second superstring revolution of 1994-97 address the doubts surrounding string theory being defined only as a perturbative expansion and the need for non-perturbative physics in constructing a completely realistic model?,"The second superstring revolution of 1994-97 addressed these doubts by convincingly arguing that all the different string theories and eleven-dimensional supergravity are actually limits or aspects of a unified framework called string/M theory. The central idea of duality emerged, demonstrating that the strong coupling limit of one string theory can be equivalent to another weakly coupled theory, leading to a more unified understanding of the subject."
1,"What were some of the key discoveries that led particle physicists to consider superstring theory as a viable contender for a ""theory of everything""?","Some key discoveries that propelled superstring theory's candidacy for a ""theory of everything"" included the development of supersymmetric versions of the theory, arguments for perturbative finiteness, discoveries of anomaly cancellations, and quasi-realistic compactifications of the heterotic string. These advancements collectively contributed to a broad consensus among physicists that superstring theory could potentially describe all of fundamental physics."
2,How did the concept of duality play a significant role in the development and acceptance of string/M theory as a unified framework encompassing various string theories and eleven-dimensional supergravity?,"The concept of duality was crucial in the evolution of string theory towards the unified framework of string/M theory. Duality showed that seemingly distinct string theories were actually different descriptions of the same underlying physics. For example, it revealed that the strongly coupled regime of one theory could be equivalent to the weakly coupled regime of another theory. This unification through duality helped reconcile the different versions of string theory and eleven-dimensional supergravity into a singular comprehensive framework, string/M theory, contributing to a deeper understanding of the subject."
3,What is the central idea of duality in the context of string/M theory as mentioned in the text?,"The central idea of duality in string/M theory refers to the concept that the strong coupling limit of one string theory can be equivalent to another theory that is weakly coupled, including M theory. This concept reveals that seemingly distinct theories are actually different limits or aspects of one unified framework."
4,"How did the ""second superstring revolution"" from 1994-97 contribute to theoretical physics as mentioned in the text?","The ""second superstring revolution"" during 1994-97 played a crucial role in theoretical physics by providing nonperturbative definitions for string/M theory and offering exact solutions for the effective Lagrangians of various supersymmetric field theories. It introduced the idea of duality, which established connections between different string theories and led to a unified framework known as string/M theory."
...,...,...
94,How does the text relate the search for inflationary measures in theoretical physics to analogous problems in particle physics?,"The text draws an analogy between the search for inflationary measures in theoretical physics and the hypothetical scenario of developing particle physics based on data at energies far below the natural scales of the Standard Model, suggesting that advances using precision information and very powerful supercomputers could potentially lead to inferences about the validity of certain theories even at energy scales well beyond what is experimentally accessible."
95,"In the context of the text, what role do asymptotically free gauge theories coupled to gravity play in advancing theoretical physics and potentially leading to the Standard Model?","The text suggests that asymptotically free gauge theories coupled to gravity are identified as sensible candidates for explaining certain aspects of theoretical physics, with the hypothetical scenario indicating that with enough theoretical advances and computational power, it may be possible to develop models resembling the Standard Model, at least for the first generation of matter, by reproducing experimental observations such as the table of isotopes and radioactivity."
96,"What challenges are encountered when considering different field theories that might lead to similar low energy physics, and how can researchers navigate through this uncertainty?","The challenges include the difficulty of predicting how many other field theories could result in similar low energy physics, leading to a period of great uncertainty where multiple competing ideas seem valid. Researchers must navigate through this uncertainty by primarily focusing on theoretical consistency and the elegance of the proposed framework."
97,"What experimental approaches could be used to test proposals arising from field theories, particularly those related to subtle corrections and higher-energy phenomena in astrophysics?","Experimental testing of proposals from field theories could involve making precise measurements of observables like atomic masses, spectra, and radioactive decay rates to detect subtle corrections compared to Standard Model predictions. Additionally, looking for high-energy data in astrophysical phenomena, such as cosmic ray events, could provide crucial insights into the validity of these proposals."


### Save the dataframe in a csv and parquet format

In [152]:
qa_df.to_csv("Datasets/0701050.csv")
qa_df.to_parquet("Datasets/0701050.parquet")

### Upload csv datasets for further processing

In [153]:
df_test = pd.read_csv("Datasets/0701050.csv", index_col=0)
df_test

Unnamed: 0,Question,Answer
0,How did the second superstring revolution of 1994-97 address the doubts surrounding string theory being defined only as a perturbative expansion and the need for non-perturbative physics in constructing a completely realistic model?,"The second superstring revolution of 1994-97 addressed these doubts by convincingly arguing that all the different string theories and eleven-dimensional supergravity are actually limits or aspects of a unified framework called string/M theory. The central idea of duality emerged, demonstrating that the strong coupling limit of one string theory can be equivalent to another weakly coupled theory, leading to a more unified understanding of the subject."
1,"What were some of the key discoveries that led particle physicists to consider superstring theory as a viable contender for a ""theory of everything""?","Some key discoveries that propelled superstring theory's candidacy for a ""theory of everything"" included the development of supersymmetric versions of the theory, arguments for perturbative finiteness, discoveries of anomaly cancellations, and quasi-realistic compactifications of the heterotic string. These advancements collectively contributed to a broad consensus among physicists that superstring theory could potentially describe all of fundamental physics."
2,How did the concept of duality play a significant role in the development and acceptance of string/M theory as a unified framework encompassing various string theories and eleven-dimensional supergravity?,"The concept of duality was crucial in the evolution of string theory towards the unified framework of string/M theory. Duality showed that seemingly distinct string theories were actually different descriptions of the same underlying physics. For example, it revealed that the strongly coupled regime of one theory could be equivalent to the weakly coupled regime of another theory. This unification through duality helped reconcile the different versions of string theory and eleven-dimensional supergravity into a singular comprehensive framework, string/M theory, contributing to a deeper understanding of the subject."
3,What is the central idea of duality in the context of string/M theory as mentioned in the text?,"The central idea of duality in string/M theory refers to the concept that the strong coupling limit of one string theory can be equivalent to another theory that is weakly coupled, including M theory. This concept reveals that seemingly distinct theories are actually different limits or aspects of one unified framework."
4,"How did the ""second superstring revolution"" from 1994-97 contribute to theoretical physics as mentioned in the text?","The ""second superstring revolution"" during 1994-97 played a crucial role in theoretical physics by providing nonperturbative definitions for string/M theory and offering exact solutions for the effective Lagrangians of various supersymmetric field theories. It introduced the idea of duality, which established connections between different string theories and led to a unified framework known as string/M theory."
...,...,...
94,How does the text relate the search for inflationary measures in theoretical physics to analogous problems in particle physics?,"The text draws an analogy between the search for inflationary measures in theoretical physics and the hypothetical scenario of developing particle physics based on data at energies far below the natural scales of the Standard Model, suggesting that advances using precision information and very powerful supercomputers could potentially lead to inferences about the validity of certain theories even at energy scales well beyond what is experimentally accessible."
95,"In the context of the text, what role do asymptotically free gauge theories coupled to gravity play in advancing theoretical physics and potentially leading to the Standard Model?","The text suggests that asymptotically free gauge theories coupled to gravity are identified as sensible candidates for explaining certain aspects of theoretical physics, with the hypothetical scenario indicating that with enough theoretical advances and computational power, it may be possible to develop models resembling the Standard Model, at least for the first generation of matter, by reproducing experimental observations such as the table of isotopes and radioactivity."
96,"What challenges are encountered when considering different field theories that might lead to similar low energy physics, and how can researchers navigate through this uncertainty?","The challenges include the difficulty of predicting how many other field theories could result in similar low energy physics, leading to a period of great uncertainty where multiple competing ideas seem valid. Researchers must navigate through this uncertainty by primarily focusing on theoretical consistency and the elegance of the proposed framework."
97,"What experimental approaches could be used to test proposals arising from field theories, particularly those related to subtle corrections and higher-energy phenomena in astrophysics?","Experimental testing of proposals from field theories could involve making precise measurements of observables like atomic masses, spectra, and radioactive decay rates to detect subtle corrections compared to Standard Model predictions. Additionally, looking for high-energy data in astrophysical phenomena, such as cosmic ray events, could provide crucial insights into the validity of these proposals."


### Concatenate all datasets from various papers to create a single dataset about string phenomenology (and save it)

In [161]:
from sklearn.utils import shuffle

directory_path = 'Datasets/'

# Initialize a list to hold the dataframes
dataframes = []

# Iterate through all files in the directory
for filename in os.listdir(directory_path):
    if filename.endswith('.csv'):
        # Load each CSV file into a DataFrame, excluding the index
        df_temp = pd.read_csv(os.path.join(directory_path, filename), index_col=0)
        # Append the DataFrame to the list
        dataframes.append(df_temp)

# Concatenate all the DataFrames into a single DataFrame
concatenated_df = pd.concat(dataframes, ignore_index=True)

# Shuffle the rows of the concatenated DataFrame
shuffled_df = shuffle(concatenated_df)

# Reset the index to get rid of the old indexing
shuffled_df.reset_index(drop=True, inplace=True)

shuffled_df

Unnamed: 0,Question,Answer
0,"Can dS vacua be obtained in the IIB setting, and what are the methods or approaches suggested to achieve this?","Yes, it is possible to obtain de Sitter (dS) vacua in the IIB setting by incorporating additional effects from the low energy effective action or by adopting a more generalized approach to seeking minima of the effective action. Various proposals have been put forward to construct dS vacua. These proposals can be found in recent research that summarizes the state of the art in dS constructions and highlights the challenges involved. Additional strategies include incorporating anti-branes, which were originally proposed as part of the KKLT construction."
1,"What role do the slow-roll conditions play in multi-field inflation, and how do they relate to the decomposition of the inflationary trajectory into tangent and normal directions?","In multi-field inflation, the slow-roll conditions are essential for ensuring the smooth evolution of the scalar fields and the inflationary dynamics. These conditions constrain the time derivatives of the scalar fields and their gradient with respect to the potential. By utilizing a kinematic basis, one can decompose the inflationary trajectory into adiabatic (tangent) and entropic (normal) directions, which simplifies the analysis and understanding of the inflationary dynamics."
2,"How does the distribution of flux values in the model impact the number of solutions, and how does the choice of parameters affect the range of possible solutions?","The distribution of flux values in the model plays a crucial role in determining the number of solutions. The regularity in the distribution of flux values affects the intervals in which the cosmological constant falls. By making the flux values sufficiently different, one can ensure a more realistic situation. The choice of parameters, such as the size of the flux couplings and the range of the cosmological constant, influences the diversity and quantity of solutions in the model."
3,How does the inclusion of Kaluza-Klein (KK) modes within the $R^d$-dimensional theory relate to the universal quantum gravity bound and the effective theory's scale?,"Including the KK modes within the $R^d$-dimensional theory allows for a quantitative analysis of how many modes can be accommodated before reaching the universal quantum gravity bound, which is the $R^d$-dimensional Planck mass. This counting of KK modes provides insights into the effective description of higher-dimensional theories and helps ascertain the consistency and limitations of the effective field theory approach within the framework of string theory."
4,What are some of the key criticisms or challenges faced by the swampland approach and the Planckian censorship conjecture in particular?,"Some criticisms and challenges faced by the swampland approach, including the Planckian censorship conjecture, involve the speculative nature of these concepts and the lack of a clear explanatory framework for certain restrictions they propose. Critics have raised questions about the relevance of constraints on EFTs imposed by such conjectures and the need for further empirical and theoretical justifications to validate their implications. Additionally, the swampland approach has been perceived as less motivated compared to other conjectures in the field, leading to debates about its significance and impact on advancing our understanding of quantum gravity and cosmology."
...,...,...
5197,What are bifurcated throats and how do they address backreaction effects in the model mentioned in the text?,"Bifurcated throats are a theoretical construct used in string theory and braneworld scenarios where the geometry of spacetime bifurcates into separate regions resembling 'throats'. These throats can be used to address backreaction effects by providing a way to localize different types of matter fields or interactions in distinct regions, thereby mitigating the impact of backreactions."
5198,"In the context of string inflationary scenarios, what distinguishes the classes of models focused on in the text from the broader set of string-inspired models present in the literature?","The classes of models focused on in the text represent the most developed string inflationary scenarios. These models have concrete proposals within the framework of string theory and have been studied extensively in terms of their theoretical implications and observational predictions. On the other hand, the broader set of string-inspired models in the literature includes proposals that are based on ideas from string theory but may lack a solid stringy embedding or detailed mechanisms for moduli stabilization. While these broader models contribute to the diversity of ideas in the field, they may not have the same level of theoretical development and predictive power as the more focused classes of string inflation models discussed in the text."
5199,What is the significance of the DBI inflation scenario and its non-canonical kinetic terms in the context of theoretical physics?,The DBI inflation scenario is significant in theoretical physics as it presents a framework beyond the slow-roll approximation for achieving accelerated expansion. Non-canonical kinetic terms arising from the DBI action introduce new dynamics that can lead to unique inflationary phenomenology and contribute to the understanding of early universe dynamics.
5200,"What is the significance of the equation of state of dark energy changing with time, and how does it relate to inflationary cosmology?","The changing equation of state of dark energy during quintessence is significant as it can lead to an accelerated expansion of the universe. This behavior is similar to inflationary cosmology, where the equation of state of the inflaton field drives a rapid expansion in the early universe. The evolving equation of state during quintessence can lead to different behaviors of the dark energy field over time, with potential implications for the ultimate fate of the universe."


In [159]:
shuffled_df.to_csv("Datasets/StringPhenoDataset.csv")
shuffled_df.to_parquet("Datasets/StringPhenoDataset.parquet")