In [1]:
import concurrent.futures
from dotenv import load_dotenv

from pylatexenc.latex2text import LatexNodes2Text

from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer
import openai
import os
import re

# Imports from langchain (Priyanshu)
#from langchain_community.document_loaders import DirectoryLoader
#from langchain_text_splitters import RecursiveCharacterTextSplitter

# Lang
from langchain.schema import Document
from langchain_together.embeddings import TogetherEmbeddings


from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate


  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



## Environmental Variables

In [2]:
# Load environment variables from .env file
load_dotenv()

togetherai_api_key = os.getenv('TOGETHERAI_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
#togetherai_api_key

In [4]:
#openai_api_key

## Load Data

In [9]:
tex_file = '../resources/tex_examples/FOMCacl2023.tex'

def read_latex_doc(tex_file):
    """
    input: insert .tex document path
    
    return: chunks
    """

    with open(tex_file, 'r') as file:
        tex_content = file.read()
    
    # Define patterns for sections, subsections, paragraphs, and labels
    #pattern = re.compile(r'\\(begin|section|subsection|paragraph)\*?{[^}]*}')
    
    # Split using lookahead to include the split point in the result
    #chunks = re.split(r'(?=\\(begin|section|subsection|paragraph)\*?{[^}]*})', tex_content)

    # Define patterns for sections, subsections, paragraphs, and labels
    pattern = re.compile(r'\\(section)\*?{[^}]*}')
    
    # Split using lookahead to include the split point in the result
    chunks = re.split(r'(?=\\(section)\*?{[^}]*})', tex_content)
    
    combined_chunks = []
    current_chunk = ""
    for i in range(0, len(chunks), 2):
        if i + 1 < len(chunks):
            if pattern.match(chunks[i]):
                if current_chunk:
                    combined_chunks.append(current_chunk)
                current_chunk = chunks[i] + chunks[i + 1]
            else:
                current_chunk += chunks[i] + chunks[i + 1]
        else:
            current_chunk += chunks[i]
    if current_chunk:
        combined_chunks.append(current_chunk)
    

    return(combined_chunks)

def parse_pdf(file_path):
    """
    Rutwik parse pdf
    """
    
    try:
        text = ""
        links = []

        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
                for annot in page.annots:
                    if 'uri' in annot:
                        links.append(annot['uri'])

        return {
            "text": text,
            "links": links
        }
    except Exception as e:
        return {"error": str(e)}

def convert_latex_chunks(list_chunks):
    """
    input: list of latex chunks

    return: list of plain text chunks
    """
    
    # Convert LaTeX to plain text
    latex_converter = LatexNodes2Text()
    plain_chunks = [latex_converter.latex_to_text(chunk) for chunk in chunks]
    
    return plain_chunks

In [10]:
# Initialize the sentence transformer model for retrieval
#retriever = SentenceTransformer('all-MiniLM-L6-v2')

# Load LLaMA 3 model and tokenizer
#llama_model = AutoModelForCausalLM.from_pretrained('path_to_llama_model')
#llama_tokenizer = AutoTokenizer.from_pretrained('path_to_llama_tokenizer')

## Analyze Chunks

In [11]:
list_chunks = read_latex_doc(tex_file)

In [12]:
len(list_chunks)

13

In [17]:
list_chunks[3]

'\\section{Dataset}\n\\subsection{FOMC Data}\n\nThe datasets we build are composed of three different types of data: meeting minutes, press conference transcripts, and speeches from the FOMC.\nMeeting minutes are defined as reports derived from the eight annually scheduled meetings of the FOMC. Press conference transcripts, meanwhile, are transcripts of the prepared remarks, followed by the Q\\&A session between the Federal Reserve chair and press reporters. Lastly, speeches were defined as any talk given by a Federal Reserve official. We limit our datasets to an end release date of October 15th, 2022, and attempt to collect as far back as possible for each category prior to this date.\n\nThe meeting minutes and speeches spanned from a release period of January 1st, 1996 to October 15th, 2022. Press conferences are a more recent phenomenon and the data aggregated stretched from April 27th, 2011 to October 15th, 2022. We obtained the data by leveraging BeautifulSoup, Selenium, and manua

In [13]:
list_chunks[7]

"\\section*{Limitations}\nIn this article, we focus only on meeting minutes, speech, and press conference data. Many other text datasets such as transcripts from congressional and senate testimonies, beige books, green books, etc can be incorporated to understand pre-FOMC drift better. We don't use audio or video features in constructing the measure, which might contain additional information. It can be an interesting future study to compare measures generated from FOMC text with an alternate measure that can be constructed from the news or social media data. In dataset construction, while splitting sentences, we use a simple rule-based approach. We leave it as an open problem for future researchers to find better methods for splitting sentences with opposite tones. \n\nIn our trading strategy construction, we do not include transaction fees as it involves low-frequency trading. In the future, one can use our model and data to construct a high-frequency trading strategy as well. In add

## LLM Prompting

In [16]:
def ask_llm(prompt):
    """
    Rutwik ask llm
    """
    prompt_json = [{'role': 'user', 'content': prompt}]

    model_source = 'meta-llama'
    model_name = 'Llama-3-70b-chat-hf'
    model_str = f'{model_source}/{model_name}'

    chat_completion = client_together.chat.completions.create(
        model=model_str,
        messages=prompt_json,
        temperature=0,
        max_tokens=512
    )

    return chat_completion.choices[0].message.content

def prompt_discuss_limitations(parsedText):
    """
    Prompt discuss limitations
    """
    prompt = f'''
    You are an assistant to a researcher who intends to submit their research paper to the ACL Conference. To avoid desk rejection, the researcher wants to ensure their paper meets the benchmarks set by the Responsible NLP Research Checklist.

    Your task is to analyze the provided research paper and answer the following question from the checklist: "Did you discuss the limitations of your work?"

    If the answer is YES, provide the section number. If the answer is NO, provide a justification.

    Research Paper: ```{parsedText}```

    Your response should be in JSON format with the keys "answer" (YES/NO) and "justification" (if YES, the section number; if NO, the justification).
    '''
    return ask_llm(prompt)

def prompt_discuss_potential_risks(parsedText):
    """
    Prompt discuss potential risks
    """
    
    prompt = f'''
    You are an assistant to a researcher who intends to submit their research paper to the ACL Conference. To avoid desk rejection, the researcher wants to ensure their paper meets the benchmarks set by the Responsible NLP Research Checklist.

    Your task is to analyze the provided research paper and answer the following question from the checklist: "Did you discuss any potential risks of your work?"

    If the answer is YES, provide the section number. If the answer is NO, provide a justification.

    Research Paper: ```{parsedText}```

    Your response should be in JSON format with the keys "answer" (YES/NO) and "justification" (if YES, the section number; if NO, the justification).
    '''
    return ask_llm(prompt)

def prompt_summarize_claims(parsedText):
    """
    Prompt summarize claims
    """
    prompt = f'''
    You are an assistant to a researcher who intends to submit their research paper to the ACL Conference. To avoid desk rejection, the researcher wants to ensure their paper meets the benchmarks set by the Responsible NLP Research Checklist.

    Your task is to analyze the provided research paper and answer the following question from the checklist: "Do the abstract and introduction summarize the paper’s main claims?"

    If the answer is YES, provide the section number. If the answer is NO, provide a justification.

    Research Paper: ```{parsedText}```

    Your response should be in JSON format with the keys "answer" (YES/NO) and "justification" (if YES, the section number; if NO, the justification).
    '''
    return ask_llm(prompt)

def prompt_cite_creators(parsedText):
    """
    Prompt cite creators
    """
    prompt = f'''
    You are an assistant to a researcher who intends to submit their research paper to the ACL Conference. To avoid desk rejection, the researcher wants to ensure their paper meets the benchmarks set by the Responsible NLP Research Checklist.

    Your task is to analyze the provided research paper and answer the following question from the checklist: "Did you cite the creators of artifacts you used?"

    If the answer is YES, provide the section number. If the answer is NO, provide a justification.

    Research Paper: ```{parsedText}```

    Your response should be in JSON format with the keys "answer" (YES/NO) and "justification" (if YES, the section number; if NO, the justification).
    '''
    return ask_llm(prompt)

def prompt_discuss_license(parsedText):
    """
    Prompt discuss license
    """
    prompt = f'''
    You are an assistant to a researcher who intends to submit their research paper to the ACL Conference. To avoid desk rejection, the researcher wants to ensure their paper meets the benchmarks set by the Responsible NLP Research Checklist.

    Your task is to analyze the provided research paper and answer the following question from the checklist: "Did you discuss the license or terms for use and/or distribution of any artifacts?"

    If the answer is YES, provide the section number. If the answer is NO, provide a justification.

    Research Paper: ```{parsedText}```

    Your response should be in JSON format with the keys "answer" (YES/NO) and "justification" (if YES, the section number; if NO, the justification).
    '''
    return ask_llm(prompt)