In [1]:
import openai
from langchain.document_loaders import PyPDFLoader
from langchain import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.chains.summarize import load_summarize_chain
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.schema import HumanMessage
from langchain import PromptTemplate

import numpy as np
from sklearn.cluster import KMeans

In [2]:
# Read every page of the downloaded PDF and stitch the page texts together
# (pages are joined with no separator between them).
loader = PyPDFLoader("../downloaded-paper.pdf")
pages = loader.load()
text = "".join(page.page_content for page in pages)

# Report how many tab characters were extracted, then turn them into spaces.
tabs = text.count('\t')
print(f"Tabs in text: {tabs}")
text = text.replace('\t', ' ')
len(text)

Tabs in text: 0


53514

In [3]:
# Take the OpenAI key from the environment instead of writing it into the
# notebook, so the secret is never saved or committed with the file.
# An unset variable yields '' (the value this cell previously hardcoded).
import os

openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = openai_api_key

# Completion-style LLM wrapper; temperature=0 is passed for every call.
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

In [4]:
# Count the newline characters in the extracted text.
text.count('\n')

947

In [5]:
# Preview the first 1000 characters of the extracted text.
print(text[:1000])

SEEING IS BELIEVING : B RAIN -INSPIRED MODULAR
TRAINING FOR MECHANISTIC INTERPRETABILITY
Ziming Liu, Eric Gan & Max Tegmark
Department of Physics, Institute for AI and Fundamental Interactions, MIT
{zmliu,ejgan,tegmark}@mit.edu
ABSTRACT
We introduce Brain-Inspired Modular Training (BIMT), a method for making neural
networks more modular and interpretable. Inspired by brains, BIMT embeds neu-
rons in a geometric space and augments the loss function with a cost proportional
to the length of each neuron connection. We demonstrate that BIMT discovers
useful modular neural networks for many simple tasks, revealing compositional
structures in symbolic formulas, interpretable decision boundaries and features
for classification, and mathematical structure in algorithmic datasets. The ability
to directly seemodules with the naked eye can complement current mechanistic
interpretability strategies such as probes, interventions or staring at all weights.
1 I NTRODUCTION
Although deep neural networ

In [6]:
# Token count of the whole text, as reported by the LLM wrapper.
num_tokens = llm.get_num_tokens(text)
print(num_tokens)

15022


In [14]:
# Break the text into overlapping chunks, trying the separators in the
# order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "\t"],
    chunk_size=2000,
    chunk_overlap=200,
)

# Wrap the single text string in a list and split it into documents.
docs = text_splitter.create_documents([text])
# print(docs[0].page_content)
print(len(docs))

30


In [None]:
# Embedding model used to embed the text chunks.
embeddings: HuggingFaceEmbeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Embed the text of every chunk and stack the results into one array
# with one row per chunk.
chunk_texts = [chunk.page_content for chunk in docs]
vectors = np.array(embeddings.embed_documents(chunk_texts))
print(len(vectors))

In [None]:
# Number of groups to partition the chunk embeddings into.
num_clusters = 20

# Build the K-means model with a fixed seed, then fit it on the embeddings.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(vectors)

In [None]:
# Show the cluster label K-means assigned to each row of `vectors`.
kmeans.labels_

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


def generate_tsne_graph(vectors, labels=None):
    """Project embeddings to two dimensions with t-SNE and scatter-plot them.

    Args:
        vectors: Array-like of embeddings, one row per item.
        labels: Optional per-row labels used to colour the points. When
            omitted, the labels of the notebook-level ``kmeans`` model are
            used, which matches the previous behaviour of this function.

    Raises:
        ValueError: If fewer than two embeddings are supplied.
    """
    n_samples = len(vectors)
    if n_samples < 2:
        raise ValueError("t-SNE needs at least two embeddings to plot")
    if labels is None:
        labels = kmeans.labels_

    # Recent scikit-learn releases reject a perplexity that is not strictly
    # smaller than the number of samples. The library default is 30, which
    # fails for small inputs such as 30 chunks, so clamp it below n_samples.
    perplexity = min(30.0, float(n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    reduced_data_tsne = tsne.fit_transform(vectors)

    # Plot the reduced data, coloured by label.
    plt.scatter(reduced_data_tsne[:, 0], reduced_data_tsne[:, 1], c=labels)
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.title('Book Embeddings Clustered')
    plt.show()

print(vectors.shape)

# Pass the labels explicitly so the plot does not depend on a hidden global.
generate_tsne_graph(vectors, labels=kmeans.labels_)

In [None]:
# Print the text of the first chunk.
print(docs[0].page_content)

In [None]:
# Shape of the first cluster centre.
kmeans.cluster_centers_[0].shape

In [None]:
from prompts import ARXIV_MD_SUMMARIZATION_PROMPT

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.schema import HumanMessage
# The two chat models are configured identically apart from the model name.
_chat_model_settings = dict(
    temperature=0,
    openai_api_key=openai_api_key,
    max_tokens=1000,
)

llm3 = ChatOpenAI(model='gpt-3.5-turbo', **_chat_model_settings)
llm4 = ChatOpenAI(model='gpt-4', **_chat_model_settings)

# map_chain = load_summarize_chain(llm=llm3,
#                              chain_type="stuff",
#                              prompt=ARXIV_MD_SUMMARIZATION_PROMPT)


# map_chain = ConversationChain(llm=llm, verbose=True, prompt=ARXIV_MD_SUMMARIZATION_PROMPT)

In [18]:


# Per-section markdown summarization prompt. It expects both {language} and
# {text}. It used to be assigned to ARXIV_MD_SUMMARIZATION_PROMPT_TEMPLATE and
# was immediately overwritten by the template below, so it now has its own
# name and stays available. It is not used by the PromptTemplate built below.
ARXIV_MD_SECTION_SUMMARY_PROMPT_TEMPLATE = """
    You are an expert summarizer. You are given a portion of a paper and you have to summarize in {language} it in a concise way.
    Concise means one paragraph each.
    
    Proceed this way:
    a) First identify the abstract, sections and subsections in the chunk of the text provided delimited below by triple =. Ignore reference and acknowledgements sections.
    b) Per each section, including also sections and subsections in the appendices and/or supplementary material:
        b.1) Create a markdown document for the section following these rules:
            b.1.1) Heading level 1 should be the title of the paper.
            b.1.2) Heading level 2 should be the title of abstract or each section.
            b.1.3) Heading level 3 should be the summary of each sub-section.
            b.1.4) The summary of each section or subsection should be in regular text.
            b.1.5) The summary of each section should be followed by a horizontal rule.
        b.2) The markdown document for the section will include the title of the abstract, section or subsection, and in a paragraph then the summary of it. Do not include in 
        the summary the legend of the figures.
    c) Aggregate, preserving the order, the markdown documents of each section/subsection into a single markdown document.
    
    You will return a JSON object with two properties. 
    The first property will be named 'partial_summarization' will contain the aggregated markdown document string. 
    The second property will be named 'remaining_text' and will contain the title and text of the last section if the title
    or the text of that section/subsection was cut in the middle. Otherwise, 'remaining_text' should return 'N/A'.

    ==={text}===
    
    The JSON object:
"""


# Active template: asks the model to return the paper's sections as a JSON
# object. Its only placeholder is {text}.
ARXIV_MD_SUMMARIZATION_PROMPT_TEMPLATE = """
Please identify and extract all distinct sections from the provided technical paper, including the title, abstract, introduction, 
methodology, results, discussion, conclusion, and any other relevant parts. Return the extracted sections as a JSON object, with each
section as a key-value pair, where the key is the section name and the value is the corresponding text. The text is as shown below 
surrounded by triple backticks.
   
```{text}```
    
The JSON object:
"""


# Note: this rebinds ARXIV_MD_SUMMARIZATION_PROMPT, which an earlier cell also
# imports from the `prompts` module.
ARXIV_MD_SUMMARIZATION_PROMPT = PromptTemplate(
    template=ARXIV_MD_SUMMARIZATION_PROMPT_TEMPLATE, input_variables=["text"]
)

# Only the (currently disabled) completion call below needs these. They are
# imported once here rather than on every pass through the loop.
import json

from utils import get_gai_completion

# A batch is closed as soon as its running token count reaches this value.
TOKEN_BUDGET = 2000


def iter_token_batches(chunks, count_tokens, token_budget=TOKEN_BUDGET):
    """Group consecutive text chunks into contexts of roughly ``token_budget`` tokens.

    Chunks are appended in order. A batch is emitted once its running token
    count reaches ``token_budget``, so a batch may overshoot the budget by up
    to one chunk. Every chunk lands in exactly one batch, and a trailing
    partial batch is emitted at the end. The previous loop dropped the chunk
    that triggered a flush and never emitted the final partial batch.

    Args:
        chunks: Iterable of text strings.
        count_tokens: Callable returning the token count of a string.
        token_budget: Token count at which a batch is closed.

    Yields:
        ``(context, token_count)`` tuples, where ``context`` is the
        concatenation of the chunks in the batch.
    """
    context_parts = []
    context_tokens = 0
    for chunk in chunks:
        context_parts.append(chunk)
        context_tokens += count_tokens(chunk)
        if context_tokens >= token_budget:
            yield "".join(context_parts), context_tokens
            context_parts = []
            context_tokens = 0
    if context_parts:
        yield "".join(context_parts), context_tokens


prompt = ARXIV_MD_SUMMARIZATION_PROMPT

for current_context, current_tokens in iter_token_batches(
    (doc.page_content for doc in docs), llm.get_num_tokens
):
    print(f"Calling ChatGPT3 with {current_tokens} tokens")
    # args = {"input_documents": current_context, "language": "english"}

    formatted_prompt = prompt.format(text=current_context)

    print(formatted_prompt)
    # completion = get_gai_completion(formatted_prompt)
    # print(completion)
    # outputs = json.loads(completion)
    # print(outputs)

    # print(outputs['partial_summarization'])
    # print(outputs['remaining_text'])

    # # hm = HumanMessage(content=formatted_prompt)
    # # chunk_summary = llm3([hm])

    # print("Summary " * 10)
    # print(chunk_summary)
    # print(chunk_summary.content)

    # Only the first batch is previewed while the completion call is disabled.
    # Remove this break to process every batch.
    break


    # print(llm.get_num_tokens(doc.page_content))

Calling ChatGPT3 with 2019 tokens

Please identify and extract all distinct sections from the provided technical paper, including the title, abstract, introduction, 
methodology, results, discussion, conclusion, and any other relevant parts. Return the extracted sections as a JSON object, with each
section as a key-value pair, where the key is the section name and the value is the corresponding text. The text is as shown below 
surrounded by triple batticks.
   
    ```SEEING IS BELIEVING : B RAIN -INSPIRED MODULAR
TRAINING FOR MECHANISTIC INTERPRETABILITY
Ziming Liu, Eric Gan & Max Tegmark
Department of Physics, Institute for AI and Fundamental Interactions, MIT
{zmliu,ejgan,tegmark}@mit.edu
ABSTRACT
We introduce Brain-Inspired Modular Training (BIMT), a method for making neural
networks more modular and interpretable. Inspired by brains, BIMT embeds neu-
rons in a geometric space and augments the loss function with a cost proportional
to the length of each neuron connection. We demon

In [None]:
for i in range(num_clusters):

    # Distance from every chunk embedding to the centre of cluster i.
    # (This previously always used centre 1, regardless of i.)
    distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)

    # Take the 10 nearest chunks, then put their indexes in ascending order
    # so the selected text is joined in document order.
    closest_indexes = np.sort(np.argsort(distances)[:10])
    print(closest_indexes)
    # selected_indices = np.take(vectors, closest_indexes, axis=0)
    # print(selected_indices)
    selected_docs = [docs[idx].page_content for idx in closest_indexes]
    section = "\n".join(selected_docs)
    print(section)

    # Only the first cluster is inspected for now; remove this break to
    # print every cluster.
    break
    