# NLTK Sentence Tokenizer

In [2]:
from PyPDF2 import PdfReader
import nltk
# Fetch the 'punkt' NLTK data package (presumably what nltk.sent_tokenize relies on further down — confirm for the installed NLTK version)
nltk.download('punkt')
# Path of the PDF to read; being relative, it is resolved against the notebook's working directory
file_path="img_decoding.pdf"

# Extracting Text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF as one string.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        The per-page results of ``page.extract_text()``, in page order,
        joined with a single space between consecutive pages.
    """
    page_texts = []
    with open(file_path, 'rb') as handle:
        reader = PdfReader(handle)
        # Pull the text while the file is still open: the reader works on the open handle.
        for page in reader.pages:
            page_texts.append(page.extract_text())
    return " ".join(page_texts)

# Extract the raw text from the PDF; sentence splitting is done in later cells
text = extract_text_from_pdf(file_path)

[nltk_data] Downloading package punkt to /Users/fifi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Take a fixed character window of the extracted text to use as a small sample.
# The offsets 1015 and 3037 are hard-coded, so the window starts and ends mid-word
# (see the printed output) and will cover different content for any other PDF.
sample = text[1015:3037]
print(sample)

mbeddings obtained from the image, ii) an MEG
module trained end-to-end and iii) a pretrained image generator. Our results are
threefold: Firstly, our MEG decoder shows a 7X improvement of image-retrieval
over classic linear decoders. Second, late brain responses to images are best de-
coded with DINOv2, a recent foundational image model. Third, image retrievals
and generations both suggest that MEG signals primarily contain high-level visual
features, whereas the same approach applied to 7T fMRI also recovers low-level
features. Overall, these results provide an important step towards the decoding
– in real time – of the visual processes continuously unfolding within the human
brain.
1 I NTRODUCTION
Automating the discovery of brain representations. Understanding how the human brain rep-
resents the world is arguably one of the most profound scientific challenges. This quest, which
originally consisted of searching, one by one, for the specific features that trigger each neuron, ( e.g

In [4]:
def split_text_into_sentences(text):
    """Split ``text`` into sentences with NLTK.

    Args:
        text: The string to segment.

    Returns:
        Whatever ``nltk.sent_tokenize(text)`` returns (a list of sentence strings).
    """
    return nltk.sent_tokenize(text)

# Tokenize the whole extracted document; the bare name on the last line makes
# the notebook display the full list of sentence strings.
sentences = split_text_into_sentences(text)
sentences

['BRAIN DECODING :TOWARD REAL -TIME\nRECONSTRUCTION OF VISUAL PERCEPTION\nYohann Benchetrit1,∗, Hubert Banville1,∗, Jean-R ´emi King1,2\n1FAIR, Meta,2Laboratoire des Syst `emes Perceptifs, ´Ecole Normale Sup ´erieure, PSL University\n{ybenchetrit,hubertjb,jeanremi }@meta.com\nABSTRACT\nIn the past five years, the use of generative and foundational AI systems has\ngreatly improved the decoding of brain activity.',
 'Visual perception, in particular,\ncan now be decoded from functional Magnetic Resonance Imaging (fMRI) with\nremarkable fidelity.',
 'This neuroimaging technique, however, suffers from a lim-\nited temporal resolution ( ≈0.5 Hz) and thus fundamentally constrains its real-time\nusage.',
 'Here, we propose an alternative approach based on magnetoencephalog-\nraphy (MEG), a neuroimaging device capable of measuring brain activity with\nhigh temporal resolution ( ≈5,000 Hz).',
 'For this, we develop an MEG decoding\nmodel trained with both contrastive and regression objectives a

# spaCy Sentence Splitter

In [5]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline and run it over the whole extracted document
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)
# NOTE: this rebinds `sentences`, replacing the list built by the NLTK cell
# with a list of the items yielded by `doc.sents` (spaCy spans, not plain strings).
sentences = list(doc.sents)
sentences

[BRAIN DECODING :TOWARD REAL -TIME
 RECONSTRUCTION OF VISUAL PERCEPTION
 Yohann Benchetrit1,∗, Hubert Banville1,∗, Jean-R ´emi King1,2
 1FAIR, Meta,2Laboratoire des Syst `emes Perceptifs, ´Ecole Normale Sup ´erieure, PSL University
 {ybenchetrit,hubertjb,jeanremi }@meta.com
 ABSTRACT
 In the past five years, the use of generative and foundational AI systems has
 greatly improved the decoding of brain activity.,
 Visual perception, in particular,
 can now be decoded from functional Magnetic Resonance Imaging (fMRI) with
 remarkable fidelity.,
 This neuroimaging technique, however, suffers from a lim-
 ited temporal resolution ( ≈0.5 Hz) and thus fundamentally constrains its real-time
 usage.,
 Here, we propose an alternative approach based on magnetoencephalog-
 raphy (MEG), a neuroimaging device capable of measuring brain activity with
 high temporal resolution ( ≈5,000 Hz).,
 For this, we develop an MEG decoding
 model trained with both contrastive and regression objectives and consis

# LangChain Recursive Character Text Splitter

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Initialize the text splitter with custom parameters
custom_text_splitter = RecursiveCharacterTextSplitter(
    # Set custom chunk size
    # (measured with `length_function`, i.e. in characters here)
    chunk_size = 100,
    # Requested overlap between consecutive chunks, in the same unit as chunk_size
    chunk_overlap  = 20,
    # Use length of the text as the size measure
    length_function = len,
    # Newline is the only separator offered to the splitter.
    # NOTE(review): presumably this means text is only ever cut at line breaks, so a
    # single line longer than chunk_size cannot be split further and the overlap only
    # applies in whole-line units — confirm against the LangChain documentation.
    separators=['\n']

)

# Create the chunks
# (`sample` is the character window taken from the extracted PDF text in an earlier cell)
texts = custom_text_splitter.create_documents([sample])

# Print the first two chunks
# (indexes 0 and 1 are accessed directly, so this assumes at least two chunks were produced)
print(f'### Chunk 1: \n\n{texts[0].page_content}\n\n=====\n')
print(f'### Chunk 2: \n\n{texts[1].page_content}\n\n=====')

### Chunk 1: 

mbeddings obtained from the image, ii) an MEG

=====

### Chunk 2: 

module trained end-to-end and iii) a pretrained image generator. Our results are

=====


# Implementation of Semantic Chunking

In [9]:
import numpy as np
import spacy

# Load the spaCy model
# NOTE(review): the spaCy sentence-splitter cell above already binds `nlp` to this same
# model, so this reload looks redundant when the cells are run in order.
nlp = spacy.load('en_core_web_sm')

def process(text):
    """Split ``text`` into sentences and build one unit-length vector per sentence.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    sentence vector is divided by its own norm so that the dot product of two
    rows can be used as a cosine similarity.

    Args:
        text: The string to analyse.

    Returns:
        A ``(sents, vecs)`` tuple: ``sents`` is the list of sentence spans
        from ``doc.sents`` and ``vecs`` is a 2-D array with one row per
        sentence. When no sentences are found, ``sents`` is empty and
        ``vecs`` is an empty array of shape ``(0, 0)``.
    """
    doc = nlp(text)
    sents = list(doc.sents)

    # np.stack raises ValueError on an empty sequence, so handle "no sentences" explicitly.
    if not sents:
        return sents, np.empty((0, 0))

    rows = []
    for sent in sents:
        norm = sent.vector_norm
        if norm:
            rows.append(sent.vector / norm)
        else:
            # A zero norm would turn the row into NaN via division by zero;
            # keep an all-zero row so later similarity checks stay well defined.
            rows.append(np.zeros_like(sent.vector))
    vecs = np.stack(rows)

    return sents, vecs

def cluster_text(sents, vecs, threshold):
    """Group consecutive sentences into clusters of sentence indices.

    Sentences are walked in order; a new cluster is started whenever the dot
    product between a sentence vector and the previous sentence vector is
    below ``threshold``. Raising ``threshold`` therefore produces more,
    smaller clusters.

    Args:
        sents: Sequence of sentences; only its length is used.
        vecs: Indexable collection of sentence vectors aligned with ``sents``
            (assumed unit-normalized so the dot product is a cosine
            similarity — this function does not check it).
        threshold: Minimum similarity for a sentence to stay in the current cluster.

    Returns:
        A list of clusters, each a list of consecutive indices into
        ``sents``. Empty input yields an empty list.
    """
    # Without this guard an empty input would yield [[0]], i.e. a cluster
    # pointing at a sentence that does not exist.
    if len(sents) == 0:
        return []

    clusters = [[0]]
    for i in range(1, len(sents)):
        if np.dot(vecs[i], vecs[i - 1]) < threshold:
            clusters.append([])
        clusters[-1].append(i)

    return clusters

def clean_text(text):
    """Placeholder cleaning hook: currently returns ``text`` unchanged."""
    # Add your text cleaning process here
    return text

# Chunks shorter than MIN_CHUNK_LEN or longer than MAX_CHUNK_LEN characters are not kept
MIN_CHUNK_LEN = 60
MAX_CHUNK_LEN = 3000

# Similarity threshold for the first clustering pass, and the higher one used
# to re-split clusters that came out too long (higher threshold => more splits).
# The second value has its own name so the loop never overwrites `threshold`.
threshold = 0.3
RECLUSTER_THRESHOLD = 0.6

# Initialize the clusters lengths list and final texts list
clusters_lens = []
final_texts = []

# Process the full text
sents, vecs = process(text)

# Cluster the sentences
clusters = cluster_text(sents, vecs, threshold)

for cluster in clusters:
    cluster_txt = clean_text(' '.join([sents[i].text for i in cluster]))
    cluster_len = len(cluster_txt)

    # Skip clusters that are too short
    if cluster_len < MIN_CHUNK_LEN:
        continue

    # Clusters within bounds are kept as they are
    if cluster_len <= MAX_CHUNK_LEN:
        clusters_lens.append(cluster_len)
        final_texts.append(cluster_txt)
        continue

    # The cluster is too long: re-run the pipeline on it with the higher threshold
    sents_div, vecs_div = process(cluster_txt)
    reclusters = cluster_text(sents_div, vecs_div, RECLUSTER_THRESHOLD)

    for subcluster in reclusters:
        div_txt = clean_text(' '.join([sents_div[i].text for i in subcluster]))
        div_len = len(div_txt)

        # NOTE: sub-clusters that are still out of bounds are dropped, so their
        # text does not appear in final_texts.
        if div_len < MIN_CHUNK_LEN or div_len > MAX_CHUNK_LEN:
            continue

        clusters_lens.append(div_len)
        final_texts.append(div_txt)

final_texts

['BRAIN DECODING :TOWARD REAL -TIME\nRECONSTRUCTION OF VISUAL PERCEPTION\nYohann Benchetrit1,∗, Hubert Banville1,∗, Jean-R ´emi King1,2\n1FAIR, Meta,2Laboratoire des Syst `emes Perceptifs, ´Ecole Normale Sup ´erieure, PSL University\n{ybenchetrit,hubertjb,jeanremi }@meta.com\nABSTRACT\nIn the past five years, the use of generative and foundational AI systems has\ngreatly improved the decoding of brain activity. Visual perception, in particular,\ncan now be decoded from functional Magnetic Resonance Imaging (fMRI) with\nremarkable fidelity. This neuroimaging technique, however, suffers from a lim-\nited temporal resolution ( ≈0.5 Hz) and thus fundamentally constrains its real-time\nusage. Here, we propose an alternative approach based on magnetoencephalog-\nraphy (MEG), a neuroimaging device capable of measuring brain activity with\nhigh temporal resolution ( ≈5,000 Hz). For this, we develop an MEG decoding\nmodel trained with both contrastive and regression objectives and consisting of