# Clean markdown: caption removal, paragraph stitching, and sentence splitting

In [1]:
from pathlib import Path
import re

In [2]:
# Resolve paths relative to the kernel's current working directory; the project
# root is taken to be its parent (assumes the kernel is started from a direct
# subfolder of the project -- TODO confirm).
notebook_dir = Path.cwd()
PROJECT_ROOT = notebook_dir.parent

In [3]:
# Base name (no extension) of the pre-cleaned textbook export.
filename = "PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar"
# Base name of the variant this notebook works on (presumably produced by an
# earlier picture-removal step -- verify).
filename_removedpictures = filename + "_removedpictures"
# Path.with_suffix replaces whatever follows the last "." in the final path
# component, so this relies on the base name containing no dots.
destination_file_removedpictures= (PROJECT_ROOT / "data" / "processed" / "neuroscience" / filename_removedpictures).with_suffix(".md")

## Helper Functions (The Logic Core)

In [4]:
# --- REGEX PATTERNS FOR WHOLE LINES ---
# Each entry is (pattern, replacement); clean_whole_lines drops any line that
# matches one of the patterns (it searches with re.IGNORECASE).
# The number token after FIGURE/TABLE must contain at least one digit (the
# lookahead enforces this). Without it, "[\d\.\-]+" is satisfied by a lone
# "-" or ".", so ordinary prose such as "Table-top experiments ..." or
# "Tab. keys ..." would be mistaken for a caption and deleted.
LINE_ARTIFACT_PATTERNS = [
    (r'^\s*(FIGURE|FIG|Fig|TABLE|TAB|Tab)(\.?)\s*(?=[\d\.\-]*\d)[\d\.\-]+\s*.*$', ''),
    (r'^\s*(Source|Credit|Data from):.*$', ''),
]

# --- PATTERNS FOR INLINE REMOVAL ---
# Parenthesised figure/table references such as "(Fig. 1.2)". The same
# "at least one digit" rule applies so that "(Table -)" style text is kept.
INLINE_REMOVAL_PATTERNS = [
    r'\(\s*(FIGURE|FIG|Fig|TABLE|TAB|Tab)\.?\s*(?=[\d\.\-]*\d)[\d\.\-]+\s*\)',
]

def clean_whole_lines(text):
    """
    Drop every line that looks like a figure/table caption or a credit line.

    A line is discarded when any regex in LINE_ARTIFACT_PATTERNS matches it
    (searched case-insensitively). The remaining lines are re-joined with a
    newline, in their original order. The replacement half of each pattern
    tuple is not used here.
    """
    def is_artifact(candidate):
        # True as soon as one artifact pattern matches the line.
        return any(
            re.search(regex, candidate, flags=re.IGNORECASE)
            for regex, _ in LINE_ARTIFACT_PATTERNS
        )

    return "\n".join(
        candidate for candidate in text.split('\n') if not is_artifact(candidate)
    )

def fix_broken_chapters(text):
    """
    Re-assemble chapter headings that were split across several lines.

    Two layouts are rewritten into a single level-1 heading:

    Numbered chapter:
      CHAPTER
      ## Motivation and Reward
      14
      -> # CHAPTER 14 Motivation and Reward

    Unnumbered / intro chapter:
      ## CHAPTER
      ## Introduction
      -> # CHAPTER Introduction

    Returns the text with every such occurrence replaced; anything else is
    left untouched.
    """
    # Applied in order: the numbered layout first, then the "## CHAPTER" layout.
    # Both regexes use inline multiline mode so ^ and $ anchor on each line.
    rewrite_rules = (
        # bare "CHAPTER" line, then "## Title", then a line holding only digits
        (r'(?m)^\s*CHAPTER\s*$\n+\s*^##\s*(.+?)\s*$\n+\s*^(\d+)\s*$', r'# CHAPTER \2 \1'),
        # "## CHAPTER" line followed by a "## Title" line
        (r'(?m)^\s*##\s*CHAPTER\s*$\n+\s*^##\s*(.+?)\s*$', r'# CHAPTER \1'),
    )

    for broken_header, stitched_header in rewrite_rules:
        text = re.sub(broken_header, stitched_header, text)

    return text

def clean_inline_formatting(text):
    """
    Tidy spacing and strip specific OCR debris inside a single paragraph.

    Steps, in order:
      1. Replace literal "/u2014.d" and "/u2014" artifacts with " - ".
      2. Delete every match of INLINE_REMOVAL_PATTERNS (case-insensitive)
         together with the whitespace in front of it, so no stray space is
         left before following punctuation ("text (Fig. 1.2)." -> "text.").
      3. Re-join words split after a hyphen ("mu- opioid" -> "mu-opioid"),
         except suspended hyphens followed by a conjunction such as
         "short- and long-term", which are left as written.
      4. Collapse every whitespace run to one space.

    Returns the cleaned text with leading/trailing whitespace removed.
    """
    # 1. Fix /u2014 artifacts (longer variant first so its ".d" tail is consumed)
    text = text.replace('/u2014.d', ' - ')
    text = text.replace('/u2014', ' - ')

    # 2. Remove inline patterns plus the whitespace that preceded them.
    #    The pattern is wrapped in a group so a top-level alternation in a
    #    pattern cannot bind to the added "\s*" prefix.
    for pattern in INLINE_REMOVAL_PATTERNS:
        text = re.sub(r'\s*(?:' + pattern + r')', '', text, flags=re.IGNORECASE)

    # 3. Fix 'mu- opioid' -> 'mu-opioid', but keep suspended hyphens
    #    ('pre- and post-synaptic') intact via the negative lookahead.
    text = re.sub(r'(\w+)-\s+(?!(?:and|or|nor|to|versus)\b)(\w+)', r'\1-\2', text)

    # 4. Collapse multiple spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def consolidate_broken_paragraphs(paragraphs):
    """
    Merge paragraph fragments that were split by blank lines mid-sentence.

    Each fragment is stripped; empty fragments are ignored. A fragment is
    appended (with a single space) to the paragraph being built when either
      * the paragraph so far does not end in terminal punctuation
        (. ! ? : ; " or a closing curly quote), or
      * the fragment starts with a lowercase letter.
    Otherwise the paragraph so far is emitted and the fragment starts a new one.

    Args:
        paragraphs: iterable of paragraph strings.

    Returns:
        list of merged paragraph strings (empty list for empty input). Merged
        paragraphs never start with whitespace, even when the first fragments
        are blank.
    """
    # ',' and '-' are deliberately absent: a buffer ending in a connector is
    # "not terminal" and therefore always merges with what follows.
    terminal_marks = ('.', '!', '?', ':', ';', '"', '”')

    merged = []
    buffer = ""

    for raw in paragraphs:
        current_p = raw.strip()
        if not current_p:
            continue

        # First non-empty fragment: start the buffer directly instead of
        # joining onto an empty string (which would add a leading space).
        if not buffer:
            buffer = current_p
            continue

        if not buffer.endswith(terminal_marks) or current_p[0].islower():
            buffer += " " + current_p
        else:
            merged.append(buffer)
            buffer = current_p

    if buffer:
        merged.append(buffer)

    return merged

## Step 1: Manual cleaning

In [5]:
def run_step_1_manual_cleaning(file_path):
    """
    Read a markdown file and return a cleaned, re-flowed version of its text.

    Pipeline:
      1. fix_broken_chapters on the raw text (must run before anything else so
         the multi-line chapter layout is still intact).
      2. clean_whole_lines to drop caption / credit lines.
      3. Split on markdown header lines; headers are emitted as-is, surrounded
         by blank lines.
      4. Body text between headers is split on blank lines, stitched with
         consolidate_broken_paragraphs, flattened to one line per paragraph and
         passed through clean_inline_formatting. Empty results are skipped.

    Args:
        file_path: path of the UTF-8 markdown file to read.

    Returns:
        The reconstructed markdown as a single string.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_markdown = source.read()

    # Chapter stitching first, then caption removal.
    stitched_markdown = fix_broken_chapters(raw_markdown)
    without_captions = clean_whole_lines(stitched_markdown)

    # The capturing group keeps the header lines in the split result.
    pieces = re.split(r'(^#+\s+.*$)', without_captions, flags=re.MULTILINE)

    output_parts = []
    for piece in map(str.strip, pieces):
        if not piece:
            continue

        if piece.startswith('#'):
            # Header: pass through untouched, padded with blank lines.
            output_parts.append(f"\n\n{piece}\n\n")
        else:
            # Body text: stitch fragments, then clean each paragraph.
            for paragraph in consolidate_broken_paragraphs(piece.split('\n\n')):
                cleaned = clean_inline_formatting(paragraph.replace('\n', ' '))
                if cleaned:
                    output_parts.append(f"{cleaned}\n\n")

    return "".join(output_parts)

In [6]:
# Input for Step 1: the "_removedpictures" markdown under
# data/processed/neuroscience (built with the same expression as
# destination_file_removedpictures).
input_file_path = (PROJECT_ROOT / "data" / "processed" / "neuroscience" / filename_removedpictures).with_suffix(".md")
# File 1: The result of Manual Cleaning (Regex + Paragraph Stitching)
output_manual_clean = (PROJECT_ROOT / "data" / "processed" / "neuroscience" / (filename_removedpictures + "_STEP1_manual")).with_suffix(".md")

In [7]:
# --- EXECUTION OF STEP 1 ---
# Keep the cleaned text in memory; the Step 2 cell reads cleaned_markdown_text
# directly rather than re-reading the saved file.
cleaned_markdown_text = run_step_1_manual_cleaning(input_file_path)

# Save to file (overwrites any existing Step 1 output at this path)
with open(output_manual_clean, 'w', encoding='utf-8') as f:
    f.write(cleaned_markdown_text)

print(f"STEP 1 COMPLETE. Cleaned Markdown saved to: {output_manual_clean}")

STEP 1 COMPLETE. Cleaned Markdown saved to: /home/fliperbaker/projects/rag1/rag1-mini/data/processed/neuroscience/PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar_removedpictures_STEP1_manual.md


## Step 2: spaCy sentence separation

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first
# import cell so all dependencies are visible up front.
import spacy
# Load the Scientific SpaCy model
# disable=["ner"] makes it faster; we only need the parser for sentence boundaries.
# NOTE(review): "en_core_sci_sm" is a separately installed model package and
# spacy.load fails if it is missing -- confirm the environment provides it.
print("Loading SciSpaCy model...")
nlp = spacy.load("en_core_sci_sm", disable=["ner"])

Loading SciSpaCy model...


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [13]:
def run_step_2_spacy_processing(clean_text):
    """
    Split cleaned markdown into paragraph chunks with sentence lists and context.

    The text is split on markdown header lines. Headers only update the running
    context: a "# " header sets the chapter and clears the section, while any
    header starting with "##" sets the section. Each blank-line-separated
    paragraph of body text is parsed with the module-level `nlp` pipeline and
    turned into a dict with:
      - "context":       "<chapter>" or "<chapter> > <section>"
      - "text":          the paragraph as given
      - "sentences":     stripped text of every sentence found by the parser
      - "num_sentences": length of that list

    Args:
        clean_text: markdown text, already cleaned by Step 1.

    Returns:
        list of chunk dicts in document order.
    """
    chapter = "Unknown Chapter"
    section = ""
    chunks = []

    # The capturing group keeps header lines in the split output.
    for raw_piece in re.split(r'(^#+\s+.*$)', clean_text, flags=re.MULTILINE):
        piece = raw_piece.strip()
        if not piece:
            continue

        if piece.startswith('#'):
            # Header: update context trackers, emit nothing.
            title = piece.lstrip('#').strip()
            if piece.startswith('# '):
                chapter, section = title, ""  # new chapter resets the section
            elif piece.startswith('##'):
                section = title
            continue

        # Context cannot change inside a body piece, so build it once here.
        context = f"{chapter} > {section}" if section else f"{chapter}"

        for block in piece.split('\n\n'):
            block = block.strip()
            if not block:
                continue

            # Let the parser decide sentence boundaries (e.g. around "et al.").
            sentences = [span.text.strip() for span in nlp(block).sents]

            chunks.append({
                "context": context,
                "text": block,
                "sentences": sentences,
                "num_sentences": len(sentences),
            })

    return chunks

In [10]:
# File 2: The result of SpaCy Processing (Metadata + Sentence Counts),
# written as a markdown report next to the Step 1 output.
output_spacy_processed = (PROJECT_ROOT / "data" / "processed" / "neuroscience" / (filename_removedpictures + "_STEP2_spacy")).with_suffix(".md")


In [14]:
# 1. Sentence-split the Step 1 text and attach chapter/section context.
final_chunks = run_step_2_spacy_processing(cleaned_markdown_text)

# 2. Render one markdown block per chunk: rule, heading, context, count, bullets.
output_lines = [f"# Spacy Processed Analysis: {filename}\n"]

for i, chunk in enumerate(final_chunks):
    output_lines += [
        "---",
        f"### Chunk {i+1}",
        f"**Context:** `{chunk['context']}`",
        f"**Sentences:** {chunk['num_sentences']}",
        "",
    ]

    # One bullet per sentence found by the parser.
    for sent in chunk['sentences']:
        output_lines.append(f"- {sent}")

    # A bare newline entry becomes a blank gap once the lines are joined.
    output_lines.append("\n")

# Save to file
with open(output_spacy_processed, 'w', encoding='utf-8') as f:
    f.write("\n".join(output_lines))

print(f"STEP 2 COMPLETE. Processed Analysis saved to: {output_spacy_processed}")

STEP 2 COMPLETE. Processed Analysis saved to: /home/fliperbaker/projects/rag1/rag1-mini/data/processed/neuroscience/PRECLEAN_Brain_and_behavior_a_cognitive_neuroscience_perspective_David_Eagleman_Jonathan_Downar_removedpictures_STEP2_spacy.md


## Step 3: Post-cleaning sentences

In [15]:
def post_process_sentences(chunks):
    """
    Drop sentences that look like extraction debris and rebuild each chunk.

    A stripped sentence is rejected, with the first matching reason, when it
      1. has fewer than two whitespace-separated words,
      2. starts with a lowercase letter (digits and symbols are accepted), or
      3. does not end with one of: . ? ! " ” ) ]

    Chunks left with no sentences are dropped. Surviving chunks are shallow
    copies with 'sentences', 'num_sentences' and 'text' (kept sentences joined
    by a space) replaced; the input chunks are not modified.

    Returns:
      - the list of cleaned chunks
      - a list of "[reason]  sentence" strings for manual inspection
    """
    # Closing quotes and brackets count, e.g. a sentence ending in "(1999)".
    accepted_endings = ('.', '?', '!', '"', '”', ')', ']')

    def rejection_reason(sentence):
        # Returns the reason text for a bad sentence, or None to keep it.
        if len(sentence.split()) < 2:
            return "Too short (< 2 words)"
        # At least two words here, so indexing the first character is safe.
        if sentence[0].islower():
            return "Starts with lowercase"
        if not sentence.endswith(accepted_endings):
            return "No ending punctuation"
        return None

    surviving_chunks = []
    rejected_entries = []

    for chunk in chunks:
        kept = []
        for raw_sentence in chunk['sentences']:
            sentence = raw_sentence.strip()
            why = rejection_reason(sentence)
            if why:
                rejected_entries.append(f"[{why}]  {sentence}")
            else:
                kept.append(sentence)

        if not kept:
            continue  # nothing valid left in this chunk

        rebuilt = chunk.copy()
        rebuilt.update(
            sentences=kept,
            num_sentences=len(kept),
            text=" ".join(kept),
        )
        surviving_chunks.append(rebuilt)

    return surviving_chunks, rejected_entries

In [19]:
# --- CONFIGURATION ---
# Destination of the Step 3 (post-processed) markdown report.
output_post_processed = (PROJECT_ROOT / "data" / "processed" / "neuroscience" / (filename_removedpictures + "_STEP3_post_processed")).with_suffix(".md")

# --- EXECUTION ---
# Filter the Step 2 chunks; keep the removal log for manual review.
final_clean_chunks, deletion_log = post_process_sentences(final_chunks)

# --- REPORTING ---
print("--- POST-PROCESSING REPORT ---")
print(f"Original Chunks: {len(final_chunks)}")
print(f"Final Chunks:    {len(final_clean_chunks)}")
print(f"Sentences Removed: {len(deletion_log)}")
print("\n--- DELETED SENTENCES LOG (Check if any were valid!) ---")

for entry in deletion_log:
    print(entry)

# --- SAVING ---
# Same per-chunk layout as the Step 2 report: rule, heading, context, count,
# then one bullet per surviving sentence.
output_lines = [f"# Final Cleaned Analysis: {filename}\n"]

for i, chunk in enumerate(final_clean_chunks):
    output_lines += [
        "---",
        f"### Chunk {i+1}",
        f"**Context:** `{chunk['context']}`",
        f"**Sentences:** {chunk['num_sentences']}",
        "",
    ]

    for sent in chunk['sentences']:
        output_lines.append(f"- {sent}")

with open(output_post_processed, 'w', encoding='utf-8') as f:
    f.write("\n".join(output_lines))

print(f"\nSUCCESS. Final file saved to: {output_post_processed}")

--- POST-PROCESSING REPORT ---
Original Chunks: 2625
Final Chunks:    2524
Sentences Removed: 341

--- DELETED SENTENCES LOG (Check if any were valid!) ---
[No ending punctuation]  A Cognitive Neuroscience Perspective
[No ending punctuation]  ``` PREFACE xv ABOUT THE AUTHORS xxi PART I THE BASICS CHAPTER 1 Introduction 2 CHAPTER 2 The Brain and Nervous System 36 CHAPTER 3 Neurons and Synapses 74 CHAPTER 4 Neuroplasticity 102 PART II HOW THE BRAIN INTERACTS WITH THE WORLD CHAPTER 5 Vision 130 CHAPTER 6 Other Senses 162 CHAPTER 7 The Motor System 196 PART III HIGHER LEVELS OF INTERACTION CHAPTER 8 Attention and Consciousness 232 CHAPTER 9 Memory 270 CHAPTER 10 Sleep 308 CHAPTER 11 Language and Lateralization 336 PART IV MOTIVATED BEHAVIORS CHAPTER 12 Decision Making 362 CHAPTER 13 Emotions 398 CHAPTER 14 Motivation and Reward 438 CHAPTER 15 Social Cognition 472 PART V DISORDERS OF BRAIN AND BEHAVIOR CHAPTER 16 Neurological and Psychiatric Disorders 514 GLOSSARY 556 REFERENCES 585 CREDITS

In [18]:
# Spot-check one processed chunk (index 500 is an arbitrary sample and needs
# at least 501 chunks to exist).
# NOTE(review): the saved output shows context "CHAPTER Introduction" for text
# about plasticity and pruning -- verify that numbered chapter headers are
# being stitched and tracked as intended.
final_clean_chunks[500]

{'context': 'CHAPTER Introduction > Neurons Compete for Limited Space',
 'text': "Plasticity from world experience also involves a good deal of pruning (retraction of axonal branches) and cell death. Cells can die in one of two ways: necrosis (in an uncontrolled fashion) or apoptosis (in a deliberate, controlled fashion). The controlled process of apoptosis avoids collateral damage to neighbors, and it is a common sculpting mechanism in embryonic development. For example, the process of turning a human embryo's webbed hand into a baby's clearly defined fingers depends on sculpting away cells, not adding them (Kuida et al., 1998). The same principles may apply to the development of the brain. During development, 50% more neurons than needed are produced (Low &amp; Cheng, 2006). Massive die-off is standard operating procedure: neurons die because of failure to compete for chemicals provided by targets. Remember the experiment that removed one-half of the frog's tectum and resulted in a c