In [None]:
from pathlib import Path

In [None]:
notebook_dir = Path.cwd()
PROJECT_ROOT = notebook_dir.parent

## scispacy to sementation, cleaning and named entity recognition

In [None]:
filename = "ch1_ch14_Brain_and_behavior"
filename_removedpictures = filename + "_removedpictures"
destination_file_removedpictures= (PROJECT_ROOT / "data" / "processed" / "neuroscience" / filename_removedpictures).with_suffix(".md")

In [None]:
### **Model 1: `en_core_sci_sm` (General Scientific)**
### **Model 2: `en_ner_bc5cdr_md` (Biomedical)**
### **Model 3: `en_ner_bionlp13cg_md` (Biological Processes)**
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz

# print("‚úÖ All libraries installed!")

In [27]:
import sys
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

Python version: 3.11.14 (main, Oct 21 2025, 18:31:21) [GCC 11.2.0]
Python executable: /home/fliperbaker/miniconda3/envs/rag1-mini/bin/python


In [29]:
# CELL 2: Import All Required Libraries
import spacy  # Main NLP library
import json   # For saving structured data
import re     # For text pattern matching and cleaning
from collections import defaultdict  # For organizing entities by type
from pathlib import Path  # For file path handling

In [30]:
# CELL 3: Load Your Raw Markdown File
# This is STEP 1: Raw Markdown (600 pages)

# ============================================
# CONFIGURATION: Change these to match your files
# ============================================
MARKDOWN_FILE = destination_file_removedpictures  # Your markdown file path
BOOK_NAME = "Brain and Behaviour- Daniel Eagleman"  # Name for metadata

# Read the markdown file
with open(MARKDOWN_FILE, 'r', encoding='utf-8') as f:
    raw_markdown = f.read()

# Display basic statistics about the raw file
print("=" * 70)
print("RAW MARKDOWN FILE LOADED")
print("=" * 70)
print(f"Book: {BOOK_NAME}")
print(f"Total characters: {len(raw_markdown):,}")
print(f"Total lines: {len(raw_markdown.splitlines()):,}")
print(f"File size: {len(raw_markdown) / 1024:.2f} KB")
print()

# Show a sample of the raw content (first 500 characters)
print("SAMPLE OF RAW CONTENT (first 500 characters):")
print("-" * 70)
print(raw_markdown[:500])
print("-" * 70)
print()

# Show some problematic areas (if they exist)
# Count excessive spaces
excessive_spaces = len(re.findall(r'\s{3,}', raw_markdown))
print(f"‚ö†Ô∏è  Found {excessive_spaces:,} instances of 3+ consecutive spaces")

# Count excessive line breaks
excessive_breaks = len(re.findall(r'\n{4,}', raw_markdown))
print(f"‚ö†Ô∏è  Found {excessive_breaks:,} instances of 4+ consecutive line breaks")

# Count spaces before punctuation
spaces_before_punct = len(re.findall(r'\s+[.,;:!?]', raw_markdown))
print(f"‚ö†Ô∏è  Found {spaces_before_punct:,} spaces before punctuation")

print("\n‚úÖ Raw markdown loaded and analyzed!")

RAW MARKDOWN FILE LOADED
Book: Brain and Behaviour- Daniel Eagleman
Total characters: 241,622
Total lines: 1,084
File size: 235.96 KB

SAMPLE OF RAW CONTENT (first 500 characters):
----------------------------------------------------------------------
## CHAPTER

## Introduction

STARTING OUT: A Spark of Awe in the Darkness

Who Are We?

In Pursuit of Principles

How We Know What We Know

RESEARCH METHODS: Magnetic Resonance Imaging

Thinking Critically about the Brain

The Big Questions in Cognitive Neuroscience

The Payoffs of Cognitive Neuroscience

1

## STARTING OUT:

## A Spark of Awe in the Darkness

On October 9, 1604, a brilliant spark of light grew to life in the darkness of the  night  sky  over  Europe.  A  few days later, the ast
----------------------------------------------------------------------

‚ö†Ô∏è  Found 112 instances of 3+ consecutive spaces
‚ö†Ô∏è  Found 0 instances of 4+ consecutive line breaks
‚ö†Ô∏è  Found 56 spaces before punctuation

‚úÖ Raw markdown loade

In [31]:
# CELL 4: Clean the Text
# This is STEP 2: Clean Text (remove noise)

def clean_text(text):
    """
    Clean common formatting issues in markdown text
    
    Args:
        text (str): Raw text with potential formatting issues
    
    Returns:
        str: Cleaned text
    """
    
    # CLEANING STEP 1: Fix excessive whitespace between words
    # Example: "the    neuron" ‚Üí "the neuron"
    text = re.sub(r'(\w)\s{2,}(\w)', r'\1 \2', text)
    
    # CLEANING STEP 2: Fix multiple consecutive line breaks
    # Example: "\n\n\n\n" ‚Üí "\n\n"
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    # CLEANING STEP 3: Remove spaces before punctuation
    # Example: "word ." ‚Üí "word."
    text = re.sub(r'\s+([.,;:!?])', r'\1', text)
    
    # CLEANING STEP 4: Remove leading/trailing whitespace
    text = text.strip()
    
    return text

In [32]:
# Apply cleaning function
cleaned_text = clean_text(raw_markdown)

# Show the improvements
print("=" * 70)
print("TEXT CLEANING RESULTS")
print("=" * 70)
print(f"Original length: {len(raw_markdown):,} characters")
print(f"Cleaned length: {len(cleaned_text):,} characters")
print(f"Characters removed: {len(raw_markdown) - len(cleaned_text):,}")
print()

# Compare before and after on the same sample
sample_start = 1000  # Start at character 1000 to show a middle section
sample_length = 300

print("BEFORE CLEANING:")
print("-" * 70)
print(raw_markdown[sample_start:sample_start + sample_length])
print("-" * 70)
print()

print("AFTER CLEANING:")
print("-" * 70)
print(cleaned_text[sample_start:sample_start + sample_length])
print("-" * 70)
print()

print("‚úÖ Text cleaned successfully!")
print("üíæ Cleaned text stored in variable: cleaned_text")

TEXT CLEANING RESULTS
Original length: 241,622 characters
Cleaned length: 234,757 characters
Characters removed: 6,865

BEFORE CLEANING:
----------------------------------------------------------------------
  has  appeared  in  our  skies  to surpass  it  since  then,  even  four centuries later.

Today's astronomers would have called Kepler's star a supernova and could have told him some astonishing details about the nature of the object that captured his attention on that clear night so long  ago  ( 
----------------------------------------------------------------------

AFTER CLEANING:
----------------------------------------------------------------------
kies to surpass it since then,  even four centuries later.

Today's astronomers would have called Kepler's star a supernova and could have told him some astonishing details about the nature of the object that captured his attention on that clear night so long ago  ( FIGURE 1.1 ).  They could have to
-------------------------------

In [33]:
# CELL 5: Load the Three SciSpacy Models
# This prepares us for STEP 3: SciSpacy Processing

print("Loading SciSpacy models (this may take 1-2 minutes)...")
print()

# MODEL 1: General scientific text processor
# Purpose: Sentence segmentation + general scientific entities
print("üì¶ Loading Model 1: en_core_sci_sm (General Scientific)")
print("   - This model understands scientific text structure")
print("   - It will break text into sentences")
print("   - It recognizes general scientific terms")
nlp_base = spacy.load("en_core_sci_sm")
print("   ‚úì Loaded\n")

# MODEL 2: Biomedical entities (diseases and chemicals)
# Purpose: Find diseases, symptoms, and chemical compounds
print("üì¶ Loading Model 2: en_ner_bc5cdr_md (Diseases & Chemicals)")
print("   - Trained on biomedical literature")
print("   - Recognizes: diseases, symptoms, drugs, chemicals")
nlp_biomed = spacy.load("en_ner_bc5cdr_md")
print("   ‚úì Loaded\n")

# MODEL 3: Biological processes and proteins
# Purpose: Find biological processes, molecular functions
print("üì¶ Loading Model 3: en_ner_bionlp13cg_md (Biological Processes)")
print("   - Trained on molecular biology papers")
print("   - Recognizes: proteins, genes, cellular processes")
nlp_bio = spacy.load("en_ner_bionlp13cg_md")
print("   ‚úì Loaded\n")

print("=" * 70)
print("‚úÖ All 3 SciSpacy models loaded and ready!")
print("=" * 70)
print()

# Show what each model can do
print("MODEL CAPABILITIES:")
print()
print("Model 1 (nlp_base) can identify:")
print("  - Sentences, tokens, parts of speech")
print("  - General scientific entities")
print()
print("Model 2 (nlp_biomed) specializes in:")
print("  - DISEASE: Parkinson's, amnesia, depression")
print("  - CHEMICAL: dopamine, serotonin, glucose")
print()
print("Model 3 (nlp_bio) specializes in:")
print("  - PROTEIN: sodium-potassium pump, receptors")
print("  - PROCESS: neurotransmission, metabolism")

Loading SciSpacy models (this may take 1-2 minutes)...

üì¶ Loading Model 1: en_core_sci_sm (General Scientific)
   - This model understands scientific text structure
   - It will break text into sentences
   - It recognizes general scientific terms


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


   ‚úì Loaded

üì¶ Loading Model 2: en_ner_bc5cdr_md (Diseases & Chemicals)
   - Trained on biomedical literature
   - Recognizes: diseases, symptoms, drugs, chemicals
   ‚úì Loaded

üì¶ Loading Model 3: en_ner_bionlp13cg_md (Biological Processes)
   - Trained on molecular biology papers
   - Recognizes: proteins, genes, cellular processes
   ‚úì Loaded

‚úÖ All 3 SciSpacy models loaded and ready!

MODEL CAPABILITIES:

Model 1 (nlp_base) can identify:
  - Sentences, tokens, parts of speech
  - General scientific entities

Model 2 (nlp_biomed) specializes in:
  - DISEASE: Parkinson's, amnesia, depression
  - CHEMICAL: dopamine, serotonin, glucose

Model 3 (nlp_bio) specializes in:
  - PROTEIN: sodium-potassium pump, receptors
  - PROCESS: neurotransmission, metabolism


In [None]:
# CELL 6: Extract Sentences
# This is STEP 3a: Sentence segmentation

print("Processing text to extract sentences...")
print("(This may take a few minutes for a 600-page book)")
print()

# Process the cleaned text with the base model
# This creates a spaCy "Doc" object that contains linguistic information
doc_base = nlp_base(cleaned_text)

# Extract all sentences into a list
sentences = []
for i, sent in enumerate(doc_base.sents):
    sentence_data = {
        "id": i,  # Unique sentence ID (0, 1, 2, ...)
        "text": sent.text.strip(),  # The actual sentence text
        "start_char": sent.start_char,  # Where it starts in the original text
        "end_char": sent.end_char,  # Where it ends
        "token_count": len(sent)  # Number of words/tokens
    }
    sentences.append(sentence_data)

In [36]:
# Display statistics
print("=" * 70)
print("SENTENCE SEGMENTATION RESULTS")
print("=" * 70)
print(f"Total sentences extracted: {len(sentences):,}")
print(f"Average sentence length: {sum(s['token_count'] for s in sentences) / len(sentences):.1f} tokens")
print(f"Shortest sentence: {min(s['token_count'] for s in sentences)} tokens")
print(f"Longest sentence: {max(s['token_count'] for s in sentences)} tokens")
print()

# Show first 5 sentences as examples
print("FIRST 5 SENTENCES:")
print("-" * 70)
for sent in sentences[:5]:
    print(f"[Sentence {sent['id']}] ({sent['token_count']} tokens)")
    print(f"  {sent['text']}")  
    print(f"  Position: characters {sent['start_char']}-{sent['end_char']}")
    print()

# Show a few sentences from the middle of the book
middle_index = len(sentences) // 2
print(f"3 SENTENCES FROM MIDDLE OF BOOK (around sentence {middle_index}):")
print("-" * 70)
for sent in sentences[middle_index:middle_index+3]:
    print(f"[Sentence {sent['id']}]")
    print(f"  {sent['text']}")
    print()

print("‚úÖ Sentence segmentation complete!")
print("üíæ Sentences stored in variable: sentences (list of dictionaries)")

SENTENCE SEGMENTATION RESULTS
Total sentences extracted: 1,443
Average sentence length: 30.8 tokens
Shortest sentence: 3 tokens
Longest sentence: 247 tokens

FIRST 5 SENTENCES:
----------------------------------------------------------------------
[Sentence 0] (21 tokens)
  ## CHAPTER

## Introduction STARTING OUT: A Spark of Awe in the Darkness Who Are We?
  Position: characters 0-84

[Sentence 1] (116 tokens)
  In Pursuit of Principles How We Know What We Know RESEARCH METHODS: Magnetic Resonance Imaging Thinking Critically about the Brain The Big Questions in Cognitive Neuroscience The Payoffs of Cognitive Neuroscience 1

## STARTING OUT:

## A Spark of Awe in the Darkness On October 9, 1604, a brilliant spark of light grew to life in the darkness of the night sky over Europe.  A few days later, the astronomer Johannes Kepler began to gaze up at the new star that had appeared in the void, outshining all its peers, visible for a time even through the brightness of the day.
  Position

In [37]:
# CELL 7: Extract Entities with Model 1 (General Scientific)
# This is STEP 3b: Entity recognition (Part 1)

print("Extracting entities with Model 1 (General Scientific)...")
print()

# The doc_base we created earlier already contains entity information
# Now we'll extract it into a structured format

entities_model1 = []

for ent in doc_base.ents:
    entity_data = {
        "text": ent.text,  # The actual text of the entity
        "label": ent.label_,  # What type of entity it is
        "start_char": ent.start_char,  # Position in original text
        "end_char": ent.end_char,
        "model": "general_sci"  # Which model found it
    }
    entities_model1.append(entity_data)

# Analyze what we found
print("=" * 70)
print("MODEL 1 (GENERAL SCIENTIFIC) RESULTS")
print("=" * 70)
print(f"Total entities found: {len(entities_model1):,}")
print()

# Group entities by type to see what categories we found
entity_types = defaultdict(list)
for ent in entities_model1:
    entity_types[ent['label']].append(ent['text'])

print(f"Entity types found: {len(entity_types)}")
print()

# Show top entity types by frequency
print("TOP ENTITY TYPES:")
print("-" * 70)
sorted_types = sorted(entity_types.items(), key=lambda x: len(x[1]), reverse=True)
for label, texts in sorted_types[:10]:  # Show top 10 types
    unique_terms = len(set(texts))
    print(f"{label:20} {len(texts):5} mentions ({unique_terms} unique terms)")

print()

# Show examples for the most common entity types
print("EXAMPLES FROM TOP 3 ENTITY TYPES:")
print("-" * 70)
for label, texts in sorted_types[:3]:
    print(f"\n{label}:")
    # Show first 5 unique examples
    unique_examples = list(set(texts))[:5]
    for example in unique_examples:
        print(f"  ‚Ä¢ {example}")

print()
print("‚úÖ Model 1 entity extraction complete!")
print("üíæ Entities stored in variable: entities_model1")

Extracting entities with Model 1 (General Scientific)...

MODEL 1 (GENERAL SCIENTIFIC) RESULTS
Total entities found: 9,869

Entity types found: 1

TOP ENTITY TYPES:
----------------------------------------------------------------------
ENTITY                9869 mentions (4603 unique terms)

EXAMPLES FROM TOP 3 ENTITY TYPES:
----------------------------------------------------------------------

ENTITY:
  ‚Ä¢ imprinted
  ‚Ä¢ emergence
  ‚Ä¢ Halaas
  ‚Ä¢ bunker
  ‚Ä¢ Seymour

‚úÖ Model 1 entity extraction complete!
üíæ Entities stored in variable: entities_model1


In [38]:
# CELL 8: Extract Entities with Model 2 (Diseases & Chemicals)
# This is STEP 3b: Entity recognition (Part 2)

print("Extracting entities with Model 2 (Biomedical - Diseases & Chemicals)...")
print("(This processes the text again with a specialized model)")
print()

# Process the same cleaned text with the biomedical model
doc_biomed = nlp_biomed(cleaned_text)

# Extract entities from this model
entities_model2 = []

for ent in doc_biomed.ents:
    entity_data = {
        "text": ent.text,
        "label": ent.label_,
        "start_char": ent.start_char,
        "end_char": ent.end_char,
        "model": "biomedical"
    }
    entities_model2.append(entity_data)

# Analyze results
print("=" * 70)
print("MODEL 2 (BIOMEDICAL) RESULTS")
print("=" * 70)
print(f"Total entities found: {len(entities_model2):,}")
print()

# Group by type
entity_types_biomed = defaultdict(list)
for ent in entities_model2:
    entity_types_biomed[ent['label']].append(ent['text'])

print("ENTITY TYPES FOUND:")
print("-" * 70)
for label, texts in entity_types_biomed.items():
    unique_terms = len(set(texts))
    print(f"{label:20} {len(texts):5} mentions ({unique_terms} unique terms)")

print()

# Show specific examples
print("EXAMPLES OF DISEASES FOUND:")
print("-" * 70)
if 'DISEASE' in entity_types_biomed:
    diseases = list(set(entity_types_biomed['DISEASE']))[:10]
    for disease in diseases:
        print(f"  ‚Ä¢ {disease}")
else:
    print("  (No diseases found with this label)")

print()

print("EXAMPLES OF CHEMICALS FOUND:")
print("-" * 70)
if 'CHEMICAL' in entity_types_biomed:
    chemicals = list(set(entity_types_biomed['CHEMICAL']))[:10]
    for chemical in chemicals:
        print(f"  ‚Ä¢ {chemical}")
else:
    print("  (No chemicals found with this label)")

print()
print("‚úÖ Model 2 entity extraction complete!")
print("üíæ Entities stored in variable: entities_model2")

Extracting entities with Model 2 (Biomedical - Diseases & Chemicals)...
(This processes the text again with a specialized model)

MODEL 2 (BIOMEDICAL) RESULTS
Total entities found: 429

ENTITY TYPES FOUND:
----------------------------------------------------------------------
DISEASE                215 mentions (138 unique terms)
CHEMICAL               214 mentions (61 unique terms)

EXAMPLES OF DISEASES FOUND:
----------------------------------------------------------------------
  ‚Ä¢ deficit in memory recollection
  ‚Ä¢ movement disorders
  ‚Ä¢ peacefulness
  ‚Ä¢ major depression
  ‚Ä¢ Schizophrenia
  ‚Ä¢ compulsive shopping
  ‚Ä¢ nausea and vomiting
  ‚Ä¢ delirium
  ‚Ä¢ malnutrition
  ‚Ä¢ teasing

EXAMPLES OF CHEMICALS FOUND:
----------------------------------------------------------------------
  ‚Ä¢ smoking
  ‚Ä¢ Higgins
  ‚Ä¢ smoke
  ‚Ä¢ metallurgy
  ‚Ä¢ take-
  ‚Ä¢ buprenorphine
  ‚Ä¢ varenicline
  ‚Ä¢ psychotomimetic
  ‚Ä¢ bupropion
  ‚Ä¢ heroin

‚úÖ Model 2 entity extraction 

In [39]:
# CELL 9: Extract Entities with Model 3 (Biological Processes)
# This is STEP 3b: Entity recognition (Part 3)

print("Extracting entities with Model 3 (Biological Processes & Proteins)...")
print()

# Process with the biological model
doc_bio = nlp_bio(cleaned_text)

# Extract entities
entities_model3 = []

for ent in doc_bio.ents:
    entity_data = {
        "text": ent.text,
        "label": ent.label_,
        "start_char": ent.start_char,
        "end_char": ent.end_char,
        "model": "biological"
    }
    entities_model3.append(entity_data)

# Analyze results
print("=" * 70)
print("MODEL 3 (BIOLOGICAL PROCESSES) RESULTS")
print("=" * 70)
print(f"Total entities found: {len(entities_model3):,}")
print()

# Group by type
entity_types_bio = defaultdict(list)
for ent in entities_model3:
    entity_types_bio[ent['label']].append(ent['text'])

print("ENTITY TYPES FOUND:")
print("-" * 70)
for label, texts in sorted(entity_types_bio.items(), key=lambda x: len(x[1]), reverse=True):
    unique_terms = len(set(texts))
    print(f"{label:25} {len(texts):5} mentions ({unique_terms} unique)")

print()

# Show examples of different biological entity types
print("SAMPLE ENTITIES BY TYPE:")
print("-" * 70)
for label, texts in list(entity_types_bio.items())[:5]:  # Show first 5 types
    print(f"\n{label}:")
    unique_examples = list(set(texts))[:5]
    for example in unique_examples:
        print(f"  ‚Ä¢ {example}")

print()
print("‚úÖ Model 3 entity extraction complete!")
print("üíæ Entities stored in variable: entities_model3")

Extracting entities with Model 3 (Biological Processes & Proteins)...

MODEL 3 (BIOLOGICAL PROCESSES) RESULTS
Total entities found: 1,608

ENTITY TYPES FOUND:
----------------------------------------------------------------------
ORGAN                       392 mentions (37 unique)
ORGANISM                    269 mentions (69 unique)
SIMPLE_CHEMICAL             209 mentions (74 unique)
CELL                        123 mentions (28 unique)
CANCER                      119 mentions (54 unique)
GENE_OR_GENE_PRODUCT        118 mentions (74 unique)
TISSUE                       90 mentions (32 unique)
PATHOLOGICAL_FORMATION       69 mentions (37 unique)
MULTI_TISSUE_STRUCTURE       54 mentions (34 unique)
CELLULAR_COMPONENT           53 mentions (29 unique)
ORGANISM_SUBSTANCE           49 mentions (8 unique)
ORGANISM_SUBDIVISION         30 mentions (11 unique)
ANATOMICAL_SYSTEM            28 mentions (13 unique)
IMMATERIAL_ANATOMICAL_ENTITY     3 mentions (2 unique)
DEVELOPING_ANATOMICAL_STRUC