# Mountain NER Dataset Preparation

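Convert the raw CSV data (sentences plus character-offset markers for mountain names) into token-level NER format with BIO tags (`B-MOUNTAIN`, `I-MOUNTAIN`, `O`), then split it into train and validation sets.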

## Setup

In [1]:
import pandas as pd
import ast
import random
from collections import Counter

# Seed Python's global RNG so that calls to the `random` module in later cells
# give the same results on every fresh top-to-bottom run.
random.seed(42)

## Load Raw Data

In [5]:
# Read the raw markup CSV and show a quick summary (size, columns, first row).
RAW_CSV_PATH = './data/raw/mountain_dataset_with_markup.csv'
df = pd.read_csv(RAW_CSV_PATH)

first_row = df.iloc[0]
column_names = df.columns.tolist()

print(f"Dataset size: {len(df)}")
print(f"\nColumns: {column_names}")
print(f"\nFirst row:")
print(first_row)

Dataset size: 1584

Columns: ['text', 'marker']

First row:
text      A visit to a science museum for hands-on learn...
marker                                                   []
Name: 0, dtype: object


## Parse Markers

In [22]:
def parse_marker(marker_str):
    """Parse marker string to extract character positions.

    Args:
        marker_str: Raw value of the ``marker`` column. Expected to be the
            text of a Python literal such as ``"[(4, 24)]"`` (a list of
            ``(start, end)`` character offsets) or ``"[]"``.

    Returns:
        The parsed literal, or ``[]`` when the value is empty, is not a
        string (e.g. ``None`` or a NaN from a missing CSV cell), or is not a
        valid Python literal.
    """
    # Missing cells are not strings (pandas yields NaN); treat them as "no markers"
    # explicitly instead of relying on literal_eval to fail.
    if not isinstance(marker_str, str):
        return []

    stripped = marker_str.strip()
    if not stripped or stripped == '[]':
        return []

    try:
        return ast.literal_eval(stripped)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) must propagate.
        return []

# Test: parse one known row and show the substring each marker points at
sample_row = df.iloc[22]
text = sample_row['text']
markers = parse_marker(sample_row['marker'])

print(f"Text: {text}")
print(f"Markers: {markers}")

# `or []` keeps the loop a no-op when nothing was parsed
for start, end in (markers or []):
    print(f"  Mountain: '{text[start:end]}'")

Text: The Carpathian Mountains are a vital part of Europe's natural heritage.
Markers: [(4, 24)]
  Mountain: 'Carpathian Mountains'


## Convert to BIO Tags

In [23]:
def create_ner_tags(text, markers):
    """Convert text and character markers to NER BIO tags.

    The text is split on whitespace. Every token that overlaps a marker span
    is tagged: the first overlapping token gets ``B-MOUNTAIN``, the following
    ones ``I-MOUNTAIN``. All other tokens are tagged ``O``.

    Args:
        text: The raw sentence.
        markers: Sequence of ``(start, end)`` character offsets into ``text``
            (``end`` exclusive, i.e. ``text[start:end]`` is the entity).
            May be empty.

    Returns:
        A ``(tokens, ner_tags)`` tuple of two equally long lists.
    """
    tokens = text.split()
    ner_tags = ['O'] * len(tokens)

    if not markers:
        return tokens, ner_tags

    # Locate each token in the original string. `split()` collapses any run
    # of whitespace (double spaces, tabs, newlines, leading blanks), so
    # assuming "one space between tokens" would make the offsets drift away
    # from the marker offsets, which refer to the raw text.
    token_positions = []
    search_from = 0
    for token in tokens:
        token_start = text.find(token, search_from)
        token_end = token_start + len(token)
        token_positions.append((token_start, token_end))
        search_from = token_end

    # Process markers left to right; a token belongs to an entity when its
    # character span overlaps the marker span.
    for start, end in sorted(markers, key=lambda span: span[0]):
        entity_tokens = [
            i for i, (token_start, token_end) in enumerate(token_positions)
            if token_start < end and token_end > start
        ]
        for j, token_idx in enumerate(entity_tokens):
            ner_tags[token_idx] = 'B-MOUNTAIN' if j == 0 else 'I-MOUNTAIN'

    return tokens, ner_tags

# Test: tag the sample sentence parsed above and list one token per line
tokens, tags = create_ner_tags(text, markers)

print("Token-level NER tags:")
for word, label in zip(tokens, tags):
    print(f"  {word:20s} {label}")

Token-level NER tags:
  The                  O
  Carpathian           B-MOUNTAIN
  Mountains            I-MOUNTAIN
  are                  O
  a                    O
  vital                O
  part                 O
  of                   O
  Europe's             O
  natural              O
  heritage.            O


## Process Full Dataset

In [24]:
# Build one {'tokens', 'ner_tags'} record per CSV row; rows that yield no
# tokens are dropped.
processed_data = []

for raw_text, raw_marker in zip(df['text'], df['marker']):
    sample_tokens, sample_tags = create_ner_tags(raw_text, parse_marker(raw_marker))
    if not sample_tokens:
        continue
    processed_data.append({
        'tokens': sample_tokens,
        'ner_tags': sample_tags
    })

print(f"Processed {len(processed_data)} samples")

Processed 1584 samples


## Dataset Statistics

In [25]:
# Count tags across the whole dataset
all_tags = []
for item in processed_data:
    all_tags.extend(item['ner_tags'])
tag_counts = Counter(all_tags)
total_tags = len(all_tags)

print("Tag distribution:")
for tag, count in tag_counts.items():
    print(f"  {tag}: {count} ({count/total_tags*100:.1f}%)")

# Count entities: every entity starts with exactly one B- tag
num_entities = tag_counts['B-MOUNTAIN']
samples_with_entities = len(
    [item for item in processed_data if 'B-MOUNTAIN' in item['ner_tags']]
)

print(f"\nTotal entities: {num_entities}")
print(f"Samples with entities: {samples_with_entities}/{len(processed_data)}")
print(f"Average tokens per sample: {total_tags/len(processed_data):.1f}")

Tag distribution:
  O: 21932 (98.2%)
  B-MOUNTAIN: 235 (1.1%)
  I-MOUNTAIN: 168 (0.8%)

Total entities: 235
Samples with entities: 226/1584
Average tokens per sample: 14.1


## Tokenization Example

In [27]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Example with subword tokens
example = processed_data[22]
tokens, labels = example['tokens'], example['ner_tags']

print(f"Original tokens: {tokens}")
print(f"Original labels: {labels}")

# Tokenize the pre-split words; word_ids() maps each subword back to its word
encoding = tokenizer(tokens, is_split_into_words=True, return_tensors='pt')
word_ids = encoding.word_ids()
bert_tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

print(f"\nBERT tokens: {bert_tokens}")
print(f"Word IDs: {word_ids}")

# Align labels: only the first subword of each word keeps the word's label;
# special tokens (word id None) and continuation subwords get -100.
label2id = {'O': 0, 'B-MOUNTAIN': 1, 'I-MOUNTAIN': 2}
aligned_labels = []

for position, word_idx in enumerate(word_ids):
    starts_new_word = position == 0 or word_idx != word_ids[position - 1]
    if word_idx is not None and starts_new_word:
        aligned_labels.append(label2id[labels[word_idx]])
    else:
        aligned_labels.append(-100)

print(f"\nAligned labels: {aligned_labels}")
print("(-100 = ignored in loss calculation)")

Original tokens: ['The', 'Carpathian', 'Mountains', 'are', 'a', 'vital', 'part', 'of', "Europe's", 'natural', 'heritage.']
Original labels: ['O', 'B-MOUNTAIN', 'I-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

BERT tokens: ['[CLS]', 'The', 'Car', '##pathian', 'Mountains', 'are', 'a', 'vital', 'part', 'of', 'Europe', "'", 's', 'natural', 'heritage', '.', '[SEP]']
Word IDs: [None, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 9, 10, 10, None]

Aligned labels: [-100, 0, 1, -100, 2, 0, 0, 0, 0, 0, 0, -100, -100, 0, 0, -100, -100]
(-100 = ignored in loss calculation)


## Train/Val Split

In [28]:
# Shuffle a *copy* with a dedicated, seeded RNG. Shuffling `processed_data`
# in place with the global RNG made this cell non-idempotent: every re-run
# produced a different split and silently reordered the list that earlier
# cells index into (e.g. `processed_data[22]`).
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42

shuffled_data = list(processed_data)
random.Random(SPLIT_SEED).shuffle(shuffled_data)
split_idx = int(len(shuffled_data) * TRAIN_FRACTION)

train_data = shuffled_data[:split_idx]
val_data = shuffled_data[split_idx:]

print(f"Train: {len(train_data)} samples")
print(f"Val: {len(val_data)} samples")

# Statistics per split: one B-MOUNTAIN tag per entity
train_entities = sum(item['ner_tags'].count('B-MOUNTAIN') for item in train_data)
val_entities = sum(item['ner_tags'].count('B-MOUNTAIN') for item in val_data)

print(f"\nTrain entities: {train_entities}")
print(f"Val entities: {val_entities}")

Train: 1267 samples
Val: 317 samples

Train entities: 189
Val entities: 46


## Examples

In [29]:
print("Sample examples with mountains:\n")

# Lazily pick training samples that contain at least one entity; pairing with
# range(1, 4) stops after the third one.
mountain_items = (item for item in train_data if 'B-MOUNTAIN' in item['ner_tags'])

for example_number, item in zip(range(1, 4), mountain_items):
    print(f"Example {example_number}:")
    print(f"  Text: {' '.join(item['tokens'])}")
    print(f"  Tags: {item['ner_tags']}")
    print()

Sample examples with mountains:

Example 1:
  Text: The Himalayas are a sacred place for many people, and there are many monasteries and temples to be found in the region. #tibetanadventure #mountainspirit
  Tags: ['O', 'B-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Example 2:
  Text: Gaze upon the snow-capped peaks of Mount Rainier, a dormant volcano in Washington state, and feel the power of nature's slumbering giant. #mountainviews #washingtonwonders
  Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'I-MOUNTAIN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Example 3:
  Text: Embarking on an unforgettable journey through the rugged terrain of the Appalachian Mountains #hikingtrail #mountainexplorer
  Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MOUNTAIN', 'I-MOUNTAIN', 'O', 'O']

