## Import libraries

In [1]:
import numpy as np
import pandas as pd
import re
import string
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import json
import pickle
import warnings

import os
import random

warnings.filterwarnings('ignore')

# Download required NLTK data
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davepipon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Develop Name Entity Recognition for Conditional Random Field

In [2]:
class CRFCitationNER:
    def __init__(self):
        """
        Initialize the CRF-based NER system for citation identification.
        """
        self.model = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        
        # Common dataset indicators and patterns
        self.dataset_indicators = {
            'survey_terms': ['survey', 'census', 'longitudinal', 'panel', 'cohort'],
            'data_terms': ['data', 'dataset', 'database', 'repository', 'archive'],
            'source_terms': ['bureau', 'agency', 'administration', 'institute', 'center'],
            'acronyms': ['CPS', 'ACS', 'NLSY', 'PSID', 'GSS', 'NHANES', 'SIPP']
        }
        
        # BIO tagging scheme: B-DATASET, I-DATASET, O
        self.label_encoder = LabelEncoder()
        
    def preprocess_text(self, text):
        """
        Preprocess text for better feature extraction.
        """
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text.strip())
        return text
    
    def extract_word_features(self, sent, i):
        """
        Extract comprehensive features for word at position i in sentence.
        Features are designed specifically for dataset citation recognition.
        """
        word = sent[i]
        features = {
            # Basic word features
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],  # suffix
            'word[-2:]': word[-2:],  # suffix
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
            'word.isalpha()': word.isalpha(),
            'word.isalnum()': word.isalnum(),
            'len(word)': len(word),
            
            # Dataset-specific features
            'contains.digit': any(c.isdigit() for c in word),
            'contains.hyphen': '-' in word,
            'contains.underscore': '_' in word,
            'contains.parentheses': '(' in word or ')' in word,
            'all.caps': word.isupper() and len(word) > 1,
            
            # Acronym detection
            'is.acronym': (word.isupper() and len(word) >= 2 and len(word) <= 6),
            'known.dataset.acronym': word.upper() in self.dataset_indicators['acronyms'],
            
            # Dataset indicator terms
            'is.survey.term': word.lower() in self.dataset_indicators['survey_terms'],
            'is.data.term': word.lower() in self.dataset_indicators['data_terms'],
            'is.source.term': word.lower() in self.dataset_indicators['source_terms'],
            
            # Punctuation features
            'has.punctuation': any(c in string.punctuation for c in word),
            'ends.with.period': word.endswith('.'),
            'ends.with.comma': word.endswith(','),
            
            # Morphological features
            'stemmed': self.stemmer.stem(word.lower()),
            'lemmatized': self.lemmatizer.lemmatize(word.lower()),
        }
        
        # Context features (previous and next words)
        if i > 0:
            word1 = sent[i-1]
            features.update({
                '-1:word.lower()': word1.lower(),
                '-1:word.istitle()': word1.istitle(),
                '-1:word.isupper()': word1.isupper(),
                '-1:is.survey.term': word1.lower() in self.dataset_indicators['survey_terms'],
                '-1:is.data.term': word1.lower() in self.dataset_indicators['data_terms'],
                '-1:is.source.term': word1.lower() in self.dataset_indicators['source_terms'],
            })
        else:
            features['BOS'] = True  # Beginning of sentence
            
        if i < len(sent)-1:
            word1 = sent[i+1]
            features.update({
                '+1:word.lower()': word1.lower(),
                '+1:word.istitle()': word1.istitle(),
                '+1:word.isupper()': word1.isupper(),
                '+1:is.survey.term': word1.lower() in self.dataset_indicators['survey_terms'],
                '+1:is.data.term': word1.lower() in self.dataset_indicators['data_terms'],
                '+1:is.source.term': word1.lower() in self.dataset_indicators['source_terms'],
            })
        else:
            features['EOS'] = True  # End of sentence
            
        # Bigram features
        if i > 0:
            features['-1,0:words'] = f"{sent[i-1].lower()}_{word.lower()}"
        if i < len(sent)-1:
            features['0,+1:words'] = f"{word.lower()}_{sent[i+1].lower()}"
            
        # Window features (±2 words)
        if i > 1:
            features['-2:word.lower()'] = sent[i-2].lower()
        if i < len(sent)-2:
            features['+2:word.lower()'] = sent[i+2].lower()
            
        return features
    
    def sent2features(self, sent):
        """Convert sentence to feature vectors."""
        return [self.extract_word_features(sent, i) for i in range(len(sent))]
    
    def sent2labels(self, sent_labels):
        """Extract labels from sentence."""
        return sent_labels
    
    def sent2tokens(self, sent):
        """Extract tokens from sentence."""
        return sent
    
    def create_training_data_from_text(self, text_samples, annotations):
        """
        Create training data from text samples and their annotations.
        
        Args:
            text_samples: List of text strings
            annotations: List of annotations in format [{'start': int, 'end': int, 'label': str}]
        
        Returns:
            X_train, y_train: Features and labels for training
        """
        X, y = [], []
        
        for text, annots in zip(text_samples, annotations):
            # Tokenize text
            tokens = word_tokenize(text)
            labels = ['O'] * len(tokens)
            
            # Convert character-based annotations to token-based labels
            char_to_token = self._create_char_to_token_mapping(text, tokens)
            
            for annot in annots:
                start_char, end_char = annot['start'], annot['end']
                entity_type = annot.get('label', 'DATASET')
                
                # Find corresponding tokens
                start_token = char_to_token.get(start_char)
                end_token = char_to_token.get(end_char - 1)  # end_char is exclusive
                
                if start_token is not None and end_token is not None:
                    # Apply BIO tagging
                    labels[start_token] = f'B-{entity_type}'
                    for i in range(start_token + 1, end_token + 1):
                        if i < len(labels):
                            labels[i] = f'I-{entity_type}'
            
            X.append(tokens)
            y.append(labels)
        
        return X, y
    
    def _create_char_to_token_mapping(self, text, tokens):
        """Create mapping from character positions to token indices."""
        char_to_token = {}
        char_idx = 0
        
        for token_idx, token in enumerate(tokens):
            # Find token in text starting from current position
            while char_idx < len(text) and text[char_idx].isspace():
                char_idx += 1
            
            # Map all characters in this token
            token_start = char_idx
            for i in range(len(token)):
                if char_idx < len(text):
                    char_to_token[char_idx] = token_idx
                    char_idx += 1
        
        return char_to_token
    
    def prepare_training_data(self, X_tokens, y_labels):
        """
        Prepare feature vectors and labels for CRF training.
        """
        X_features = [self.sent2features(sent) for sent in X_tokens]
        y_formatted = [self.sent2labels(labels) for labels in y_labels]
        
        return X_features, y_formatted
    
    def train(self, X_tokens, y_labels):
        """
        Train the CRF model.
        
        Args:
            X_tokens: List of tokenized sentences
            y_labels: List of label sequences (BIO format)
        """
        print("Preparing training data...")
        X_train, y_train = self.prepare_training_data(X_tokens, y_labels)
        
        print("Training CRF model...")
        self.model.fit(X_train, y_train)
        
        print("Training completed!")
        return self
    
    def predict(self, X_tokens):
        """
        Predict labels for tokenized sentences.
        """
        X_features = [self.sent2features(sent) for sent in X_tokens]
        return self.model.predict(X_features)
    
    def predict_text(self, text):
        """
        Predict entities in raw text.
        
        Returns:
            List of dictionaries with 'text', 'label', 'start', 'end'
        """
        sentences = sent_tokenize(text)
        entities = []
        char_offset = 0
        
        for sentence in sentences:
            # Find sentence position in original text
            sent_start = text.find(sentence, char_offset)
            if sent_start == -1:
                sent_start = char_offset
                
            tokens = word_tokenize(sentence)
            if not tokens:
                char_offset = sent_start + len(sentence)
                continue
                
            predictions = self.predict([tokens])[0]
            
            # Convert predictions back to entities
            current_entity = None
            token_start = sent_start
            
            for token, label in zip(tokens, predictions):
                # Find token position in sentence
                token_pos = sentence.find(token, token_start - sent_start)
                if token_pos != -1:
                    token_abs_start = sent_start + token_pos
                    token_abs_end = token_abs_start + len(token)
                else:
                    # Fallback: approximate position
                    token_abs_start = token_start
                    token_abs_end = token_start + len(token)
                
                if label.startswith('B-'):
                    # Start of new entity
                    if current_entity:
                        entities.append(current_entity)
                    current_entity = {
                        'text': token,
                        'label': label[2:],
                        'start': token_abs_start,
                        'end': token_abs_end
                    }
                elif label.startswith('I-') and current_entity:
                    # Continue current entity
                    current_entity['text'] += f" {token}"
                    current_entity['end'] = token_abs_end
                else:
                    # End current entity
                    if current_entity:
                        entities.append(current_entity)
                        current_entity = None
                
                token_start = token_abs_end
            
            # Don't forget the last entity
            if current_entity:
                entities.append(current_entity)
            
            char_offset = sent_start + len(sentence)
        
        return entities
    
    def evaluate(self, X_test_tokens, y_test_labels):
        """
        Evaluate the model performance.
        """
        X_test, y_test = self.prepare_training_data(X_test_tokens, y_test_labels)
        y_pred = self.model.predict(X_test)
        
        # Flatten for evaluation
        y_true_flat = [label for sent_labels in y_test for label in sent_labels]
        y_pred_flat = [label for sent_labels in y_pred for label in sent_labels]
        
        # Print detailed metrics
        print("Classification Report:")
        print(classification_report(y_true_flat, y_pred_flat))
        
        # Entity-level metrics
        print("\nEntity-level F1 scores:")
        labels = list(self.model.classes_)
        for label in labels:
            if label != 'O':
                f1 = f1_score(y_true_flat, y_pred_flat, labels=[label], average='micro')
                print(f"{label}: {f1:.4f}")
        
        # Overall performance
        overall_f1 = f1_score(y_true_flat, y_pred_flat, average='weighted')
        print(f"\nOverall weighted F1: {overall_f1:.4f}")
        
        return {
            'classification_report': classification_report(y_true_flat, y_pred_flat, output_dict=True),
            'overall_f1': overall_f1,
            'predictions': y_pred
        }
    
    def save_model(self, filepath):
        """Save the trained model."""
        model_data = {
            'model': self.model,
            'dataset_indicators': self.dataset_indicators,
        }
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        print(f"Model saved to {filepath}")
    
    def load_model(self, filepath):
        """Load a trained model."""
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        self.model = model_data['model']
        self.dataset_indicators = model_data['dataset_indicators']
        print(f"Model loaded from {filepath}")

## Randomly sample trianing and test data

In [3]:
# Paths
train_csv = "/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train.csv"
docs_dir = "/Users/davepipon/Desktop/DS397 Data/coleridgeinitiative-show-us-the-data/train/"

df = pd.read_csv(train_csv)

# Shuffle rows to ensure randomness
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

text_samples = []
annotations = []

# Collect annotated samples
for _, row in df.iterrows():
    doc_id = row["Id"]
    dataset_label = row["dataset_label"]

    doc_path = os.path.join(docs_dir, f"{doc_id}.json")
    if not os.path.exists(doc_path):
        continue

    with open(doc_path, "r") as f:
        doc_json = json.load(f)

    for section in doc_json:
        text = section.get("text", "")
        if dataset_label.lower() in text.lower():
            # record sample
            text_samples.append(text)

            # find all mentions of dataset_label
            annots = []
            start = 0
            while True:
                idx = text.lower().find(dataset_label.lower(), start)
                if idx == -1:
                    break
                annots.append({
                    "start": idx,
                    "end": idx + len(dataset_label),
                    "label": "DATASET"
                })
                start = idx + len(dataset_label)

            annotations.append(annots)

# Randomly select 1000 samples
sample_indices = random.sample(range(len(text_samples)), min(1000, len(text_samples)))
text_samples = [text_samples[i] for i in sample_indices]
annotations = [annotations[i] for i in sample_indices]

print(f"Randomly sampled {len(text_samples)} annotated texts.")


Randomly sampled 1000 annotated texts.


## Training and testing the model

In [4]:
# Initialize CRF model
ner = CRFCitationNER()

# Convert texts + annotations → token/label sequences
X_tokens, y_labels = ner.create_training_data_from_text(text_samples, annotations)

# Train/test split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X_tokens, y_labels, test_size=0.2, random_state=42)

# Train model
ner.train(X_train, y_train)

# Evaluate
results = ner.evaluate(X_test, y_test)


Preparing training data...
Training CRF model...
Training completed!
Classification Report:
              precision    recall  f1-score   support

   B-DATASET       0.77      0.76      0.77       375
   I-DATASET       0.76      0.71      0.73       469
           O       1.00      1.00      1.00    123005

    accuracy                           1.00    123849
   macro avg       0.84      0.82      0.83    123849
weighted avg       1.00      1.00      1.00    123849


Entity-level F1 scores:
B-DATASET: 0.7668
I-DATASET: 0.7341

Overall weighted F1: 0.9969
