In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
import json
import ast
import random
from spacy.training.example import Example
from pathlib import Path
import os

# Locate the directory this code runs from: the file's own folder when a
# __file__ global exists, otherwise the current working directory.
if '__file__' in globals():
    current_dir = Path(__file__).parent
else:
    current_dir = Path.cwd()

# The data folder sits next to the source folder (one level up, then 'data').
# It is created on first use so later cells can write into it.
DATA_DIR = current_dir.parent / 'data'
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Seed Python's and NumPy's global random generators for reproducibility.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Report the resolved location so a wrong working directory is spotted early.
print(f"Data directory path: {DATA_DIR}")
print(f"Data directory exists: {DATA_DIR.exists()}")

In [None]:
# Read the train / validation / test splits from the data directory.
# The generator is consumed left to right, so the files are read in the
# same order as the names they are bound to.
df_train, df_valid, df_test = (
    pd.read_excel(DATA_DIR / filename)
    for filename in (
        'final_train_data.xlsx',
        'final_valid_data.xlsx',
        'final_test_data.xlsx',
    )
)

# Function to convert string lists to actual lists
def convert_string_to_list(string_list):
    """Parse a stringified Python list (e.g. ``"['a', 'b']"``) into a list.

    The function is best-effort: anything that cannot be interpreted as a
    list yields an empty list instead of raising.

    Args:
        string_list: Usually a string holding a Python list literal. A value
            that is already a list or tuple is accepted too, which makes the
            conversion safe to apply more than once to the same column.

    Returns:
        list: The parsed elements, a copy of the input when it was already a
        list/tuple, or ``[]`` when the value is missing, malformed, or
        parses to something that is not a list/tuple.
    """
    # Already-parsed values pass through; without this, applying the function
    # a second time would replace every row with [].
    if isinstance(string_list, (list, tuple)):
        return list(string_list)

    try:
        parsed = ast.literal_eval(string_list)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on bad
        # input (including None / NaN cells); anything else propagates.
        return []

    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A valid literal that is not a sequence (e.g. "5") is not a usable row.
    return []

# Turn the stringified 'tokens' and 'tags' columns of every split into real
# lists. Order matches the original: train, valid, test; tokens before tags.
for frame in (df_train, df_valid, df_test):
    for column in ('tokens', 'tags'):
        frame[column] = frame[column].apply(convert_string_to_list)

# Summarise how many rows each split contains.
print("Dataset dimensions:")
print(f"Train: {len(df_train)} samples")
print(f"Valid: {len(df_valid)} samples")
print(f"Test: {len(df_test)} samples")

Veri seti boyutları:
Train: 59924 örnek
Valid: 8528 örnek
Test: 8262 örnek


In [None]:
# Check label distribution
def check_label_distribution(df, name):
    """Print how often each tag value occurs in ``df['tags']``.

    All per-row tag sequences are flattened into one list, the distinct
    values are counted with ``np.unique`` (which returns them sorted), and one
    line per value is printed with its count and its percentage of all tags.

    Args:
        df: Object whose ``'tags'`` entry iterates over per-row tag sequences.
        name: Dataset name used in the printed header.

    Returns:
        None. The report is written to standard output.
    """
    flattened = [tag for row_tags in df['tags'] for tag in row_tags]
    labels, frequencies = np.unique(flattened, return_counts=True)
    grand_total = frequencies.sum()

    print(f"\nLabel distribution in {name} dataset:")
    for position in range(len(labels)):
        label_value = labels[position]
        occurrences = frequencies[position]
        share = (occurrences / grand_total) * 100
        print(f"Label {label_value}: {occurrences} occurrences ({share:.2f}%)")

print("\nLabel distribution after filtering:")
# Same report for every split, in train / validation / test order.
for frame, split_name in (
    (df_train, "Train"),
    (df_valid, "Validation"),
    (df_test, "Test"),
):
    check_label_distribution(frame, split_name)

In [None]:
# Load label file and create label mappings.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open(DATA_DIR / 'label.json', 'r', encoding='utf-8') as f:
    label_dict = json.load(f)

# Prepare label names for SpaCy (B-PERSON -> PERSON etc.).
# Maps each non-'O' tag id to its entity type. Only the FIRST hyphen is
# treated as the prefix separator, so a type that itself contains hyphens
# (e.g. B-WORK-OF-ART -> WORK-OF-ART) is kept whole instead of truncated.
spacy_labels = {
    idx: label.split('-', 1)[1]
    for label, idx in label_dict.items()
    if label != 'O'
}

print("\nSpaCy label mappings:")
for idx, label in spacy_labels.items():
    print(f"{idx}: {label}")

# Create an empty SpaCy model
nlp = spacy.blank("en")  # Blank model for English

# Add the NER component, or fetch the existing one so that `ner` is always
# bound (previously it stayed undefined when the pipe was already present).
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register labels in sorted order. Iterating a set of strings directly has a
# hash-dependent order that can differ between kernel sessions, which would
# make the label registration order (and so the run) non-reproducible.
for label in sorted(set(spacy_labels.values())):
    ner.add_label(label)

print("\nNER labels added.")


SpaCy etiket eşleştirmeleri:
1: DATE
2: DATE
3: PERSON
4: PERSON
5: ORG
6: ORG
7: PERCENT
8: PERCENT
9: MONEY
10: MONEY

NER etiketleri eklendi.

NER etiketleri eklendi.


In [None]:
# Convert data to SpaCy format
def convert_to_spacy_format(df, label_map=None):
    """Convert token/tag rows into spaCy ``(text, {"entities": [...]})`` pairs.

    Each row's tokens are joined with single spaces to form the text, and the
    row's tag ids are decoded into character-offset entity spans. Tags are
    assumed to follow the B-/I- (BIO) naming scheme: a ``B-X`` tag opens a
    span and directly following ``I-X`` tags of the same type extend it, so a
    multi-token entity becomes ONE span rather than one span per token. An
    ``I-X`` tag with no matching open span starts a new span.

    Args:
        df: Object whose ``'tokens'`` and ``'tags'`` entries iterate over
            per-row lists (tokens are strings, tags are ids).
        label_map: Optional mapping from tag id to full tag name, e.g.
            ``{0: 'O', 3: 'B-PERSON', 4: 'I-PERSON'}``. When omitted it is
            derived by inverting the notebook-level ``label_dict``.

    Returns:
        list[tuple[str, dict]]: One ``(text, {"entities": [(start_char,
        end_char, entity_type), ...]})`` pair per row, spans in text order.

    Notes:
        * Tag ids missing from ``label_map`` are treated as outside ('O')
          instead of producing a span with a ``None`` label.
        * Tokens and tags are paired positionally; surplus tokens are kept in
          the text without a span and surplus tags are ignored.
    """
    if label_map is None:
        # label_dict maps tag name -> id; decoding needs the reverse.
        label_map = {idx: tag_name for tag_name, idx in label_dict.items()}

    training_data = []

    for tokens, tags in zip(df['tokens'], df['tags']):
        text = ' '.join(tokens)
        entities = []
        open_span = None  # [start_char, end_char, entity_type] being built
        offset = 0        # character position where the next token starts

        for token, tag in zip(tokens, tags):
            start_char = offset
            end_char = start_char + len(token)
            offset = end_char + 1  # +1 for the joining space

            tag_name = label_map.get(tag, 'O')
            if tag_name == 'O':
                # An outside token closes whatever span was open.
                if open_span is not None:
                    entities.append(tuple(open_span))
                    open_span = None
                continue

            prefix, separator, entity_type = tag_name.partition('-')
            if not separator:
                # No B-/I- prefix: treat the whole name as a span-opening type.
                prefix, entity_type = 'B', tag_name

            continues_span = (
                prefix == 'I'
                and open_span is not None
                and open_span[2] == entity_type
            )
            if continues_span:
                open_span[1] = end_char
            else:
                if open_span is not None:
                    entities.append(tuple(open_span))
                open_span = [start_char, end_char, entity_type]

        # Flush a span that runs to the end of the row.
        if open_span is not None:
            entities.append(tuple(open_span))

        training_data.append((text, {"entities": entities}))

    return training_data

# Build spaCy-format data for the training and validation splits
# (train first, then validation).
train_data, valid_data = (
    convert_to_spacy_format(frame) for frame in (df_train, df_valid)
)

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(valid_data)}")

# Peek at the first training example as a sanity check.
first_text, first_annotations = train_data[0]
print("\nExample data:")
print("Text:", first_text)
print("Entities:", first_annotations['entities'])

Eğitim örnekleri: 59924
Doğrulama örnekleri: 8528

Örnek veri:
Text: People start their own businesses for many reasons .
Entities: []


In [None]:
# Configure training settings
from spacy.training import offsets_to_biluo_tags

# Initialise the pipeline weights. `initialize` is the spaCy v3 name;
# `begin_training` is its deprecated alias. It returns the optimizer.
optimizer = nlp.initialize()
batch_size = 16
epochs = 30

# Build the Example objects once instead of re-tokenising every text for
# every batch of every epoch. Shuffling this list also leaves `train_data`
# itself in its original order for the other cells.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]

# Training loop
print("Starting training...")
for epoch in range(epochs):
    random.shuffle(train_examples)
    losses = {}  # reset per epoch so the printed value covers this epoch only

    # Training in mini-batches
    batches = [
        train_examples[i:i + batch_size]
        for i in range(0, len(train_examples), batch_size)
    ]

    with tqdm(total=len(batches), desc=f"Epoch {epoch+1}/{epochs}") as pbar:
        for batch in batches:
            # Train on batch, passing the optimizer explicitly.
            nlp.update(
                batch,
                sgd=optimizer,
                drop=0.5,  # dropout
                losses=losses
            )
            pbar.update(1)

    # Print loss at the end of epoch
    print(f"Epoch {epoch+1} losses:", losses)

    # Save model checkpoint after each epoch
    checkpoint_dir = DATA_DIR / 'model_checkpoints' / f'checkpoint_epoch_{epoch+1}'
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(str(checkpoint_dir))
    print(f"Checkpoint saved: {checkpoint_dir}")

print("\nTraining completed!")

In [None]:
# Model evaluation
def evaluate_model(model, examples):
    """Score predicted entity spans against gold spans using exact matching.

    A prediction is correct only when its ``(start_char, end_char, label)``
    triple equals a gold triple for the same text. Counts are accumulated over
    all examples (micro-averaged) before the metrics are derived.

    Args:
        model: Callable taking a text and returning an object whose ``ents``
            items expose ``start_char``, ``end_char`` and ``label_``.
        examples: Iterable of ``(text, annotations)`` pairs where
            ``annotations['entities']`` holds hashable span triples.

    Returns:
        dict: ``'precision'``, ``'recall'`` and ``'f1'``. Each is ``0`` when
        its denominator would be zero.
    """
    matched = spurious = missed = 0

    for text, annotations in examples:
        doc = model(text)
        gold_spans = annotations['entities']
        predicted = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
        gold = set(gold_spans)

        matched += len(gold & predicted)     # in both sets
        spurious += len(predicted - gold)    # predicted but not gold
        missed += len(gold - predicted)      # gold but not predicted

    predicted_total = matched + spurious
    gold_total = matched + missed

    precision = matched / predicted_total if predicted_total > 0 else 0
    recall = matched / gold_total if gold_total > 0 else 0

    pr_sum = precision + recall
    f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

    return {'precision': precision, 'recall': recall, 'f1': f1}

# Score the trained pipeline on the validation data and report the metrics
# to four decimal places.
metrics = evaluate_model(nlp, valid_data)
precision, recall, f1 = metrics['precision'], metrics['recall'], metrics['f1']

print("\nModel Performance:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Test on sample texts
def test_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Sample sentences used for a quick qualitative check of the pipeline.
test_texts = [
    "Apple CEO Tim Cook introduced the new iPhone model at a conference held in San Francisco.",
    "Microsoft announced its quarterly earnings in January 2024."
]

print("Model test results:\n")
for sample in test_texts:
    print(f"Text: {sample}")
    found = test_ner(sample)
    print("Found entities:")
    for span_text, span_label in found:
        print(f"  {span_text}: {span_label}")
    # Separator line between samples.
    print("-" * 50)

In [None]:
# Save final model
# The pipeline is written to <DATA_DIR>/spacy_ner_model. The directory is
# created first (including parents) and an existing one is reused.
output_dir = DATA_DIR / "spacy_ner_model"
output_dir.mkdir(parents=True, exist_ok=True)
nlp.to_disk(str(output_dir))
print(f"\nModel saved to directory: {output_dir}")