In [1]:
pip install spacy pandas numpy jiwer scikit-learn


Collecting spacyNote: you may need to restart the kernel to use updated packages.

  Using cached spacy-3.7.6-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.1.1-cp312-cp312-win_amd64.whl.metadata (59 kB)
Collecting jiwer
  Using cached jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.10-cp312-cp312-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Using cached cymem-2.0.8-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Co

In [3]:
#Donwload the models

!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_trf



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------- ------------------ 6.8/12.8 MB 34.9 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 31.5 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 31.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 17.1 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
     ---------------------------------------- 0.0/587.7 MB ? eta -:--:--
     --------------------------------------- 6.8/587.7 MB 42.0 MB/s 

# 1.0 Using Spacy NER by default wihout Fine_Tuning to get a WER baseline


## 1.1 Using en_core_web_sm as Languaje Model 

In [20]:
# Import the required libraries
import pandas as pd
import spacy
import re
import string


file_path = '../data/Train_1.csv'
data = pd.read_csv(file_path)

# Load the SpaCy language model
model = spacy.load('en_core_web_sm')

def preprocess_tweet(text):
    if pd.isna(text):
        return ""
    # Convert text to lowercase
    #text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (optional, keeping the hashtag content)
    text = re.sub(r'#', '', text)
    # Remove punctuation and numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-ASCII characters (optional)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Remove stopwords and lemmatize (using SpaCy)
    doc = model(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing function to the 'text' column
data['cleaned_text'] = data['text'].apply(preprocess_tweet)

# Function to extract locations (GPE) from text
def extract_locations(text):
    if pd.isna(text):
        return []
    doc = model(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
    
    return ' '.join(set(locations))

# Apply the location extraction to the 'text' column
data['extracted_locations'] = data['cleaned_text'].apply(extract_locations)

# Display the tweets with extracted locations
data[['text', 'extracted_locations']].head()


Unnamed: 0,text,extracted_locations
0,,
1,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,State of emergency declared for Maryland flood...,Maryland
3,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,"Catastrophic Flooding Slams Ellicott City, Mar...",


In [21]:
data[['text', 'extracted_locations']].head(20)

Unnamed: 0,text,extracted_locations
0,,
1,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,State of emergency declared for Maryland flood...,Maryland
3,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,"Catastrophic Flooding Slams Ellicott City, Mar...",
5,WATCH: 1 missing after flash #FLOODING devasta...,Ellicott City Maryland
6,,
7,,
8,,
9,,


### 1.1.1 Calculating WER with en_core_web_sm

In [30]:
from jiwer import wer

# Function to calculate Word Error Rate (WER)
def calculate_wer(row):
    if not row['location'] or not row['extracted_locations']:
        return 1.0  # If either is empty, consider it a complete error
   
    # Use WER function from jiwer package
    return wer(str(row['location']), str(row['extracted_locations']))

# Apply the WER function to compare the 'location' and 'extracted_locations'
data['wer'] = data.apply(calculate_wer, axis=1)

# Display the WER for the first few rows
data[['location', 'extracted_locations', 'wer']].head()

#Get the average WER
average_wer = data['wer'].mean()
print(f"Average Word Error Rate (WER): {average_wer:.4f}")

#Get the accuracy
accuracy = 1 - average_wer  
print(f"Accuracy: {accuracy:.4f}")



Average Word Error Rate (WER): 0.9295
Accuracy: 0.0705


## 1.2 Using en_core_web_lg as Languaje Model 

In [32]:
# Import the required libraries
import pandas as pd
import spacy
import re
import string


file_path = '../data/Train_1.csv'
data = pd.read_csv(file_path)

# Load the SpaCy language model
model = spacy.load('en_core_web_lg')

def preprocess_tweet(text):
    if pd.isna(text):
        return ""
    # Convert text to lowercase
    #text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags  (optional, keeping the hashtag content)
    text = re.sub(r'#', '', text)
    # Remove punctuation and numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-ASCII characters (optional)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Remove stopwords and lemmatize (using SpaCy)
    doc = model(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing function to the 'text' column
data['cleaned_text'] = data['text'].apply(preprocess_tweet)

# Function to extract locations (GPE) from text
def extract_locations(text):
    if pd.isna(text):
        return []
    doc = model(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
    
    return ' '.join(set(locations))

# Apply the location extraction to the 'text' column
data['extracted_locations'] = data['cleaned_text'].apply(extract_locations)

# Display the tweets with extracted locations
data[['text', 'extracted_locations']].head()


Unnamed: 0,text,extracted_locations
0,,
1,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,State of emergency declared for Maryland flood...,Maryland
3,Other parts of Maryland also saw significant d...,Baltimore Dundalk Maryland
4,"Catastrophic Flooding Slams Ellicott City, Mar...",


### 1.2.1 Calculating WER with en_core_web_lg

In [33]:
from jiwer import wer

# Function to calculate Word Error Rate (WER)
def calculate_wer(row):
    if not row['location'] or not row['extracted_locations']:
        return 1.0  # If either is empty, consider it a complete error
   
    # Use WER function from jiwer package
    return wer(str(row['location']), str(row['extracted_locations']))

# Apply the WER function to compare the 'location' and 'extracted_locations'
data['wer'] = data.apply(calculate_wer, axis=1)

# Display the WER for the first few rows
data[['location', 'extracted_locations', 'wer']].head()

#Get the average WER
average_wer = data['wer'].mean()
print(f"Average Word Error Rate (WER): {average_wer:.4f}")

#Get the accuracy
accuracy = 1 - average_wer  
print(f"Accuracy: {accuracy:.4f}")



Average Word Error Rate (WER): 0.9385
Accuracy: 0.0615


# 2.0- Spacy Fine Tuning For NER 


## 2.1 Convert Training Dataset to Spacty Format

In [67]:
import pandas as pd

# Load your CSV file
file_path = '../data/Train_1.csv'
data = pd.read_csv(file_path)

# Define the label for location entities
LABEL = 'GPE'

# Function to extract multiple locations and convert to SpaCy format
def convert_to_spacy_format(row):
    text = row['cleaned_text']
    locations = row['location']  # Multiple locations might be separated by commas
    
    if pd.isna(text) or pd.isna(locations):
        return None
    
    entities = []
    for loc in locations.split(" "):  
        loc = loc.strip()
        
        start = text.lower().find(loc.lower())  # Case-insensitive search
        if start != -1:
            end = start + len(loc)
            print(loc,text, start, end, len(loc))
        
            entities.append((start, end, LABEL))  # Annotate the location as GPE
    
    return (text, {"entities": entities})

def preprocess_tweet(text):
    if pd.isna(text):
        return ""
    # Convert text to lowercase
    #text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags  (optional, keeping the hashtag content)
    text = re.sub(r'#', '', text)
    # Remove punctuation and numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-ASCII characters (optional)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Remove stopwords and lemmatize (using SpaCy)
    doc = model(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing function to the 'text' column
data['cleaned_text'] = data['text'].apply(preprocess_tweet)

# Apply the conversion function to each row
TRAIN_DATA = [convert_to_spacy_format(row) for idx, row in data.iterrows() if convert_to_spacy_format(row) is not None]

# Example of the first converted data item with multiple locations
print(TRAIN_DATA[0])


Maryland flash flood strike Maryland city Sunday wash street toss car like bath toy 19 27 8
Maryland flash flood strike Maryland city Sunday wash street toss car like bath toy 19 27 8
Maryland state emergency declare Maryland flooding   24 32 8
Maryland state emergency declare Maryland flooding   24 32 8
Baltimore part Maryland see significant damage Sundays storm include Baltimore city neighborhood Dundalk Catonsville Rain total span 1 10 inch Maryland   ecflood 59 68 9
Maryland part Maryland see significant damage Sundays storm include Baltimore city neighborhood Dundalk Catonsville Rain total span 1 10 inch Maryland   ecflood 5 13 8
Baltimore part Maryland see significant damage Sundays storm include Baltimore city neighborhood Dundalk Catonsville Rain total span 1 10 inch Maryland   ecflood 59 68 9
Maryland part Maryland see significant damage Sundays storm include Baltimore city neighborhood Dundalk Catonsville Rain total span 1 10 inch Maryland   ecflood 5 13 8
Ellicott Catastrop

In [68]:
TRAIN_DATA

[('', {'entities': []}),
 ('flash flood strike Maryland city Sunday wash street toss car like bath toy',
  {'entities': [(19, 27, 'GPE')]}),
 ('state emergency declare Maryland flooding  ',
  {'entities': [(24, 32, 'GPE')]}),
 ('part Maryland see significant damage Sundays storm include Baltimore city neighborhood Dundalk Catonsville Rain total span 1 10 inch Maryland   ecflood',
  {'entities': [(59, 68, 'GPE'), (5, 13, 'GPE')]}),
 ('Catastrophic Flooding Slams Ellicott City Maryland Water Rescues report   Weather Channel  ',
  {'entities': [(28, 36, 'GPE'), (37, 41, 'GPE'), (42, 50, 'GPE')]}),
 ('watch 1 miss flash flooding devastate Ellicott City Maryland   GPWX',
  {'entities': [(38, 46, 'GPE'), (47, 51, 'GPE'), (52, 60, 'GPE')]}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('', {'entities': []}),
 ('watch live aerial damage historic flash flooding Ellicott City Maryland',
  {

### 2.2 Split Train on Training and Validation 

In [55]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (80% train, 20% validation)
TRAIN_DATA, VALIDATION_DATA = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)


### 2.3 Set Up the Fine-Tuning Pipeline in SpaCy model (en_core_web_sm)


In [81]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
import random

# Load the pre-trained SpaCy model
model = spacy.load("en_core_web_sm")

# Get the NER pipeline component
ner = model.get_pipe("ner")

# Add the new annotation (GPE)
ner.add_label("GPE")

# Function to remove duplicate entities
def remove_duplicate_entities(entities):
    seen = set()
    unique_entities = []
    for start, end, label in entities:
        if (start, end) not in seen:
            unique_entities.append((start, end, label))
            seen.add((start, end))
    return unique_entities
def remove_overlapping_entities(entities):
    # Sort entities by start position and length to prioritize longer entities
    entities = sorted(entities, key=lambda x: (x[0], x[1] - x[0]))
    unique_entities = []
    last_end = -1  # Track the end of the last added entity

    for start, end, label in entities:
        # Add the entity only if it doesn't overlap with the last added entity
        if start >= last_end:
            unique_entities.append((start, end, label))
            last_end = end  # Update the end position

    return unique_entities

def adjust_entity_offsets(text, entities):
    doc = model.make_doc(text)
    valid_entities = []
    
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            valid_entities.append((span.start_char, span.end_char, label))
        else:
            print(f"Misaligned entity: {text[start:end]} in text: {text} | {start}:{end}")
    
    return valid_entities


def calculate_validation_loss(model, validation_data):
    validation_loss = 0
    losses = {}
    
    for text, annotation in validation_data:
        # Create Example objects for validation
        doc = model.make_doc(text)
        example = Example.from_dict(doc, annotation)
        
        # Pass examples to model.update but with sgd=None to prevent updating weights
        model.update([example], sgd=None, drop=0, losses=losses)
    
    # Sum the losses from all validation examples
    validation_loss = sum(losses.values())
    return validation_loss



# Initialize lists to store losses
train_losses = []
val_losses = []

# Disable other pipeline components during training (we only want to train NER)
other_pipes = [pipe for pipe in model.pipe_names if pipe != "ner"]
with model.disable_pipes(*other_pipes):  # Disable other pipes
    optimizer = model.resume_training()
    print(optimizer.learn_rate)
    optimizer.learn_rate = 0.0005  
    losses = {}

    # Training loop
    for itn in range(10):  # Number of epochs
        random.shuffle(TRAIN_DATA)
        batches = minibatch(TRAIN_DATA, size=compounding(16.0, 64.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)

            # Convert (text, annotations) to Example objects
            examples = []
            for text, annotation in zip(texts, annotations):
                annotation['entities'] = adjust_entity_offsets(text, annotation['entities'])
                annotation['entities'] = remove_duplicate_entities(annotation['entities'])
                annotation['entities'] = remove_overlapping_entities(annotation['entities'])
                doc = model.make_doc(text)  
                example = Example.from_dict(doc, annotation)  #
                examples.append(example)

            # Update the model with the batch of Example objects
            model.update(examples, sgd=optimizer, drop=0.35, losses=losses)
        
        print(f"Epoch {itn}: Training Loss: {losses}")
        train_loss = sum(losses.values())
        train_losses.append(train_loss)
        
        val_loss = calculate_validation_loss(model, VALIDATION_DATA)
        val_losses.append(val_loss)

        print(f"Epoch {itn}: Training Loss: {train_loss} | Validation Loss : {val_loss}")

# Save the fine-tuned model
model.to_disk("fine_tuned_ner_model_sm")


0.001
Epoch 0: Training Loss: {'ner': 12109.07999790722}




Epoch 0: Training Loss: 12109.07999790722 | Validation Loss : 2085.565651803055
Epoch 1: Training Loss: {'ner': 21281.172348789554}
Epoch 1: Training Loss: 21281.172348789554 | Validation Loss : 1356.7612054909837
Epoch 2: Training Loss: {'ner': 29812.396038691295}
Epoch 2: Training Loss: 29812.396038691295 | Validation Loss : 922.9060541833636
Epoch 3: Training Loss: {'ner': 37692.40695183549}
Epoch 3: Training Loss: 37692.40695183549 | Validation Loss : 678.9763772265558
Epoch 4: Training Loss: {'ner': 45167.54280000732}
Epoch 4: Training Loss: 45167.54280000732 | Validation Loss : 517.4959729521828
Epoch 5: Training Loss: {'ner': 52271.948658625784}
Epoch 5: Training Loss: 52271.948658625784 | Validation Loss : 414.36701392746295
Epoch 6: Training Loss: {'ner': 59010.49021530068}
Epoch 6: Training Loss: 59010.49021530068 | Validation Loss : 367.6640559752909
Epoch 7: Training Loss: {'ner': 65468.530565695524}
Epoch 7: Training Loss: 65468.530565695524 | Validation Loss : 326.2138359

In [60]:
losses

{'ner': 74999.42562442123}

Testing the Fine-Tuned Model:

In [82]:
# Import the required libraries
import pandas as pd
import spacy
import re
import string


file_path = '../data/Train_1.csv'
data = pd.read_csv(file_path)

# Load the SpaCy language model
model = spacy.load('fine_tuned_ner_model_sm')

def preprocess_tweet(text):
    if pd.isna(text):
        return ""
    # Convert text to lowercase
    #text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove @ mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (optional, keeping the hashtag content)
    text = re.sub(r'#', '', text)
    # Remove punctuation and numbers
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove non-ASCII characters (optional)
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Remove stopwords and lemmatize (using SpaCy)
    doc = model(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)

# Apply preprocessing function to the 'text' column
data['cleaned_text'] = data['text'].apply(preprocess_tweet)

# Function to extract locations (GPE) from text
def extract_locations(text):
    if pd.isna(text):
        return []
    doc = model(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']
    
    return ' '.join(set(locations))

# Apply the location extraction to the 'text' column
data['extracted_locations'] = data['cleaned_text'].apply(extract_locations)

# Display the tweets with extracted locations
data[['text', 'extracted_locations']].head()


Unnamed: 0,text,extracted_locations
0,,
1,"Flash floods struck a Maryland city on Sunday,...",Maryland
2,State of emergency declared for Maryland flood...,Maryland
3,Other parts of Maryland also saw significant d...,Baltimore Maryland
4,"Catastrophic Flooding Slams Ellicott City, Mar...",Maryland Ellicott City


In [83]:
from jiwer import wer

# Function to calculate Word Error Rate (WER)
def calculate_wer(row):
    if not row['location'] or not row['extracted_locations']:
        return 1.0  # If either is empty, consider it a complete error
   
    # Use WER function from jiwer package
    return wer(str(row['location']), str(row['extracted_locations']))

# Apply the WER function to compare the 'location' and 'extracted_locations'
data['wer'] = data.apply(calculate_wer, axis=1)

# Display the WER for the first few rows
data[['location', 'extracted_locations', 'wer']].head()

#Get the average WER
average_wer = data['wer'].mean()
print(f"Average Word Error Rate (WER): {average_wer:.4f}")

#Get the accuracy
accuracy = 1 - average_wer  
print(f"Accuracy: {accuracy:.4f}")



Average Word Error Rate (WER): 0.8875
Accuracy: 0.1125
