# STREET-D
- Batch size compounds from 500 up to a maximum of 2000 (growth factor 1.001 per batch)

In [1]:
# Import required modules
import numpy as np
import pandas as pd
import pickle
import re
import spacy
import warnings

from spacy.util import minibatch, compounding
from tqdm.notebook import tqdm

# Notebook-wide display and warning settings
pd.set_option('display.max_colwidth', 80)  # cap the rendered width of text columns
pd.set_option('display.max_rows', 200)     # show up to 200 rows before eliding
warnings.simplefilter('ignore')            # suppress every warning category

## Load Data

In [2]:
# Read the raw training table
df = pd.read_csv('train.csv')

# Split the combined 'POI/street' column on '/' and keep the first piece as
# the POI (presumably the column is formatted "<poi>/<street>" -- confirm).
poi_street_parts = df['POI/street'].str.split(pat='/', expand=True)
df['poi'] = poi_street_parts[0]

# Read the pre-built spaCy training examples.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('training_data/street-et1.pkl', mode='rb') as fh:
    data_train = pickle.load(fh)

## Initialise spaCy Model

In [3]:
# Configure training
N_ITER = 300


def nlp_train(dat, N_ITER, early_stopping):
    """Train a blank English spaCy pipeline with one NER label, ``STREET``.

    Parameters
    ----------
    dat : sequence of (text, annotations) pairs
        Training examples. Each batch is unzipped into texts and annotations
        and handed to ``nlp.update``. The caller's sequence is left untouched.
    N_ITER : int
        Maximum number of passes over ``dat``.
    early_stopping : int
        Patience: training stops as soon as this many consecutive passes have
        failed to beat the lowest NER loss seen so far.

    Returns
    -------
    tuple
        ``(nlp, losses_df)`` where ``nlp`` is the trained pipeline and
        ``losses_df`` is a DataFrame with one row of losses per completed
        pass (including the pass that triggered early stopping).
    """
    # Prefer GPU
    spacy.prefer_gpu()

    # Create blank model
    nlp = spacy.blank('en')

    # Set up pipeline
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

    # Annotations
    ner.add_label('STREET')

    # Create optimiser
    optimizer = nlp.begin_training(device=0)
    optimizer.learn_rate = 0.001

    # Shuffle a shallow copy so the caller's list keeps its order.
    dat = list(dat)

    # Early-stopping state
    best_loss = np.inf
    no_improvement = 0
    all_losses = []

    for _ in tqdm(range(N_ITER)):
        np.random.shuffle(dat)
        losses = {}
        # The size schedule is rebuilt every pass, so each pass starts again
        # at 500 and compounds by 1.001 per batch towards the 2000 cap.
        batches = minibatch(
            dat,
            size=compounding(500.0, 2000.0, 1.001)
        )
        for batch in tqdm(batches):
            text, annotations = zip(*batch)
            nlp.update(
                text,
                annotations,
                drop=0.3,
                sgd=optimizer,
                losses=losses
            )

        print(f"NER Loss: {losses['ner']}")

        # Log every completed pass before deciding whether to stop, so the
        # returned history always matches the number of passes trained.
        all_losses.append(losses)

        # Track the best loss and the number of passes since it improved.
        if losses['ner'] < best_loss:
            best_loss = losses['ner']
            no_improvement = 0
        else:
            no_improvement += 1

        # Stop immediately once patience is exhausted, rather than spending
        # another full pass before noticing.
        if no_improvement >= early_stopping:
            break

    return nlp, pd.DataFrame(all_losses)

In [None]:
# Train for up to N_ITER passes with an early-stopping patience of 10
model, losses = nlp_train(dat=data_train, N_ITER=N_ITER, early_stopping=10)

  0%|          | 0/300 [00:00<?, ?it/s]

0it [00:00, ?it/s]

NER Loss: 347223.9649505615


0it [00:00, ?it/s]

NER Loss: 185084.15272140503


0it [00:00, ?it/s]

NER Loss: 149691.52951431274


0it [00:00, ?it/s]

NER Loss: 133465.14096069336


0it [00:00, ?it/s]

NER Loss: 119501.62750434875


0it [00:00, ?it/s]

NER Loss: 115799.21793746948


0it [00:00, ?it/s]

NER Loss: 111167.82790756226


0it [00:00, ?it/s]

NER Loss: 107499.11125183105


0it [00:00, ?it/s]

NER Loss: 104383.3983669281


0it [00:00, ?it/s]

NER Loss: 100936.32032966614


0it [00:00, ?it/s]

NER Loss: 98898.59268951416


0it [00:00, ?it/s]

NER Loss: 98176.44179058075


0it [00:00, ?it/s]

NER Loss: 95589.51345348358


0it [00:00, ?it/s]

NER Loss: 94231.97088050842


0it [00:00, ?it/s]

NER Loss: 92110.40188980103


0it [00:00, ?it/s]

NER Loss: 89787.72449111938


0it [00:00, ?it/s]

NER Loss: 90794.7285194397


0it [00:00, ?it/s]

NER Loss: 88818.22401046753


0it [00:00, ?it/s]

NER Loss: 89088.14843559265


0it [00:00, ?it/s]

NER Loss: 88876.44227790833


0it [00:00, ?it/s]

In [None]:
# Persist the trained pipeline and the per-pass loss history.
# NOTE(review): both paths are named "street-c" while this notebook is titled
# STREET-D -- confirm this will not overwrite artefacts from another run.
model.to_disk(path='models/street-c/')
losses.to_csv(path_or_buf='training_logs/street-c.csv', index=False)

## Prediction

In [None]:
# Read the test set that predictions are generated for
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Run the trained model over every test address.
# docs -- the raw test address for each row, in test order
# ents -- the first entity text predicted for that address, or '' if none
docs = []
ents = []

for address in tqdm(test.raw_address, total=test.shape[0]):
    temp_doc = model(address)
    # Record the address that was actually scored (from the test set, not the
    # training frame) so docs and ents stay aligned row for row.
    docs.append(address)
    temp_ents = [ent.text for ent in temp_doc.ents]
    # Keep only the first predicted entity; fall back to an empty string.
    ents.append(temp_ents[0] if temp_ents else '')

In [None]:
# Attach the predicted street strings to a copy of the test table and export
preds = test.assign(street=ents)
preds.to_csv(path_or_buf='submissions/raw/streetc.csv', index=False)