# POI-9
- ET-3

In [1]:
# Import required modules
import os
import pickle
import re
import warnings

import numpy as np
import pandas as pd
import spacy
from spacy.util import minibatch, compounding
from tqdm.notebook import tqdm

# Notebook-wide display options: wider text columns, longer table previews
pd.set_option('display.max_colwidth', 80)
pd.set_option('display.max_rows', 200)

# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Read the raw training table
df = pd.read_csv('train.csv')

# The label column is "<POI>/<street>"; keep the part before the first '/'
poi_street_parts = df['POI/street'].str.split('/', expand=True)
df['poi'] = poi_street_parts[0]

# Restore the prepared training data from disk
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files
with open('training_data/poi-5-et3.pkl', 'rb') as file:
    data_train = pickle.load(file)

## Initialise spaCy Model

In [3]:
# Configure training
N_ITER = 300

def nlp_train(dat, N_ITER, early_stopping):
    """Train a blank English spaCy pipeline with one NER label, 'POI'.

    Parameters
    ----------
    dat : iterable
        Training examples as ``(text, annotations)`` pairs. A shallow copy is
        shuffled on every pass, so the caller's object keeps its order.
    N_ITER : int
        Maximum number of passes over the training examples.
    early_stopping : int
        Patience: training stops as soon as this many consecutive passes
        have failed to beat the lowest NER loss seen so far.

    Returns
    -------
    tuple
        ``(nlp, losses)`` — the trained pipeline and a DataFrame holding the
        loss dict of every completed pass (one row per pass).
    """
    # Prefer GPU
    spacy.prefer_gpu()

    # Blank pipeline whose only component is the NER with the 'POI' label
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
    ner.add_label('POI')

    # Create optimiser
    # NOTE(review): device=0 presumably targets the first GPU — confirm
    # the behaviour on CPU-only hosts.
    optimizer = nlp.begin_training(device=0)
    optimizer.learn_rate = 0.001

    # Work on a copy: np.random.shuffle reorders its argument in place
    examples = list(dat)

    best_loss = np.inf
    no_improvement = 0
    all_losses = []

    for _ in tqdm(range(N_ITER)):
        np.random.shuffle(examples)
        losses = {}
        batches = minibatch(
            examples,
            size=compounding(500.0, 2000.0, 1.001)
        )
        for batch in tqdm(batches):
            text, annotations = zip(*batch)
            nlp.update(
                text,
                annotations,
                drop=0.3,
                sgd=optimizer,
                losses=losses
            )

        print(f"NER Loss: {losses['ner']}")

        # Log every completed pass, including the one that triggers the stop
        all_losses.append(losses)

        # Update the patience counter with THIS pass before testing it, so
        # training stops after exactly `early_stopping` stale passes
        if losses['ner'] < best_loss:
            best_loss = losses['ner']
            no_improvement = 0
        else:
            no_improvement += 1

        if no_improvement >= early_stopping:
            break

    return nlp, pd.DataFrame(all_losses)

In [4]:
# Train for at most N_ITER passes; the third argument (10) is the early-stopping patience
model, losses = nlp_train(data_train, N_ITER, 10)

  0%|          | 0/300 [00:00<?, ?it/s]

0it [00:00, ?it/s]

NER Loss: 151035.4993596077


0it [00:00, ?it/s]

NER Loss: 106551.51812410355


0it [00:00, ?it/s]

NER Loss: 94563.70832109451


0it [00:00, ?it/s]

NER Loss: 87524.29758501053


0it [00:00, ?it/s]

NER Loss: 82789.29155111313


0it [00:00, ?it/s]

NER Loss: 82798.01507425308


0it [00:00, ?it/s]

NER Loss: 78022.1138830185


0it [00:00, ?it/s]

NER Loss: 75336.06128489971


0it [00:00, ?it/s]

NER Loss: 73337.22736215591


0it [00:00, ?it/s]

NER Loss: 71893.3647646904


0it [00:00, ?it/s]

NER Loss: 70477.19001555443


0it [00:00, ?it/s]

NER Loss: 69089.18301510811


0it [00:00, ?it/s]

NER Loss: 67072.47083759308


0it [00:00, ?it/s]

NER Loss: 66810.34185242653


0it [00:00, ?it/s]

NER Loss: 65246.27373290062


0it [00:00, ?it/s]

NER Loss: 65420.531306385994


0it [00:00, ?it/s]

NER Loss: 65430.44098317623


0it [00:00, ?it/s]

NER Loss: 63898.033774375916


0it [00:00, ?it/s]

NER Loss: 63309.421286821365


0it [00:00, ?it/s]

NER Loss: 62178.1648581028


0it [00:00, ?it/s]

NER Loss: 61636.02363282442


0it [00:00, ?it/s]

NER Loss: 61001.92452502251


0it [00:00, ?it/s]

NER Loss: 60336.806910157204


0it [00:00, ?it/s]

NER Loss: 60013.78205871582


0it [00:00, ?it/s]

NER Loss: 59580.94485592842


0it [00:00, ?it/s]

NER Loss: 58588.178179860115


0it [00:00, ?it/s]

NER Loss: 58083.789167404175


0it [00:00, ?it/s]

NER Loss: 57840.633401453495


0it [00:00, ?it/s]

NER Loss: 58192.64032334089


0it [00:00, ?it/s]

NER Loss: 57543.497377216816


0it [00:00, ?it/s]

NER Loss: 57017.65342307091


0it [00:00, ?it/s]

NER Loss: 56307.13642036915


0it [00:00, ?it/s]

NER Loss: 56294.86983919144


0it [00:00, ?it/s]

NER Loss: 55535.0337074101


0it [00:00, ?it/s]

NER Loss: 55757.68563890457


0it [00:00, ?it/s]

NER Loss: 54947.78615784645


0it [00:00, ?it/s]

NER Loss: 54841.05814898014


0it [00:00, ?it/s]

NER Loss: 53936.094769358635


0it [00:00, ?it/s]

NER Loss: 53961.32166582346


0it [00:00, ?it/s]

NER Loss: 53933.256591677666


0it [00:00, ?it/s]

NER Loss: 54283.62296140194


0it [00:00, ?it/s]

NER Loss: 53761.82596448064


0it [00:00, ?it/s]

NER Loss: 53440.82382392883


0it [00:00, ?it/s]

NER Loss: 53195.60938286781


0it [00:00, ?it/s]

NER Loss: 52926.11120951176


0it [00:00, ?it/s]

NER Loss: 52857.21766534448


0it [00:00, ?it/s]

NER Loss: 52375.54772877693


0it [00:00, ?it/s]

NER Loss: 52297.18665468693


0it [00:00, ?it/s]

NER Loss: 51732.6414565742


0it [00:00, ?it/s]

NER Loss: 51634.30935525894


0it [00:00, ?it/s]

NER Loss: 52080.12614482641


0it [00:00, ?it/s]

NER Loss: 51532.21825516224


0it [00:00, ?it/s]

NER Loss: 51269.700967252254


0it [00:00, ?it/s]

NER Loss: 50843.901606082916


0it [00:00, ?it/s]

NER Loss: 50993.42276850343


0it [00:00, ?it/s]

NER Loss: 50407.74461436272


0it [00:00, ?it/s]

NER Loss: 50194.186902701855


0it [00:00, ?it/s]

NER Loss: 50236.01281797886


0it [00:00, ?it/s]

NER Loss: 50244.58958423138


0it [00:00, ?it/s]

NER Loss: 49903.115661501884


0it [00:00, ?it/s]

NER Loss: 49850.738716602325


0it [00:00, ?it/s]

NER Loss: 49277.31911838055


0it [00:00, ?it/s]

NER Loss: 49344.54074662924


0it [00:00, ?it/s]

NER Loss: 49464.813209593296


0it [00:00, ?it/s]

NER Loss: 48757.197429060936


0it [00:00, ?it/s]

NER Loss: 48657.62051087618


0it [00:00, ?it/s]

NER Loss: 48662.45110440254


0it [00:00, ?it/s]

NER Loss: 48363.924767017365


0it [00:00, ?it/s]

NER Loss: 48088.13528487086


0it [00:00, ?it/s]

NER Loss: 48092.723262906075


0it [00:00, ?it/s]

NER Loss: 47988.827154785395


0it [00:00, ?it/s]

NER Loss: 48091.43518167734


0it [00:00, ?it/s]

NER Loss: 48084.86313307285


0it [00:00, ?it/s]

NER Loss: 47820.56253170967


0it [00:00, ?it/s]

NER Loss: 47653.81975996494


0it [00:00, ?it/s]

NER Loss: 47415.32926303148


0it [00:00, ?it/s]

NER Loss: 47508.558946728706


0it [00:00, ?it/s]

NER Loss: 47687.35799974203


0it [00:00, ?it/s]

NER Loss: 47006.88519048691


0it [00:00, ?it/s]

NER Loss: 46658.446940779686


0it [00:00, ?it/s]

NER Loss: 46567.19690537453


0it [00:00, ?it/s]

NER Loss: 47054.39720803499


0it [00:00, ?it/s]

NER Loss: 47025.11283406615


0it [00:00, ?it/s]

NER Loss: 46534.3088644743


0it [00:00, ?it/s]

NER Loss: 46263.41496577859


0it [00:00, ?it/s]

NER Loss: 46027.91851866245


0it [00:00, ?it/s]

NER Loss: 46591.44276612997


0it [00:00, ?it/s]

NER Loss: 46394.52002221346


0it [00:00, ?it/s]

NER Loss: 45864.90151667595


0it [00:00, ?it/s]

NER Loss: 45941.00642502308


0it [00:00, ?it/s]

NER Loss: 45938.71930330992


0it [00:00, ?it/s]

NER Loss: 45805.9185975194


0it [00:00, ?it/s]

NER Loss: 45472.64601185918


0it [00:00, ?it/s]

NER Loss: 45782.837633132935


0it [00:00, ?it/s]

NER Loss: 45250.29188686609


0it [00:00, ?it/s]

NER Loss: 45384.94148135185


0it [00:00, ?it/s]

NER Loss: 45701.35080091655


0it [00:00, ?it/s]

NER Loss: 45037.24941974878


0it [00:00, ?it/s]

NER Loss: 45165.9881695658


0it [00:00, ?it/s]

NER Loss: 45003.47063652426


0it [00:00, ?it/s]

NER Loss: 45060.7488642782


0it [00:00, ?it/s]

NER Loss: 44588.07041990757


0it [00:00, ?it/s]

NER Loss: 44486.1254619956


0it [00:00, ?it/s]

NER Loss: 44963.993746474385


0it [00:00, ?it/s]

NER Loss: 44357.39338439703


0it [00:00, ?it/s]

NER Loss: 44579.18489688635


0it [00:00, ?it/s]

NER Loss: 44121.907596588135


0it [00:00, ?it/s]

NER Loss: 44168.41449570656


0it [00:00, ?it/s]

NER Loss: 43776.51948839426


0it [00:00, ?it/s]

NER Loss: 44596.064805194736


0it [00:00, ?it/s]

NER Loss: 44225.59891445935


0it [00:00, ?it/s]

NER Loss: 44520.2166287601


0it [00:00, ?it/s]

NER Loss: 43760.27790045738


0it [00:00, ?it/s]

NER Loss: 43637.79362279177


0it [00:00, ?it/s]

NER Loss: 43746.765719771385


0it [00:00, ?it/s]

NER Loss: 43530.92385292053


0it [00:00, ?it/s]

NER Loss: 43469.45649826527


0it [00:00, ?it/s]

NER Loss: 43435.99737340212


0it [00:00, ?it/s]

NER Loss: 43195.15932968259


0it [00:00, ?it/s]

NER Loss: 42686.91748346388


0it [00:00, ?it/s]

NER Loss: 43137.8381973505


0it [00:00, ?it/s]

NER Loss: 43108.80121847987


0it [00:00, ?it/s]

NER Loss: 42810.63225609064


0it [00:00, ?it/s]

NER Loss: 43034.592143028975


0it [00:00, ?it/s]

NER Loss: 42958.95924168825


0it [00:00, ?it/s]

NER Loss: 42793.41516029835


0it [00:00, ?it/s]

NER Loss: 42976.26204535365


0it [00:00, ?it/s]

NER Loss: 42788.47618409991


0it [00:00, ?it/s]

NER Loss: 42407.75560709834


0it [00:00, ?it/s]

NER Loss: 42542.50488418341


0it [00:00, ?it/s]

NER Loss: 42542.43311715126


0it [00:00, ?it/s]

NER Loss: 42641.75442773104


0it [00:00, ?it/s]

NER Loss: 42628.669369757175


0it [00:00, ?it/s]

NER Loss: 42447.348999261856


0it [00:00, ?it/s]

NER Loss: 42075.444044813514


0it [00:00, ?it/s]

NER Loss: 42510.75864446163


0it [00:00, ?it/s]

NER Loss: 41875.72164720297


0it [00:00, ?it/s]

NER Loss: 41950.58252578974


0it [00:00, ?it/s]

NER Loss: 41891.7556213215


0it [00:00, ?it/s]

NER Loss: 41968.241988033056


0it [00:00, ?it/s]

NER Loss: 41942.235080599785


0it [00:00, ?it/s]

NER Loss: 41973.92354339361


0it [00:00, ?it/s]

NER Loss: 41393.36966216564


0it [00:00, ?it/s]

NER Loss: 41699.057162463665


0it [00:00, ?it/s]

NER Loss: 41383.58850288391


0it [00:00, ?it/s]

NER Loss: 41441.97539305687


0it [00:00, ?it/s]

NER Loss: 41082.90503755212


0it [00:00, ?it/s]

NER Loss: 41169.81649708748


0it [00:00, ?it/s]

NER Loss: 41485.19834971428


0it [00:00, ?it/s]

NER Loss: 41632.51943080127


0it [00:00, ?it/s]

NER Loss: 41032.86197531223


0it [00:00, ?it/s]

NER Loss: 41490.19726604223


0it [00:00, ?it/s]

NER Loss: 41010.65044990182


0it [00:00, ?it/s]

NER Loss: 41083.21658009291


0it [00:00, ?it/s]

NER Loss: 41056.01275604963


0it [00:00, ?it/s]

NER Loss: 40604.90560597181


0it [00:00, ?it/s]

NER Loss: 40703.990614682436


0it [00:00, ?it/s]

NER Loss: 41184.19233930111


0it [00:00, ?it/s]

NER Loss: 40573.32297211513


0it [00:00, ?it/s]

NER Loss: 40884.95852050185


0it [00:00, ?it/s]

NER Loss: 41040.836894363165


0it [00:00, ?it/s]

NER Loss: 40831.27245104313


0it [00:00, ?it/s]

NER Loss: 40725.60165400803


0it [00:00, ?it/s]

NER Loss: 40719.816217303276


0it [00:00, ?it/s]

NER Loss: 40053.44318366051


0it [00:00, ?it/s]

NER Loss: 40348.49982847273


0it [00:00, ?it/s]

NER Loss: 40352.42564949393


0it [00:00, ?it/s]

NER Loss: 40699.46646386385


0it [00:00, ?it/s]

NER Loss: 40405.908837884665


0it [00:00, ?it/s]

NER Loss: 39845.39409315586


0it [00:00, ?it/s]

NER Loss: 39919.39559967816


0it [00:00, ?it/s]

NER Loss: 40065.951096594334


0it [00:00, ?it/s]

NER Loss: 39546.48545333743


0it [00:00, ?it/s]

NER Loss: 39852.028211236


0it [00:00, ?it/s]

NER Loss: 39916.22830244899


0it [00:00, ?it/s]

NER Loss: 40226.618235941976


0it [00:00, ?it/s]

NER Loss: 40110.56768811494


0it [00:00, ?it/s]

NER Loss: 40143.92957538366


0it [00:00, ?it/s]

NER Loss: 39619.24209475517


0it [00:00, ?it/s]

NER Loss: 39800.36791932583


0it [00:00, ?it/s]

NER Loss: 39576.47683995962


0it [00:00, ?it/s]

NER Loss: 39772.79196688533


0it [00:00, ?it/s]

NER Loss: 39326.96015647054


0it [00:00, ?it/s]

NER Loss: 39749.124997258186


0it [00:00, ?it/s]

NER Loss: 39713.8455632925


0it [00:00, ?it/s]

NER Loss: 39715.65544992685


0it [00:00, ?it/s]

NER Loss: 39340.21746778488


0it [00:00, ?it/s]

NER Loss: 39662.19532878697


0it [00:00, ?it/s]

NER Loss: 39384.920057594776


0it [00:00, ?it/s]

NER Loss: 39485.24426281452


0it [00:00, ?it/s]

NER Loss: 39408.00271952152


0it [00:00, ?it/s]

NER Loss: 39577.66425189376


0it [00:00, ?it/s]

NER Loss: 39114.89815193415


0it [00:00, ?it/s]

NER Loss: 39141.86602076888


0it [00:00, ?it/s]

NER Loss: 38968.36008191109


0it [00:00, ?it/s]

NER Loss: 39018.218765586615


0it [00:00, ?it/s]

NER Loss: 38850.29924309254


0it [00:00, ?it/s]

NER Loss: 39076.40092381835


0it [00:00, ?it/s]

NER Loss: 38996.975058078766


0it [00:00, ?it/s]

NER Loss: 39149.00345060229


0it [00:00, ?it/s]

NER Loss: 38638.91381165385


0it [00:00, ?it/s]

NER Loss: 39176.95450672507


0it [00:00, ?it/s]

NER Loss: 38703.82399317622


0it [00:00, ?it/s]

NER Loss: 38823.08366815001


0it [00:00, ?it/s]

NER Loss: 38781.15494956076


0it [00:00, ?it/s]

NER Loss: 38728.94631573558


0it [00:00, ?it/s]

NER Loss: 38913.07231545448


0it [00:00, ?it/s]

NER Loss: 38673.05962818861


0it [00:00, ?it/s]

NER Loss: 38759.976555883884


0it [00:00, ?it/s]

NER Loss: 38624.146294385195


0it [00:00, ?it/s]

NER Loss: 38619.1814032197


0it [00:00, ?it/s]

NER Loss: 38476.96019347012


0it [00:00, ?it/s]

NER Loss: 38441.04058563709


0it [00:00, ?it/s]

NER Loss: 38402.93265196681


0it [00:00, ?it/s]

NER Loss: 38521.765998095274


0it [00:00, ?it/s]

NER Loss: 38634.37345033884


0it [00:00, ?it/s]

NER Loss: 38370.44370958209


0it [00:00, ?it/s]

NER Loss: 38477.6578951776


0it [00:00, ?it/s]

NER Loss: 38324.65358868241


0it [00:00, ?it/s]

NER Loss: 38266.292306140065


0it [00:00, ?it/s]

NER Loss: 38081.74881392717


0it [00:00, ?it/s]

NER Loss: 38090.830759331584


0it [00:00, ?it/s]

NER Loss: 37857.9144016169


0it [00:00, ?it/s]

NER Loss: 38056.54835510254


0it [00:00, ?it/s]

NER Loss: 38220.372847020626


0it [00:00, ?it/s]

NER Loss: 37686.44927281141


0it [00:00, ?it/s]

NER Loss: 37693.20906877518


0it [00:00, ?it/s]

NER Loss: 37786.221274882555


0it [00:00, ?it/s]

NER Loss: 37790.142194479704


0it [00:00, ?it/s]

NER Loss: 37652.52229768038


0it [00:00, ?it/s]

NER Loss: 37866.23686206341


0it [00:00, ?it/s]

NER Loss: 37867.431133151054


0it [00:00, ?it/s]

NER Loss: 37550.66228455305


0it [00:00, ?it/s]

NER Loss: 37992.90934801847


0it [00:00, ?it/s]

NER Loss: 37937.577074825764


0it [00:00, ?it/s]

NER Loss: 37795.47509711981


0it [00:00, ?it/s]

NER Loss: 37734.78297957778


0it [00:00, ?it/s]

NER Loss: 37553.92319959402


0it [00:00, ?it/s]

NER Loss: 37431.250716239214


0it [00:00, ?it/s]

NER Loss: 37329.89446353912


0it [00:00, ?it/s]

NER Loss: 37229.10836750269


0it [00:00, ?it/s]

NER Loss: 37487.1497053504


0it [00:00, ?it/s]

NER Loss: 37673.78578346968


0it [00:00, ?it/s]

NER Loss: 37397.39598917961


0it [00:00, ?it/s]

NER Loss: 37552.7552023381


0it [00:00, ?it/s]

NER Loss: 37328.65487791598


0it [00:00, ?it/s]

NER Loss: 37097.58583243191


0it [00:00, ?it/s]

NER Loss: 37450.3614679873


0it [00:00, ?it/s]

NER Loss: 37376.733900904655


0it [00:00, ?it/s]

NER Loss: 37267.27497222088


0it [00:00, ?it/s]

NER Loss: 37299.33589744568


0it [00:00, ?it/s]

NER Loss: 37068.56867837906


0it [00:00, ?it/s]

NER Loss: 37330.06866842508


0it [00:00, ?it/s]

NER Loss: 36818.379946917295


0it [00:00, ?it/s]

NER Loss: 36930.869483709335


0it [00:00, ?it/s]

NER Loss: 36828.873002529144


0it [00:00, ?it/s]

NER Loss: 37040.07736814022


0it [00:00, ?it/s]

NER Loss: 36874.36979177594


0it [00:00, ?it/s]

NER Loss: 36940.06986400485


0it [00:00, ?it/s]

NER Loss: 36903.627212405205


0it [00:00, ?it/s]

NER Loss: 37098.80882374197


0it [00:00, ?it/s]

NER Loss: 37013.013229072094


0it [00:00, ?it/s]

NER Loss: 36588.70508906618


0it [00:00, ?it/s]

NER Loss: 36620.10859525204


0it [00:00, ?it/s]

NER Loss: 36702.46385836601


0it [00:00, ?it/s]

NER Loss: 36660.59601583332


0it [00:00, ?it/s]

NER Loss: 36805.11048105359


0it [00:00, ?it/s]

NER Loss: 36738.13179038465


0it [00:00, ?it/s]

NER Loss: 36633.931630939245


0it [00:00, ?it/s]

NER Loss: 36641.86126100272


0it [00:00, ?it/s]

NER Loss: 36522.6586907655


0it [00:00, ?it/s]

NER Loss: 36711.41562333703


0it [00:00, ?it/s]

NER Loss: 36374.29920647293


0it [00:00, ?it/s]

NER Loss: 36786.68952295184


0it [00:00, ?it/s]

NER Loss: 36390.49623632431


0it [00:00, ?it/s]

NER Loss: 36671.15628564358


0it [00:00, ?it/s]

NER Loss: 36238.15839013457


0it [00:00, ?it/s]

NER Loss: 36393.87537680566


0it [00:00, ?it/s]

NER Loss: 36133.889384649694


0it [00:00, ?it/s]

NER Loss: 36614.23965507746


0it [00:00, ?it/s]

NER Loss: 36402.126871705055


0it [00:00, ?it/s]

NER Loss: 36328.36373576522


0it [00:00, ?it/s]

NER Loss: 36097.285271406174


0it [00:00, ?it/s]

NER Loss: 36267.2390601933


0it [00:00, ?it/s]

NER Loss: 35994.681202828884


0it [00:00, ?it/s]

NER Loss: 36145.687381491065


0it [00:00, ?it/s]

NER Loss: 35647.51537903026


0it [00:00, ?it/s]

NER Loss: 36093.694907188416


0it [00:00, ?it/s]

NER Loss: 35961.23419852555


0it [00:00, ?it/s]

NER Loss: 35787.22582793236


0it [00:00, ?it/s]

NER Loss: 35859.42261594534


0it [00:00, ?it/s]

NER Loss: 36062.57434979081


0it [00:00, ?it/s]

NER Loss: 36021.00970778987


0it [00:00, ?it/s]

NER Loss: 36059.141882389784


0it [00:00, ?it/s]

NER Loss: 35565.81542447209


0it [00:00, ?it/s]

NER Loss: 35961.64190304279


0it [00:00, ?it/s]

NER Loss: 36012.898213759065


0it [00:00, ?it/s]

NER Loss: 35949.38183212653


0it [00:00, ?it/s]

NER Loss: 36309.80254402757


0it [00:00, ?it/s]

NER Loss: 35978.57386884093


In [5]:
# Save model and the per-pass training losses
# Create the output folders first so the save cannot fail on a missing directory
os.makedirs('models/poi-9', exist_ok=True)
os.makedirs('training_logs', exist_ok=True)
model.to_disk('models/poi-9/')
losses.to_csv('training_logs/poi-9.csv', index=False)

## Prediction

In [6]:
# Load the test set that the trained model is run on below
test = pd.read_csv('test_data/test_poi-et3.csv')

In [7]:
# Run the trained model over every test address.
# docs: the input address for each prediction (taken from `test`, not the
#       training frame `df`, so it lines up row-for-row with `ents`)
# ents: first entity text found in each address, or '' when none is found
docs = []
ents = []

for address in tqdm(test.raw_address):
    parsed = model(address)
    docs.append(address)
    found = [ent.text for ent in parsed.ents]
    ents.append(found[0] if found else '')

  0%|          | 0/50000 [00:00<?, ?it/s]

In [8]:
# Attach the predicted POI to a copy of the test frame and write the submission
preds = test.assign(poi=ents)

# Make sure the target folder exists before writing
os.makedirs('submissions/raw', exist_ok=True)
preds.to_csv('submissions/raw/poi9.csv', index=False)