# Train a spaCy NER Model for Video Games

In this notebook we use the annotated video game Wikipedia corpus to train a spaCy model for video game entity recognition.

### Prepare Training data

In [135]:
import json
import os

In [136]:
# Directory holding the annotated training files; every file in it is read as JSON below.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
NER_DIR = "/home/pmuehleder/data/wikipedia/ner_train"

In [137]:
# Load the training dataset: each file in NER_DIR is expected to hold a JSON list
# of records, which are concatenated into one flat list.
# os.listdir returns names in arbitrary order, so the listing is sorted to make the
# order of `sents` (and any positional slice or index taken from it later)
# reproducible across runs and machines.
sents = []
for filename in sorted(os.listdir(NER_DIR)):
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(os.path.join(NER_DIR, filename), encoding="utf-8") as f:
        sents.extend(json.load(f))

In [138]:
# Sanity check: total number of records loaded from all files.
len(sents)

9446

In [122]:
# Inspect one raw record: a dict with the 'text' and a list of 'annotations'
# (character 'start'/'end' offsets, entity 'name' and 'type').
sents[460]

{'annotations': [{'end': 172,
   'name': 'Final Fantasy IV',
   'start': 156,
   'type': 'Game'},
  {'end': 457,
   'name': 'Final Fantasy IV: The Complete Collection',
   'start': 416,
   'type': 'Game'}],
 'text': "'Final Fantasy IV: The After Years' is an episodic role-playing video game co-developed by Matrix Software and Square Enix, as the sequel to the 1991 title Final Fantasy IV. Originally released in Japan as a mobile game in 2008, an enhanced WiiWare port of the title was released in North America, Europe and Japan in 2009. In 2011, the game was bundled with Final Fantasy IV as the PlayStation Portable compilation Final Fantasy IV: The Complete Collection, which also included a new game; Final Fantasy IV: Interlude, which served as a bridge between the original game and The After Years. Using the same style as the Nintendo DS version of Final Fantasy IV, this game was remade for the Android and iOS platforms."}

In [34]:
# Convert every record into a (text, {'entities': [(start, end, label), ...]}) tuple;
# all annotations of a record are mapped to the single "VIDEOGAME" label.
TRAIN_DATA = [
    (
        record["text"],
        {
            'entities': [
                (ann["start"], ann["end"], "VIDEOGAME")
                for ann in record["annotations"]
            ]
        },
    )
    for record in sents
]

In [35]:
# Inspect one converted training example: (text, {'entities': [(start, end, label)]}).
TRAIN_DATA[3]

('Kaboom! is one of the titles included on the Activision Anthology compilation.',
 {'entities': [(45, 65, 'VIDEOGAME')]})

# Train spacy model

In [52]:
import spacy
import random
from tqdm import tqdm

In [40]:
# Entity label that is registered on the NER component further down.
LABEL = "VIDEOGAME"
# Directory the trained model is written to at the end of the notebook.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
OUT_DIR = "/home/pmuehleder/data/wikipedia/models"

In [77]:
# Target directory for the saved model.
output_dir = OUT_DIR
# Number of passes over the training examples.
n_iter = 20

# Train on the first 3000 examples only. The slice is a new list, so shuffling it
# during training leaves the order of TRAIN_DATA untouched.
train_data = TRAIN_DATA[:3000]

In [78]:
# Start from a blank English pipeline.
nlp = spacy.blank("en")

In [79]:
# Reuse the pipeline's NER component when it already has one;
# otherwise create a new one and append it to the pipeline.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)

In [80]:
# Register the new entity label with the NER component.
ner.add_label(LABEL)

In [81]:
# Train the NER component on the new entity label.
# Every pipeline component other than 'ner' is disabled for the duration of training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    # Defined up front so the final print still works when n_iter is 0.
    losses = {}
    for itn in tqdm(range(n_iter)):
        # Present the examples in a fresh random order on every pass.
        random.shuffle(train_data)
        # nlp.update accumulates into this dict, so it is reset for each pass.
        losses = {}
        for text, annotations in train_data:
            # One example per update step, with a dropout rate of 0.35.
            nlp.update([text], [annotations], sgd=optimizer, drop=0.35, losses=losses)
        # Report the loss of every pass as soon as it finishes, so convergence can be
        # followed during the long run; tqdm.write avoids garbling the progress bar.
        tqdm.write("iteration {}/{}: {}".format(itn + 1, n_iter, losses))
    # Accumulated loss of the final pass.
    print(losses)

100%|██████████| 20/20 [2:15:29<00:00, 406.45s/it]  

{'ner': 2622.976525583266}





In [82]:
# Save the trained pipeline to disk, recording "videogame" as the model name in its meta data.
nlp.meta["name"] = "videogame"
nlp.to_disk(output_dir)