### Imports

In [2]:
import spacy
import random
import copy
from spacy.gold import GoldParse, offsets_from_biluo_tags, biluo_tags_from_offsets
from spacy.tokens import Doc
from spacy.pipeline import EntityRecognizer
from spacy.util import minibatch, compounding
import pandas as pd
from collections import defaultdict

### Training

In [None]:
# Start from an empty English pipeline; the entity recogniser is attached below.
nlp = spacy.blank('en')
print('Loaded model')

# Extend the default infix patterns with an extra character class so the
# tokenizer also treats '-', '/' and ',' as infixes.
new_infixes = nlp.Defaults.infixes + (r'''[-\/\,]''',)
compiled_infixes = spacy.util.compile_infix_regex(new_infixes)
nlp.tokenizer.infix_finditer = compiled_infixes.finditer

# Reuse the "ner" component when the pipeline already has one; otherwise
# create it and append it as the last pipeline step.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
print('Added delims')

# Entity types the recogniser has to learn.
entity_labels = (
    'StreetAddDefaulter',
    'CityDefaulter',
    'StateDefaulter',
    'PinCodeDefaulter',
    'NameDefaulter',
    'AmountDefaulter',
    'EvictionDateDefaulter',
    'ThreatDefaulter',
    'DocDateDefaulter',
)
for label in entity_labels:
    ner.add_label(label)
print('Added labels')

In [None]:
# Load the token-level training sheet (one row per token).
# keep_default_na=False matches how the test sheet is read in the testing
# section: tokens whose text looks like a missing-value marker (e.g. "NA")
# stay literal strings instead of becoming NaN.
dat = pd.read_excel(
    '/home/hackathon/data/hackathon_train_set_tabular_2.xlsx',
    keep_default_na=False,
)

# Group words and their BILUO tags by sentence number. Iterating the columns
# directly covers every row, so no hard-coded row count is needed.
sentences = defaultdict(list)
nerv = defaultdict(list)
for sentence_id, word, tag in zip(dat['sentence_num'], dat['word'], dat['ner']):
    sentences[sentence_id].append(word)
    nerv[sentence_id].append(tag)

# Build one (Doc, annotations) pair per sentence. Only sentence ids that
# actually occur in the sheet are visited, so gaps in the numbering neither
# create empty documents nor cause later sentences to be dropped.
train_data = []
skipped = []
for sentence_id in sorted(nerv):
    try:
        doc = Doc(nlp.vocab, sentences[sentence_id])
        offsets = offsets_from_biluo_tags(doc, nerv[sentence_id])
    except Exception as err:
        # Still best-effort (a bad sentence is left out), but record it so
        # data problems are visible instead of silently swallowed.
        skipped.append((sentence_id, repr(err)))
        continue
    train_data.append((doc, {'entities': offsets}))

if skipped:
    print('Skipped', len(skipped), 'sentences; first failure:', skipped[0])

# Hold out the first sentences for evaluation; train on the remainder.
holdout_size = 300
test_data = train_data[:holdout_size]
train_data = train_data[holdout_size:]

(len(test_data), len(train_data))

In [None]:
# Continue training from a previously saved checkpoint rather than from
# scratch. nlp.begin_training() is deliberately not called here.
# NOTE(review): begin_training() presumably re-initialises the model weights,
# which would discard the checkpoint — confirm against the spaCy v2 docs.
# NOTE(review): the Docs in train_data were created with the vocab of whichever
# `nlp` was bound when the data cell ran — confirm that mixing them with this
# freshly loaded pipeline is intended.
nlp = spacy.load('./nlpv5')

n_iterations = 50
dropout = 0.05

# Only the entity recogniser should receive updates. Any other component in
# the loaded pipeline is disabled for the duration of training and restored
# automatically when the `with` block exits, i.e. before the model is saved.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    for itn in range(n_iterations):
        random.shuffle(train_data)
        losses = {}
        # The batch-size schedule is recreated each epoch, so every epoch
        # starts at 256 and grows by a factor of 1.002 per batch up to 1700.
        batches = minibatch(train_data, size=compounding(256.0, 1700.0, 1.002))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=dropout,
                losses=losses,
            )
        print(itn, "Losses", losses)

nlp.to_disk('./nlpv6')

### Testing

In [35]:
test_file = '/home/hackathon/other/hackathon_test_set_tabular_2_wo_out.xlsx'

# keep_default_na=False keeps every cell as read, so tokens whose text looks
# like a missing-value marker are not converted to NaN.
testing = pd.read_excel(test_file, keep_default_na=False)
xls_len = len(testing.index)

# Predictions are later flattened sentence by sentence and written back onto
# `testing` by position, so the rows must already be ordered by sentence.
assert testing['sentence_num'].is_monotonic_increasing, (
    "rows of the test sheet are not ordered by sentence_num"
)

# Group the tokens of each sentence, preserving their order within the sheet.
sentences = defaultdict(list)
for sentence_id, word in zip(testing['sentence_num'], testing['word']):
    sentences[sentence_id].append(word)

# One token list per sentence, in ascending sentence order. Only ids that are
# present in the sheet are used, so gaps in the numbering cannot introduce
# empty sentences or drop trailing ones.
final_testing_data = [sentences[sentence_id] for sentence_id in sorted(sentences)]

# Every row of the sheet must be represented exactly once.
assert sum(len(words) for words in final_testing_data) == xls_len

final_testing_data

[["SO'd", 'ib101'],
 ['r', '-', 'PC(Lte31'],
 ['-', 'J4i', '(', 'UCAA', '-', 'A', '--', 'C', ')', 'W'],
 ['(', '-1', '-', 'c'],
 ['I', 'V11117'],
 ['10'],
 ['1e', '1'],
 ['I', 'NOTICE', 'TO', 'PAY', 'RENT', 'OR', 'QUIT', 'NOTICE'],
 ['TO', ':'],
 ['SHERNICE',
  'MUNDELL',
  ',',
  'TENANT',
  'in',
  'possession',
  'and',
  'all',
  'others',
  ':'],
 ['TAKE', 'NOTICE', 'THAT', ':'],
 ['1', '.'],
 ['Pursuant',
  'to',
  'a',
  'written',
  'lease',
  'dated',
  'August',
  '1',
  ',',
  '2018',
  'you',
  'are',
  'obligated',
  'to',
  'pay',
  'certain',
  'rents',
  'on',
  'the',
  'premises',
  'described',
  'as',
  ':',
  '1311',
  'CHARLESTOWN',
  'DR',
  '(',
  'the',
  '"',
  'Premises',
  '"',
  ')',
  ',',
  'of',
  'which',
  'you',
  'now',
  'hold',
  'possession',
  '.'],
 ['2', '.'],
 ['You',
  'are',
  'late',
  'in',
  'the',
  'payment',
  'of',
  'rents',
  'totaling',
  '$',
  '3',
  ',',
  '500.00',
  '.'],
 ['This',
  'rent',
  'was',
  'due',
  'on',
  'Januar

In [36]:
# Saved checkpoints available for inference; the entry at index 4 is the one
# loaded for this run.
model_paths = ['./model-v6', './model-v7', './nlpv4', './nlpv5', './testingv1']

model = spacy.load(model_paths[4])
ner = model.get_pipe('ner')

# For every pre-tokenised sentence: wrap the words in a Doc, run only the
# entity recogniser on it, and convert the predicted entity spans back into
# one BILUO tag per token.
final_nerv = []
for words in final_testing_data:
    doc = Doc(model.vocab, words)
    annotated = ner(doc)
    spans = []
    for ent in annotated.ents:
        spans.append((ent.start_char, ent.end_char, ent.label_))
    final_nerv.append(biluo_tags_from_offsets(doc, spans))

final_nerv

[['O', 'O'],
 ['O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O', 'O'],
 ['O'],
 ['O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-StreetAddDefaulter',
  'I-StreetAddDefaulter',
  'I-StreetAddDefaulter',
  'L-StreetAddDefaulter',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-AmountDefaulter',
  'I-AmountDefaulter',
  'I-AmountDefaulter',
  'L-AmountDefaulter',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O'],
 ['O',

In [37]:
# Flatten the per-sentence tag lists into a single list with one tag per
# token, keeping sentence order and token order within each sentence.
ner_vals = [tag for sentence_tags in final_nerv for tag in sentence_tags]

# Store the predictions as a new column of the test frame and export it.
testing['ner'] = ner_vals
testing.to_excel('./team3_3.xlsx')

In [38]:
# Display the (rows, columns) dimensions of the `testing` frame.
testing.shape

(7247, 5)