In [65]:
import torch
# Display the thread counts PyTorch is configured with: (intra-op, inter-op).
torch.get_num_threads(), torch.get_num_interop_threads()

(8, 8)

In [123]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column index -> annotation layer name, in the order the columns appear in the files.
columns = dict(enumerate(['text', 'pos_raw', 'pos', 'ner']))

# Directory that holds the train, dev and test column files.
data_folder = "lang/train"

# Build the corpus from the three split files found inside data_folder.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='train.txt',
    dev_file='dev.txt',
    test_file='test.txt',
)

2021-05-03 08:51:52,624 Reading data from lang/train
2021-05-03 08:51:52,625 Train: lang/train/train.txt
2021-05-03 08:51:52,626 Dev: lang/train/dev.txt
2021-05-03 08:51:52,626 Test: lang/train/test.txt


In [126]:
# Sanity check: print the first training sentence with its 'ner' tags inline.
print(corpus.train[0].to_tagged_string('ner'))

Jonathan <B-PER> E. <I-PER> Lyerly <I-PER> , Birmingham <B-LOC> , for Chatham <B-PER> Steel <I-PER> Corp <I-PER> .


In [135]:
# Imports for this cell. The first three embedding classes were imported by the
# original cell and are kept (they are not used below); the fourth line brings in
# the classes this cell actually instantiates, which were previously missing and
# made the cell fail with NameError on a fresh kernel.
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import TransformerWordEmbeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

tag_type = 'ner'

# 3. make the tag dictionary from the corpus
# NOTE(review): the dictionary printed by this cell lists 'for' as a tag, which is
# not a BIO label — this looks like a malformed row in the column files; verify
# the training data.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

# 4. initialize embeddings
# GloVe word vectors stacked with forward and backward Flair character-LM embeddings.
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

# 7. start training
# train_with_dev=True is passed here; the result dict shown below the cell has an
# empty 'dev_score_history', so no dev scores are available for model selection.
# The call is left as the last expression so the result dict is displayed.
trainer.train('resources/taggers/caselaw-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=300,
              train_with_dev=True)

Dictionary with 11 tags: <unk>, O, B-PER, I-PER, B-LOC, B-ORG, I-ORG, I-LOC, for, <START>, <STOP>
2021-05-04 17:49:22,747 ----------------------------------------------------------------------------------------------------
2021-05-04 17:49:22,750 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): L

{'test_score': 0.9610806504522236,
 'dev_score_history': [],
 'train_loss_history': [4.217902458025231,
  1.9173794817574457,
  1.5259242630510501,
  1.3148993303297394,
  1.1945309310325012,
  1.1001300050422769,
  1.0178894032078505,
  0.9777165817105945,
  0.9312874959055547,
  0.8807075541910107,
  0.8536559131367172,
  0.804245673837125,
  0.7834359433678196,
  0.7761984739762535,
  0.7460582441062367,
  0.724599302398439,
  0.7121288289253708,
  0.6866882113803658,
  0.672673628852379,
  0.6519406260907358,
  0.6427299273344665,
  0.6353996382208866,
  0.6155686472796305,
  0.6099216974967845,
  0.5983864381107187,
  0.5683613157607798,
  0.5687790419602521,
  0.5745856598578697,
  0.5533680060352723,
  0.5513438197858571,
  0.5318309461020334,
  0.526245037965829,
  0.5215133483997177,
  0.5075164587034876,
  0.4976111733359186,
  0.4900099718867274,
  0.49288949705842655,
  0.47406843328942677,
  0.4781086317477763,
  0.4725292115236768,
  0.46481637367416556,
  0.4516221803125

In [130]:
from flair.data import Sentence
from flair.models import SequenceTagger
# Load the saved tagger from the same output directory the training cell writes to.
model = SequenceTagger.load('resources/taggers/caselaw-ner/final-model.pt')

2021-05-03 10:29:17,086 loading file resources/taggers/caselaw-ner/final-model.pt


In [131]:
# Sample input for the tagger; reads like a counsel listing from a court opinion
# (presumably matching the 'caselaw-ner' model's domain — confirm).
text = "Thos. James, A. A. Jones, Peter Smith, Jr., District Attorney, and Harwell G. Davis, Asst. Atty. Gen., and Ridenour, Swenson, Gleere & Evans by Hon. Harold H. Swenson, Jr. and Jeffrey A. Bernick, Phoenix, for Plaintiff with P. R. Jones for Cisco, Inc., along with Geo. M. Jackson from Cupid, Jokester, Nostrum & Jacobs, P. C. for petitioner and state of Connecticut. Adam Wm. Puke for City of New York"
def parseText(text):
    """Tag `text` with the notebook-level `model` and print the results.

    Prints the sentence with its predicted tags inline, then one line per
    span returned by `get_spans()`. Returns None.
    """
    tagged = Sentence(text)
    model.predict(tagged)  # annotates `tagged` in place
    print(tagged.to_tagged_string())
    for entity in tagged.get_spans():
        print(entity)
# Run the tagger on the sample text defined above and print its predictions.
parseText(text)

Thos <B-PER> . <I-PER> James <I-PER> , A. <B-PER> A. <I-PER> Jones <I-PER> , Peter <B-PER> Smith <I-PER> , <I-PER> Jr. <I-PER> , District Attorney , and Harwell <B-PER> G. <I-PER> Davis <I-PER> , Asst . Atty . Gen. , and Ridenour <B-ORG> , <I-ORG> Swenson <I-ORG> , <I-ORG> Gleere <I-ORG> & <I-ORG> Evans <I-ORG> by Hon <B-PER> . <I-PER> Harold <I-PER> H. <I-PER> Swenson <I-PER> , <I-PER> Jr. <I-PER> and Jeffrey <B-PER> A. <I-PER> Bernick <I-PER> , Phoenix <B-LOC> , for Plaintiff with P <B-PER> . <I-PER> R. <I-PER> Jones <I-PER> for Cisco <B-ORG> , <I-ORG> Inc. <I-ORG> , along with Geo <B-PER> . <I-PER> M. <I-PER> Jackson <I-PER> from Cupid <B-ORG> , <I-ORG> Jokester <I-ORG> , <I-ORG> Nostrum <I-ORG> & <I-ORG> Jacobs <I-ORG> , P. C. for petitioner and state of Connecticut <B-LOC> . Adam <B-PER> Wm <I-PER> . <I-PER> Puke <I-PER> for City of New <B-LOC> York <I-LOC>
Span [1,2,3]: "Thos . James"   [− Labels: PER (0.7764)]
Span [5,6,7]: "A. A. Jones"   [− Labels: PER (0.8883)]
Span [9,10,11,

In [132]:
def tagIt(text):
    """Build a Sentence from `text` and run the notebook-level `model` on it.

    Nothing is printed or returned; this exists so the prediction path can be timed.
    """
    model.predict(Sentence(text))
# Benchmark Sentence construction plus prediction on the sample text.
%timeit tagIt(text)

27.3 ms ± 186 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [133]:
# Second, longer sample; note the in-word hyphens left in the text
# ('Lay-ton', 'Good-rick', 'El-zufon', 'du-Pont'), apparently line-break artifacts.
parseText("Douglas Catts, of Schmittinger and Rodriguez, Dover, Stuart Young, of Young, Conaway, Stargatt and Taylor, Wilmington, Scott Reese, of Cooch and Taylor, Wilmington, Jeffery S. Marlin, of Tybout, Redfearn, Casarino and Pell, Wilmington, J.R. Julian, of J.R. Julian Professional Ass’n, Wilmington, James T. McKinstry, of Richards, Lay-ton and Finger, Wilmington, Michael Good-rick, of Theison, Lank, Mulford and Goldberg, P.A., Wilmington, James F. Kipp, of Trzuskowski, Kipp, Kelleher and Pearce, P.A., Wilmington, Howard M. Berg, of Howard M. Berg and Associates, P.A., Wilmington, James W. Semple, of Morris, James, Hitchens and Williams, Wilmington, James Perry, of Komissaroff and Perry, Wilmington, Somers S. Price, of Potter, Anderson and Corroon, Wilmington, John El-zufon, of Elzufon and Bailey, Wilmington, Michael J. Johnson, Biggs and Battaglia, Wilmington, Richard R.S. Hannum, of Prickett, Jones, Elliott, Kristol and Schnee, Wilmington, Richard Allen Paul, of E.I. du-Pont deNemours and Co., Inc., Wilmington, for defendants.")

Douglas <B-PER> Catts <I-PER> , of Schmittinger <B-ORG> and <I-ORG> Rodriguez <I-ORG> , Dover <B-LOC> , Stuart <B-PER> Young <I-PER> , of Young <B-LOC> , Conaway <B-ORG> , <I-ORG> Stargatt <I-ORG> and <I-ORG> Taylor <I-ORG> , Wilmington <B-LOC> , Scott <B-PER> Reese <I-PER> , of Cooch <B-ORG> and <I-ORG> Taylor <I-ORG> , Wilmington <B-LOC> , Jeffery <B-PER> S. <I-PER> Marlin <I-PER> , of Tybout <B-ORG> , <I-ORG> Redfearn <I-ORG> , <I-ORG> Casarino <I-ORG> and <I-ORG> Pell <I-ORG> , Wilmington <B-LOC> , J.R. <B-PER> Julian <I-PER> , of J.R <B-PER> . <I-PER> Julian <I-PER> Professional <I-PER> Ass’n <I-PER> , Wilmington , James <B-PER> T <I-PER> . <I-PER> McKinstry <I-PER> , of Richards <B-LOC> , Lay-ton <B-ORG> and <I-ORG> Finger <I-ORG> , Wilmington <B-LOC> , Michael <B-PER> Good-rick <I-PER> , of Theison <B-ORG> , <I-ORG> Lank <I-ORG> , <I-ORG> Mulford <I-ORG> and <I-ORG> Goldberg <I-ORG> , P.A. , Wilmington , James <B-PER> F. <I-PER> Kipp <I-PER> , of Trzuskowski <B-ORG> , <I-ORG> Ki