This notebook looks at the output of the snippet repository
and shows how to use it to train NER, classification, and masked language (MLM) models.

In [6]:
from functools import partial
from importlib import reload
from typing import List

import torch
from spacy import displacy
import src.data.snippet_repository as sr

### Named Entity Recognition Models (NER)

In [7]:
# Build a snippet repository in NER mode; the NER training batches used in the
# following cells are requested from this object.
ner_repo = sr.SnippetRepository(sr.SnippetRepositoryMode.NER)

In [8]:
def _has_entity(ner_tags) -> bool:
    """Return True when at least one tag in the sequence differs from "O"."""
    return any(tag != "O" for tag in ner_tags)


ner_data = ner_repo.get_training_data(batch_size=10)

# Pull batches until one contains at least one row with a non-"O" tag, so the
# cells below have an entity to work with. The for/else form stops cleanly if
# the data runs out instead of surfacing a bare StopIteration from next().
for ner_df in ner_data:
    if ner_df['ner_tags'].apply(_has_entity).any():
        break
else:
    raise RuntimeError(
        "No batch containing a non-'O' NER tag was found in the training data."
    )

In [9]:
# Visualize a row that actually contains an entity. The batch is only known to
# hold at least one such row, so a fixed position (e.g. row 9) may be all "O"
# or may not exist when the batch is shorter than expected.
has_entity = ner_df['ner_tags'].apply(
    lambda tags: any(tag != "O" for tag in tags)
).astype(bool)
# Fall back to the last row if, unexpectedly, no row carries an entity tag.
example = ner_df[has_entity].iloc[0] if has_entity.any() else ner_df.iloc[-1]
text = example['text']
ner_tags = example['ner_tags']
sr.visualize_ner_tags(text, ner_tags)

In [10]:
# Build a second snippet repository, this time in masked-language-model mode.
mlm_repo = sr.SnippetRepository(sr.SnippetRepositoryMode.MASKED_LM)

In [11]:
# Request MLM training data with batch_size=10; batches are pulled from the
# returned object with next() in the following cell.
mlm_data = mlm_repo.get_training_data(batch_size=10)

In [14]:
# Fetch one batch from the MLM data and display it as the cell output.
# The saved output below shows the columns text, pos_tags, mask and label.
df = next(mlm_data)
df

Unnamed: 0,text,pos_tags,mask,label
0,"[These, investments, in, conventional, social,...","[DT, NNS, IN, JJ, JJ, NNS, VBP, VBN, TO, VB, N...","[False, False, False, False, False, False, Fal...",0
1,"[The, arbiter, ,, a, member, of, the, project,...","[DT, NN, ,, DT, NN, IN, DT, NN, NN, IN, JJ, NN...","[False, False, False, False, False, False, Fal...",0
2,"[All, 90, students, in, the, class, were, divi...","[DT, CD, NNS, IN, DT, NN, VBD, VBN, IN, CD, NN...","[False, False, False, False, False, False, Fal...",0
3,"[While, CNNs, are, trained, to, map, input, im...","[IN, NNS, VBP, VBN, TO, VB, NN, NNS, IN, DT, V...","[False, False, False, False, False, False, Fal...",0
4,"[The, M, protein, has, the, ability, to, form,...","[DT, NNP, NN, VBZ, DT, NN, TO, VB, NN, HYPH, J...","[False, False, False, False, False, False, Fal...",0
5,"[However, ,, the, following, autumn, ,, it, wa...","[RB, ,, DT, VBG, NN, ,, PRP, VBD, VBN, IN, JJS...","[False, False, False, False, False, False, Fal...",0
6,"[5, -, 22, Table, 5, -, 9, ., Tenure, status, ...","[CD, SYM, CD, NNP, CD, SYM, CD, ., NN, NN, IN,...","[False, False, False, False, False, False, Fal...",0
7,"[Residuals, of, the, regression, models, were,...","[NNS, IN, DT, NN, NNS, VBD, VBN, IN, NN, IN, D...","[False, False, False, False, False, False, Fal...",0
8,"[12, Representatives, of, state, and, local, e...","[CD, NNPS, IN, NN, CC, JJ, JJ, NNS, :, JJ, IN,...","[False, False, False, False, False, False, Fal...",0
9,"[These, should, be, able, to, identify, antibo...","[DT, MD, VB, JJ, TO, VB, NNS, NN, IN, DT, NNP,...","[False, False, False, False, False, False, Fal...",0


In [13]:
# Show the NER batch explicitly by name. A bare `df` here would refer to the
# MLM batch assigned earlier when the notebook is run top to bottom, which does
# not match the NER columns (text, tags, ner_tags) shown in the saved output.
ner_df

Unnamed: 0,text,tags,ner_tags
0,"[Projections, and, Potential, Impacts, ,, 175,...","[NNP, CC, NNP, NNPS, ,, CD, NN, NNP, ,, NNP, I...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[Given, the, complexity, of, the, regional, oc...","[VBN, DT, NN, IN, DT, JJ, NN, NN, IN, DT, NNP,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[The, table, may, help, users, to, estimate, t...","[DT, NN, MD, VB, NNS, TO, VB, DT, JJ, JJ, NN, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[The, sample, powder, was, then, split, for, s...","[DT, NN, NN, VBD, RB, VBN, IN, JJ, NN, CC, NNP...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[The, 237, subjects, without, manual, segmenta...","[DT, CD, NNS, IN, JJ, NNS, VBD, VBN, IN, JJ, N...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
5,"[+, (, 1, -, p1)a2, ], +, Pr(G, =, 2)[pg=2(1, ...","[NFP, -LRB-, CD, HYPH, NN, -RRB-, :, NNS, NN, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
6,"[Many, immigrant, workers, moved, to, fill, me...","[JJ, JJ, NNS, VBD, TO, VB, NN, NNS, IN, PRP$, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
7,"[Therefore, ,, we, recommend, collecting, how,...","[RB, ,, PRP, VBP, VBG, WRB, NNP, VBZ, VBN, VBN...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
8,"[However, ,, IMR, are, 3.81, %, more, likely, ...","[RB, ,, NNP, VBP, CD, NN, RBR, JJ, IN, NNP, TO...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
9,"[(, Though, we, do, not, consider, (, R, >, r,...","[-LRB-, IN, PRP, VBP, RB, VB, -LRB-, NN, XX, N...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
