This notebook explores the output of the snippet repository and shows
how to use it to train NER, classification, and masked language modeling (MLM) models.

In [1]:
from functools import partial
from importlib import reload
from typing import List

import torch
from spacy import displacy
import democratizing_data_ml_algorithms.data.snippet_repository as sr

### Named Entity Recognition (NER) Models

In [2]:
# Build a snippet repository configured with the NER mode.
ner_mode = sr.SnippetRepositoryMode.NER
ner_repo = sr.SnippetRepository(ner_mode)

In [4]:
# Iterator over training batches (requested with batch_size=10).
ner_data = ner_repo.get_training_data(batch_size=10)

# Keep pulling batches until one has at least one row whose tag sequence
# contains something other than the "O" tag.
detected = False
while not detected:
    ner_df = next(ner_data)
    detected = any(
        any(tag != "O" for tag in row_tags)
        for row_tags in ner_df['ner_tags']
    )

In [5]:
# Visualize a row that actually contains an entity tag.
# The search loop only guarantees that *some* row in the batch has a non-"O"
# tag, so a fixed position (previously row 9) may show an untagged snippet,
# or fail outright when the batch has fewer than ten rows.
has_entity = ner_df['ner_tags'].apply(
    lambda row_tags: any(tag != "O" for tag in row_tags)
).astype(bool)
tagged_row = ner_df[has_entity].iloc[0]  # first row with at least one entity tag

text = tagged_row.text
ner_tags = tagged_row.ner_tags
sr.visualize_ner_tags(text, ner_tags)

In [2]:
# Build a snippet repository configured with the masked-LM mode.
mlm_mode = sr.SnippetRepositoryMode.MASKED_LM
mlm_repo = sr.SnippetRepository(mlm_mode)

In [3]:
# Iterator over training batches (requested with batch_size=10 and
# balance_labels=True).
mlm_data = mlm_repo.get_training_data(batch_size=10, balance_labels=True)

# Keep pulling batches until one has at least one row whose 'mask' sequence
# contains a truthy entry.
detected = False
while not detected:
    mlm_df = next(mlm_data)
    detected = any(
        any(token_masks)
        for token_masks in mlm_df['mask']
    )

In [4]:
# Display the batch selected by the search loop above (rich DataFrame output).
mlm_df

Unnamed: 0,text,pos_tags,mask,label
0,"[However, ,, the, percentages, of, blacks, ear...","[RB, ,, DT, NNS, IN, NNS, VBG, NNP, NN, POS, N...","[False, False, False, False, False, False, Fal...",0
1,"[5b, shows, DeKalb, County, ,, AL, ,, which, i...","[NNP, VBZ, NNP, NNP, ,, NNP, ,, WDT, IN, DT, J...","[False, False, False, False, False, False, Fal...",0
2,"[TIMSS, =, Trends, in, International, Mathemat...","[NNP, IN, NNS, IN, NNP, NNP, CC, NNP, NNP, .]","[False, False, True, True, True, True, True, T...",1
3,"[There, is, only, one, accuracy, for, the, fif...","[EX, VBZ, RB, CD, NN, IN, DT, JJ, NN, IN, DT, ...","[False, False, False, False, False, False, Fal...",0
4,"[x, G, :, Figure, 1, summarizes, the, relation...","[NFP, NN, :, NN, CD, VBZ, DT, NNS, IN, NNS, .]","[False, False, False, False, False, False, Fal...",0
5,"[Percentage, distribution, of, ever, married, ...","[NN, NN, IN, RB, VBN, CD, SYM, CD, NN, JJ, NNS...","[False, False, False, False, False, False, Fal...",1
6,"[A, variety, of, different, mixed, -, effects,...","[DT, NN, IN, JJ, JJ, HYPH, NNS, NNS, ,, DT, NN...","[False, False, False, False, False, False, Fal...",0
7,"[SOURCES, :, National, Science, Foundation, ,,...","[NNS, :, NNP, NNP, NNP, ,, NNP, NNP, IN, NNP, ...","[False, False, False, False, False, False, Fal...",1
8,"[Data, come, from, a, positively, selected, sa...","[NNS, VBP, IN, DT, RB, VBN, NN, IN, JJ, NNS, W...","[False, False, False, False, False, False, Fal...",0
9,"[Table, S6, Data, used, in, preparation, of, t...","[NN, NNP, NNP, VBN, IN, NN, IN, DT, NN, VBD, V...","[False, False, False, False, False, False, Fal...",1
