# Part of speech labeling using NLPScholar

In [1]:
!pip install gdown



Goal: Label each word in an English sentence with its part of speech.

Data: We'll use data that is not already in the format NLPScholar expects, so you can see how to prepare it: the English PUD treebank test set from Universal Dependencies ([en_pud-ud-test.conllu](https://github.com/UniversalDependencies/UD_English-PUD/blob/master/en_pud-ud-test.conllu)).

In [2]:
# Download the sample treebank file (en_pud-ud-test.conllu) from Google Drive
# into the current directory; the argument is the Drive file ID.
!gdown '1IYUBUOcS_hZINPicUPXffKEf8NvY6gC9'

Downloading...
From: https://drive.google.com/uc?id=1IYUBUOcS_hZINPicUPXffKEf8NvY6gC9
To: /Users/forrestdavis/Teaching/NLPScholar/src/docs/token_classification_example/en_pud-ud-test.conllu
100%|██████████████████████████████████████| 1.35M/1.35M [00:00<00:00, 3.29MB/s]


In [46]:
def get_sentences_from_conllu(fname: str):
    """Read a CoNLL-U file into parallel token and integer POS-label lists.

    Lines starting with ``#`` are comments and are skipped; blank lines end
    the current sentence. For every other line, column 2 (index 1) is taken
    as the word and column 4 (index 3) as its POS tag. Each distinct tag is
    assigned an integer id in order of first appearance.

    Every non-comment, non-blank line is treated as a token, so lines whose
    POS column is ``_`` are kept and ``_`` gets its own label id.

    Args:
        fname: Path to the CoNLL-U file (read as UTF-8).

    Returns:
        A tuple ``(all_sentences, all_labels, pos2id)`` where
        ``all_sentences`` is a list of token lists, ``all_labels`` is a list
        of label-id lists aligned with ``all_sentences``, and ``pos2id`` maps
        each POS tag string to its integer id.
    """
    all_sentences = []
    all_labels = []
    sentence = []
    labels = []
    pos2id = {}
    # The data contains non-ASCII characters (e.g. curly quotes), so do not
    # rely on the platform's default encoding.
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()

            # Only a leading '#' marks a comment; a '#' inside a token line
            # (e.g. the word "C#") must not cause the token to be dropped.
            if line.startswith('#'):
                continue

            if not line:
                # Blank line: close the current sentence. Skip if nothing was
                # collected so repeated blank lines do not add empty sentences.
                if sentence:
                    all_sentences.append(sentence)
                    all_labels.append(labels)
                    sentence, labels = [], []
                continue

            # CoNLL-U columns are tab-separated and a word may contain spaces;
            # fall back to whitespace splitting for files without tabs.
            fields = line.split('\t') if '\t' in line else line.split()
            word, pos = fields[1], fields[3]

            # New tags get the next unused id (ids follow first appearance).
            labels.append(pos2id.setdefault(pos, len(pos2id)))
            sentence.append(word)

    # Keep the final sentence when the file does not end with a blank line.
    if sentence:
        all_sentences.append(sentence)
        all_labels.append(labels)

    return all_sentences, all_labels, pos2id

# Parse the downloaded treebank: `sentences` holds token lists, `labels` the
# matching integer label ids, and `pos2id` the tag -> id mapping.
sentences, labels, pos2id = get_sentences_from_conllu('en_pud-ud-test.conllu')

In [7]:
# Peek at the first sentence and, on the next line, its integer-encoded labels.
print(sentences[0], labels[0], sep="\n")

['“', 'While', 'much', 'of', 'the', 'digital', 'transition', 'is', 'unprecedented', 'in', 'the', 'United', 'States', ',', 'the', 'peaceful', 'transition', 'of', 'power', 'is', 'not', ',', '”', 'Obama', 'special', 'assistant', 'Kori', 'Schulman', 'wrote', 'in', 'a', 'blog', 'post', 'Monday', '.']
[0, 1, 2, 3, 4, 2, 5, 6, 2, 3, 4, 7, 7, 0, 4, 2, 5, 3, 5, 6, 8, 0, 0, 7, 2, 5, 7, 7, 9, 3, 4, 5, 5, 7, 0]


In [11]:
# List the label inventory as "id: tag", in the order tags were first seen.
for tag, tag_id in pos2id.items():
    print(f"{tag_id}: {tag}")

0: PUNCT
1: SCONJ
2: ADJ
3: ADP
4: DET
5: NOUN
6: AUX
7: PROPN
8: ADV
9: VERB
10: PRON
11: CCONJ
12: PART
13: SYM
14: NUM
15: _
16: INTJ
17: X


In [47]:
def split_data(sentences: list, labels: list,
               train_end: float = 0.8, valid_end: float = 0.9):
    """Split aligned sentences and labels into train/valid/test partitions.

    The split is positional (no shuffling): items before ``train_end`` go to
    'train', items between ``train_end`` and ``valid_end`` go to 'valid', and
    the remainder goes to 'test'. Both lists are cut at the same indices so
    they stay aligned.

    Args:
        sentences: List of sentences.
        labels: List of label sequences, one per sentence.
        train_end: Cumulative fraction of the data at which 'train' ends.
        valid_end: Cumulative fraction of the data at which 'valid' ends.

    Returns:
        A tuple ``(sentences_by_split, labels_by_split)``; each is a dict
        with the keys 'train', 'valid' and 'test'.

    Raises:
        ValueError: If the two lists differ in length, or if the cut points
            do not satisfy ``0 <= train_end <= valid_end <= 1``.
    """
    if len(sentences) != len(labels):
        raise ValueError(
            f"sentences and labels must have the same length "
            f"(got {len(sentences)} and {len(labels)})"
        )
    if not 0 <= train_end <= valid_end <= 1:
        raise ValueError(
            "cut points must satisfy 0 <= train_end <= valid_end <= 1"
        )

    total = len(sentences)
    ind1 = int(total * train_end)
    ind2 = int(total * valid_end)

    def _partition(items: list) -> dict:
        # Cut one list at the shared indices.
        return {'train': items[:ind1],
                'valid': items[ind1:ind2],
                'test': items[ind2:]}

    return _partition(sentences), _partition(labels)
# NOTE: this rebinds `sentences` and `labels` from lists to dicts keyed by
# split name, so the cell is not idempotent: running it a second time passes
# the dicts back into split_data, whose slicing then fails. Re-run the
# parsing cell first if you need to re-run this one.
sentences, labels = split_data(sentences, labels)

In [48]:
# Write the training split as JSON Lines: one object per sentence with its
# tokens and the matching integer tags.
import json
with open("train.jsonl", 'w') as out:
    out.writelines(
        json.dumps({'tokens': tokens, 'tags': tags}) + "\n"
        for tokens, tags in zip(sentences['train'], labels['train'])
    )

In [49]:
# Write the validation split as JSON Lines, in the same layout as train.jsonl
# (relies on `json` having been imported in an earlier cell).
with open("valid.jsonl", 'w') as out:
    out.writelines(
        json.dumps({'tokens': tokens, 'tags': tags}) + "\n"
        for tokens, tags in zip(sentences['valid'], labels['valid'])
    )

In [50]:
# create test
import pandas as pd
# Has to be a tab-separated text file with the following columns,
# one row per test sentence.
test_records = {'textid': [],
                'text': [],
                'condition': [],
                'target': []}

# reverse pos2id so integer label ids can be mapped back to tag strings
id2pos = {idx: pos for pos, idx in pos2id.items()}

# enumerate gives each sentence its own textid; previously the counter was
# never incremented, so every row was written with textid 0.
for textid, (tokens, tag_ids) in enumerate(zip(sentences['test'], labels['test'])):
    # map labels back to pos
    tags = [id2pos[tag_id] for tag_id in tag_ids]
    test_records['textid'].append(textid)
    test_records['text'].append(' '.join(tokens))
    test_records['condition'].append('UD_EN')
    test_records['target'].append(' '.join(tags))

data = pd.DataFrame.from_dict(test_records)
data.to_csv('test.tsv', sep='\t', index=False)

        