In [1]:
%%capture

!pip install transformers
!pip install datasets
# evalution metrics packages
!pip install evaluate
!pip install seqeval

In [2]:
from datasets import load_dataset
import pprint

dataset = load_dataset("conll2003")
dataset

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
pprint.pp(dataset["train"][0])

{'id': '0',
 'tokens': ['EU',
            'rejects',
            'German',
            'call',
            'to',
            'boycott',
            'British',
            'lamb',
            '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}


In [4]:
dir(dataset["train"].features["ner_tags"])

['__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_type',
 'dtype',
 'feature',
 'id',
 'length',
 'pa_type']

In [5]:
dataset["train"].features["ner_tags"].feature

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None)

In [6]:
!mkdir -p /content/data

!gdown https://drive.google.com/uc?id=10g5jQ1_gLQf8pxLDAv_XL37Vl8DcJgB9
!gdown https://drive.google.com/uc?id=1JbGpBUQJ_kKx0DLZBMMbKSAplbvkKYos
!gdown https://drive.google.com/uc?id=1u1be-yyEJ7NGgicvZ6eyCk1ScEOVA2IQ

!mv *.txt /content/data/

Downloading...
From: https://drive.google.com/uc?id=10g5jQ1_gLQf8pxLDAv_XL37Vl8DcJgB9
To: /content/train.txt
100% 4.65M/4.65M [00:00<00:00, 265MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JbGpBUQJ_kKx0DLZBMMbKSAplbvkKYos
To: /content/dev.txt
100% 243k/243k [00:00<00:00, 130MB/s]
Downloading...
From: https://drive.google.com/uc?id=1u1be-yyEJ7NGgicvZ6eyCk1ScEOVA2IQ
To: /content/test.txt
100% 243k/243k [00:00<00:00, 126MB/s]


In [7]:
!head -n3 /content/data/train.txt
!head -n3 /content/data/test.txt
!head -n3 /content/data/dev.txt


# id 309f5b26-951e-472b-948e-47632249862b	domain=en
robert _ _ B-OtherPER

# id 5239d808-f300-46ea-aa3b-5093040213a3	domain=en
eli _ _ B-OtherPER

# id 5239d808-f300-46ea-aa3b-5093040213a3	domain=en
eli _ _ B-OtherPER


In [8]:
import os
from tqdm.auto import tqdm
import subprocess
import numpy as np
import pandas as pd


def num_lines_in_file(file_path):
    """ Calculate the number of line in a file """
    return int(subprocess.check_output('wc -l %s' % file_path, shell=True).strip().split()[0])

def convert_to_conll(text_file):
    if not os.path.exists(text_file):
        return []
    
    data = {"tokens": [], "ner_tags": []}

    with open(text_file, "r", encoding="utf-8") as f:
        tokens, ner_tags = [], []
        for i, line in tqdm(enumerate(f), total=num_lines_in_file(text_file), position=0):
            line = line.strip()
            if line:
                if line.startswith("#"):
                    if len(tokens) > 0 and len(ner_tags) > 0:
                        data["tokens"].append(tokens)
                        data["ner_tags"].append(ner_tags)

                        tokens, ner_tags = [], []

                else:
                    data_line = line.split("_ _")
                    tokens.append(data_line[0].strip())
                    ner_tags.append(data_line[1].strip())
    
    return data

In [9]:
df_train = pd.DataFrame(convert_to_conll("/content/data/train.txt"))
df_train.head()

  0%|          | 0/303345 [00:00<?, ?it/s]

Unnamed: 0,tokens,ner_tags
0,"[robert, gottschalk, 1939, academy, award, win...","[B-OtherPER, I-OtherPER, O, B-VisualWork, I-Vi..."
1,"[during, the, reign, of, the, tongzhi, emperor...","[O, O, O, O, O, B-OtherPER, I-OtherPER, O, O, ..."
2,"[further, research, led, in, the, 1960s, to, t...","[O, O, O, O, O, O, O, O, B-OtherPER, O, O, O, ..."
3,"[the, ideas, were, introduced, by, william, bu...","[O, O, O, O, O, B-OtherPER, I-OtherPER, O, O, ..."
4,"[thaddeus, mosley, (, a&s, 1950, ), –, sculpto...","[B-OtherPER, I-OtherPER, O, O, O, O, O, O, O, ..."


In [10]:
df_dev = pd.DataFrame(convert_to_conll("/content/data/dev.txt"))
df_dev.head()

  0%|          | 0/15936 [00:00<?, ?it/s]

Unnamed: 0,tokens,ner_tags
0,"[eli, lilly, founder, president, of, pharmaceu...","[B-OtherPER, I-OtherPER, O, O, O, O, O, B-Publ..."
1,"[christoph, haberland, designed, a, new, marbl...","[B-OtherPER, I-OtherPER, O, O, O, O, B-OtherPR..."
2,"[he, was, succeeded, as, chancellor, by, sir, ...","[O, O, O, O, O, O, B-OtherPER, I-OtherPER, I-O..."
3,"[it, was, described, by, edward, meyrick, in, ...","[O, O, O, O, B-OtherPER, I-OtherPER, O, O, O]"
4,"[having, suffered, depredation, of, the, more,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [11]:
df_test = pd.DataFrame(convert_to_conll("/content/data/test.txt"))
df_test.head()

  0%|          | 0/15936 [00:00<?, ?it/s]

Unnamed: 0,tokens,ner_tags
0,"[eli, lilly, founder, president, of, pharmaceu...","[B-OtherPER, I-OtherPER, O, O, O, O, O, B-Publ..."
1,"[christoph, haberland, designed, a, new, marbl...","[B-OtherPER, I-OtherPER, O, O, O, O, B-OtherPR..."
2,"[he, was, succeeded, as, chancellor, by, sir, ...","[O, O, O, O, O, O, B-OtherPER, I-OtherPER, I-O..."
3,"[it, was, described, by, edward, meyrick, in, ...","[O, O, O, O, B-OtherPER, I-OtherPER, O, O, O]"
4,"[having, suffered, depredation, of, the, more,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [12]:
import itertools

tag_list = list(sorted(list(set(list(itertools.chain(*df_train["ner_tags"].values.tolist()))))))
tag_list.remove("O")
tag_list = ["O"] + tag_list
print(f"Tag list #{len(tag_list)}")

Tag list #67


In [13]:
!mkdir -p /content/ner_data

df_train.to_csv("/content/ner_data/train.csv", sep="\t", encoding="utf-8", index=False)
df_dev.to_csv("/content/ner_data/dev.csv", sep="\t", encoding="utf-8", index=False)
df_test.to_csv("/content/ner_data/test.csv", sep="\t", encoding="utf-8", index=False)

In [14]:
import datasets
from datasets import load_dataset

data_files = {}
data_files["train"] = "/content/ner_data/train.csv"
data_files["validation"] = "/content/ner_data/dev.csv"

extension = data_files["train"].split(".")[-1]
raw_datasets = load_dataset(extension, data_files=data_files, delimiter="\t")
raw_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6077e46db1ebe3a0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6077e46db1ebe3a0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 16777
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 870
    })
})

In [15]:
pprint.pp(raw_datasets["train"][0])

{'tokens': "['robert', 'gottschalk', '1939', 'academy', 'award', 'winner', "
           "'and', 'founder', 'of', 'panavision']",
 'ner_tags': "['B-OtherPER', 'I-OtherPER', 'O', 'B-VisualWork', "
             "'I-VisualWork', 'O', 'O', 'O', 'O', 'B-ORG']"}


In [16]:
features = datasets.Features({
    "tokens": datasets.features.Sequence(datasets.features.Value("string")),
    "ner_tags": datasets.features.Sequence(datasets.features.ClassLabel(names=tag_list))
})
features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-AerospaceManufacturer', 'B-AnatomicalStructure', 'B-ArtWork', 'B-Artist', 'B-Athlete', 'B-CarManufacturer', 'B-Cleric', 'B-Clothing', 'B-Disease', 'B-Drink', 'B-Facility', 'B-Food', 'B-HumanSettlement', 'B-MedicalProcedure', 'B-Medication/Vaccine', 'B-MusicalGRP', 'B-MusicalWork', 'B-ORG', 'B-OtherLOC', 'B-OtherPER', 'B-OtherPROD', 'B-Politician', 'B-PrivateCorp', 'B-PublicCorp', 'B-Scientist', 'B-Software', 'B-SportsGRP', 'B-SportsManager', 'B-Station', 'B-Symptom', 'B-Vehicle', 'B-VisualWork', 'B-WrittenWork', 'I-AerospaceManufacturer', 'I-AnatomicalStructure', 'I-ArtWork', 'I-Artist', 'I-Athlete', 'I-CarManufacturer', 'I-Cleric', 'I-Clothing', 'I-Disease', 'I-Drink', 'I-Facility', 'I-Food', 'I-HumanSettlement', 'I-MedicalProcedure', 'I-Medication/Vaccine', 'I-MusicalGRP', 'I-MusicalWork', 'I-ORG', 'I-OtherLOC', 'I-OtherPER', 'I-OtherPROD', 'I-Polit

In [17]:
import ast

def prepare_features(examples):
    ner_tags = []
    tokens = []

    for i, example in enumerate(examples["ner_tags"]):
        ner_tags.append([tag_list.index(tag) for tag in ast.literal_eval(examples["ner_tags"][i])])
        tokens.append(ast.literal_eval(examples["tokens"][i]))

    return {
        "tokens": tokens,
        "ner_tags": ner_tags
    }

In [18]:
dataset_prep = raw_datasets.map(prepare_features, features=features, batched=True, batch_size=100)
dataset_prep

  0%|          | 0/168 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 16777
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 870
    })
})

In [19]:
pprint.pp(raw_datasets["train"][0])
pprint.pp(dataset_prep["train"][0])

{'tokens': "['robert', 'gottschalk', '1939', 'academy', 'award', 'winner', "
           "'and', 'founder', 'of', 'panavision']",
 'ner_tags': "['B-OtherPER', 'I-OtherPER', 'O', 'B-VisualWork', "
             "'I-VisualWork', 'O', 'O', 'O', 'O', 'B-ORG']"}
{'tokens': ['robert',
            'gottschalk',
            '1939',
            'academy',
            'award',
            'winner',
            'and',
            'founder',
            'of',
            'panavision'],
 'ner_tags': [20, 53, 0, 32, 65, 0, 0, 0, 0, 18]}
