In [1]:
from google.colab import drive
drive.mount('/content/drive')
#!ls "/content/drive/My Drive/collab_sandbox"
%cd drive/MyDrive/collab_sandbox/NER/ner_tenses_recognition
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/collab_sandbox/NER/ner_tenses_recognition
datasets    logs				     ner-test.csv  spacy_ner
index.html  ner_tenses_recognition_conll_type.ipynb  results


In [None]:
!ls

datasets  ner_tenses_recognition_conll_type.ipynb  results
logs	  ner-test.csv				   spacy_ner


In [18]:
!pip install transformers catalyst datasets seqeval 

Collecting catalyst
  Downloading catalyst-20.12-py2.py3-none-any.whl (490 kB)
[K     |████████████████████████████████| 490 kB 6.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.3.0-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 24.0 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[?25hCollecting tensorboardX>=2.1.0
  Downloading tensorboardX-2.1-py2.py3-none-any.whl (308 kB)
[K     |████████████████████████████████| 308 kB 20.2 MB/s 
Collecting fsspec
  Downloading fsspec-0.8.7-py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 31.6 MB/s 
[?25hCollecting pyarrow>=0.17.1
  Downloading pyarrow-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (20.7 MB)
[K     |████████████████████████████████| 20.7 MB 1.4 MB/s 
Collecting huggingface-hub==0.0.2
  Downloading huggingface_hub-0.0.2-py3-none-any.whl (24 kB)
Collecting xxhash
  Downloading xxhash

In [19]:
import pandas as pd
import numpy as np
import json
from operator import itemgetter
import spacy

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, DistilBertForTokenClassification
from catalyst.utils import set_global_seed
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments

SEED = 10
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fc6ebbf2310>

# Prepare classifiend data for labeling

In [None]:
# OPTIONAL SHORTCUT FOR LABELING
# import spacy
# import pandas as pd

# nlp = spacy.load("en_core_web_sm")

# dataset_for_labeling_path = "./datasets/predicted_tenses_2_16_21_NER_1207_balanced_a1.csv"
# dataset_for_labeling = pd.read_csv(dataset_for_labeling_path, encoding='utf-8')
# dataset_for_labeling['original_sent'] = dataset_for_labeling['sent']
# # dataset_for_labeling = dataset_for_labeling[['sent']]

# def tokinize_sent(sent):
#     sent = str(sent)
#     sent = nlp(sent)
#     sent = " ".join([str(token) for token in sent])
#     return sent

# dataset_for_labeling['sent'] = dataset_for_labeling['sent'].apply(tokinize_sent)

# dataset_for_labeling.to_csv(f'{dataset_for_labeling_path[:-4]}_tokenized.csv',index=False, encoding='utf-8')

In [None]:
# dataset_for_labeling = pd.read_csv("./datasets/predicted_tenses_dataframe_NER_2_14_21_tokenized.csv")
# dataset_for_labeling.head()

In [None]:
# temp_dataset = dataset_for_labeling[dataset_for_labeling['type'].str.contains('a1_present_continuous_act_rn')]
# temp_dataset.to_csv("./datasets/predicted_tenses_dataframe_NER_2_14_21_tokenized_a1_present_continuous_act_rn.csv", index=False)

# Convert Labeled data to conll2003 format

In [22]:
def tokenize(text):
    tok_start = 0
    out = []
    for tok in text.split():
        if len(tok):
            out.append((tok, tok_start))
            tok_start += len(tok) + 1
        else:
            tok_start += 1
    return out


def create_tokens_and_tags(text, spans):
    tokens_and_idx = tokenize(text)
    if spans:
        spans = list(sorted(spans, key=itemgetter('start')))
        span = spans.pop(0)
        prefix = 'B-'
        tokens, tags = [], []
        for token, token_start in tokens_and_idx:
            tokens.append(token)
            token_end = token_start + len(token) - 1
            if not span or token_end < span['start']:
                tags.append('O')
            elif token_start >= span['end']:
                tags.append('O')
            else:
                tags.append(prefix + span['label'])
                if (span['end'] - 1) > token_end:
                    prefix = 'I-'
                elif len(spans):
                    span = spans.pop(0)
                    prefix = 'B-'
                else:
                    span = None
    else:
        tokens = [token for token, _ in tokens_and_idx]
        tags = ['O'] * len(tokens)

    return tokens, tags

def get_features(labels):
    labels = json.loads(labels)
    if len(labels) > 0:
        labels = labels['objects']
        results = []
        # print(labels)

        for obj in labels:
            temp = {}
            temp = obj['data']['location']
            temp['label'] = obj['value']
            results.append(temp)
        return results

    return labels

def create_NER_dataset(name_input_dataset):
  dataset = pd.read_csv(name_input_dataset, encoding='utf-8')
  dataset = dataset[['Labeled Data', 'Label']]

  ner_dataset = pd.DataFrame(columns=['sent', 'named_entity'])

  for i in range(len(dataset)):
      labels = get_features(dataset['Label'][i])
      if len(labels) > 0:
          sent = str(dataset['Labeled Data'][i])
          token, tag = create_tokens_and_tags(sent, labels)
          
          row = {}
          row['sent'] = sent
          row['named_entity'] = " ".join(tag)
          ner_dataset = ner_dataset.append(row, ignore_index=True)

  ner_dataset.to_csv(f'{name_input_dataset[:-4]}_conll2003.csv',index=False, encoding='utf-8')


def convert_dataset_to_conll2003(name_input_dataset):
    dataset = pd.read_csv(name_input_dataset, encoding='utf-8')
    dataset = dataset[['Labeled Data', 'Label']]

    dataset_train, dataset_dev = train_test_split(dataset, test_size=0.2)
    dataset_train, dataset_dev = dataset_train.reset_index(drop=True), dataset_dev.reset_index(drop=True)

    # create train 
    conll_file_train = open(f'{name_input_dataset[:-4]}_conll2003_train.conll', 'w')
    conll_file_train.write("-DOCSTART- -X- O O\n\n\n")

    for i in range(len(dataset_train)):
        labels = get_features(dataset_train['Label'][i])
        if len(labels) > 0:
            sent = str(dataset_train['Labeled Data'][i])
            token, tag = create_tokens_and_tags(sent, labels)
            for token, tag in zip(token, tag):
                conll_file_train.write(f'{token} {tag}\n')
            conll_file_train.write('\n')
    conll_file_train.close()

    # create dev 
    conll_file_dev = open(f'{name_input_dataset[:-4]}_conll2003_dev.conll', 'w')
    conll_file_dev.write("-DOCSTART- -X- O O\n\n\n")

    for i in range(len(dataset_dev)):
        labels = get_features(dataset_dev['Label'][i])
        if len(labels) > 0:
            sent = str(dataset_dev['Labeled Data'][i])
            token, tag = create_tokens_and_tags(sent, labels)
            for token, tag in zip(token, tag):
                conll_file_dev.write(f'{token} {tag}\n')
            conll_file_dev.write('\n')
    conll_file_dev.close()

In [23]:
create_NER_dataset('./datasets/combined_24_02_2021.csv')

In [None]:
convert_dataset_to_conll2003("./datasets/combined_24_02_2021.csv")

# Train model for NER

In [24]:
NER_TYPES = {
    "O": 0,
    'B-a1_be_have_do_in_the_past':1,
    'B-a1_can':2,
    'B-a1_comparative_exept':3,
    'B-a1_comparative_long':4,
    'B-a1_comparative_short':5,
    'B-a1_future_simple':6,
    'B-a1_have_has_got':7,
    'B-a1_past_simple_irreg':8,
    'B-a1_past_simple_reg':9,
    'B-a1_possesive_s_sing':10,
    'B-a1_possessive_s_plurar':11,
    'B-a1_present_continuous_act_rn':12,
    'B-a1_present_simple_3d_pers':13,
    'B-a1_present_simple_reg_act':14,
    'B-a1_special_questions':15,
    'B-a1_superlative_exept':16,
    'B-a1_superlative_long':17,
    'B-a1_superlative_short':18,
    'B-a1_there_is_am_are':19,
    'B-a1_there_was_were':20,
    'B-a1_there_will_be':21,
    'B-a1_to_be_future_will_be':22,
    'B-a1_to_be_past_was_were':23,
    'B-a1_to_be_present_is_am_are':24,
    'B-a1_want_would_like_to':25,
    'B-a1_be_have_do_in_the_past':26,
    # inside
    'I-a1_can':27,
    'I-a1_comparative_exept':28,
    'I-a1_comparative_long':29,
    'I-a1_comparative_short':30,
    'I-a1_future_simple':31,
    'I-a1_have_has_got':32,
    'I-a1_past_simple_irreg':33,
    'I-a1_past_simple_reg':34,
    'I-a1_possesive_s_sing':35,
    'I-a1_possessive_s_plurar':36,
    'I-a1_present_continuous_act_rn':37,
    'I-a1_present_simple_3d_pers':38,
    'I-a1_present_simple_reg_act':39,
    'I-a1_special_questions':40,
    'I-a1_superlative_exept':41,
    'I-a1_superlative_long':42,
    'I-a1_superlative_short':43,
    'I-a1_there_is_am_are':44,
    'I-a1_there_was_were':45,
    'I-a1_there_will_be':46,
    'I-a1_to_be_future_will_be':47,
    'I-a1_to_be_past_was_were':48,
    'I-a1_to_be_present_is_am_are':49,
    'I-a1_want_would_like_to':50,
}

params = {
    'data': {
        'text_field_name': 'sent',
        'label_field_name': 'named_entity',
        'path_to_dataset': './datascombined_24_02_2021.csv',
        'path_to_test_pred_scores': 'data/pred.txt'
    },
    'model': {
        'max_seq_length': 128,
        'model_name': 'distilbert-base-uncased',
        'num_classes': 70
    },
    'training': {
        'learn_rate': 3e-5,
        'num_epochs': 20,                          
        'accum_steps': 2,                         
        'batch_size': 2,
        'trashhold': 0.2,                          
        'log_dir': 'logdir' 
    }
}

SyntaxError: ignored

In [None]:
custom_conll_dataset = pd.read_csv('./datasets/present_simple_test_conll2003.csv', encoding='utf-8')
custom_conll_dataset.head(2)

Unnamed: 0,sent,named_entity
0,I started working on the application by contac...,O O O O O O O O O O O O O O O O O O O O B-a1_p...
1,"And Atlantic - Heydt , who 's the largest scaf...",O O O O O O O O O O O O O O O O O B-a1_present...


In [None]:
id2tag = {id: tag for tag, id in NER_TYPES.items()}

In [None]:
class CustomNERDataset(Dataset):
    def __init__(
        self,
        texts,
        labels = None,
        max_seq_length = 128,
        model_name = "distilbert-base-uncased",
    ):
        self.texts = texts
        self.labels = labels
        self.max_seq_length = max_seq_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # suppresses tokenizer warnings
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):

        # encoding the text
        x = self.texts[index]
        x = x.split(" ")

        output_dict = self.tokenizer(
            x,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
            is_split_into_words=True, 
            return_offsets_mapping=True, 
            # padding=True,
        )

        output_dict['input_ids'] = output_dict['input_ids'].squeeze(0)
        output_dict['attention_mask'] = output_dict['attention_mask'].squeeze(0)

        output_dict.pop("offset_mapping")

        # encoding target
        target = self.labels[index]
        target = [NER_TYPES.get(typ, 0) for typ in target.split()]
        word_ids = output_dict.word_ids()
        aligned_labels = [-100 if i is None else target[i] for i in word_ids]
        y_encoded = torch.Tensor(aligned_labels).long()
        # output_dict["targets"] = y_encoded 
        output_dict['labels'] = y_encoded

        return output_dict

In [None]:
def read_data(params, dataset):
    # dataset = pd.read_csv(params['data']['path_to_dataset'], encoding='utf-8')
    dataset = dataset.sample(frac=1)
    train, valid, test = np.split(dataset, 
               [int(.8*len(dataset)), int(.9*len(dataset))])
    # print(train.head())
    test.to_csv('ner-test.csv', encoding='utf-8', index=False)
    # creating PyTorch Datasets
    train_dataset = CustomNERDataset(
        texts=train[params["data"]["text_field_name"]].values.tolist(),
        labels=train[params["data"]["label_field_name"]].values.tolist(),
        max_seq_length=params["model"]["max_seq_length"],
        model_name=params["model"]["model_name"],
    )

    valid_dataset = CustomNERDataset(
        texts=valid[params["data"]["text_field_name"]].values.tolist(),
        labels=valid[params["data"]["label_field_name"]].values.tolist(),
        max_seq_length=params["model"]["max_seq_length"],
        model_name=params["model"]["model_name"],
    )

    test_dataset = CustomNERDataset(
        texts=test[params["data"]["text_field_name"]].values.tolist(),
        labels=test[params["data"]["label_field_name"]].values.tolist(),
        max_seq_length=params["model"]["max_seq_length"],
        model_name=params["model"]["model_name"],
    )

    train_val_loaders = {
        "train": train_dataset,
        "valid": valid_dataset,
    }

    test_loaders = {
        "test": test_dataset,
    }

    return train_val_loaders, test_loaders

In [None]:
train_val_loaders, test_loaders = read_data(params, custom_conll_dataset)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    warmup_steps=1,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    # logging_steps=1,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
)

# model = DistilBertForTokenClassification.from_pretrained(params["model"]["model_name"], num_labels=len(NER_TYPES))
model = AutoModelForTokenClassification.from_config(
    AutoConfig.from_pretrained(params["model"]["model_name"], num_labels=len(NER_TYPES))
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_val_loaders['train'],         # training dataset
    eval_dataset=train_val_loaders['valid'],             # evaluation dataset
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,No log,0.484599,0.0,0.0,0.0,0.903226,1.4185,1.41
2,No log,0.453999,0.0,0.0,0.0,0.887097,1.4117,1.417
3,No log,0.45369,0.0,0.0,0.0,0.854839,1.4079,1.421



Precision and F-score are ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



TrainOutput(global_step=24, training_loss=0.5742547512054443, metrics={'train_runtime': 117.745, 'train_samples_per_second': 0.204, 'total_flos': 2446542950400, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'epoch': 3.0,
 'eval_accuracy': 0.8548387096774194,
 'eval_f1': 0.0,
 'eval_loss': 0.453689843416214,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_runtime': 1.417,
 'eval_samples_per_second': 1.411}

# Predict

In [None]:


predictions, labels, _ = trainer.predict(test_loaders["test"])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [id2tag[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [id2tag[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'a1_present_simple_reg_act': {'f1': 0.0,
  'number': 2,
  'precision': 0.0,
  'recall': 0.0},
 'overall_accuracy': 0.9032258064516129,
 'overall_f1': 0.0,
 'overall_precision': 0.0,
 'overall_recall': 0.0}

In [None]:
# true_predictions[0], true_labels[0]

# Spacy ner Recognition

In [None]:
# !chmod +x ./install

In [None]:
import cupy
a = cupy.zeros((1,1))
a

array([[0.]])

In [None]:
!pip freeze | grep cupy

cupy-cuda101==7.4.0


In [3]:
!pip install -U pip setuptools wheel

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/fe/ef/60d7ba03b5c442309ef42e7d69959f73aacccd0d86008362a681c4698e83/pip-21.0.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 4.0MB/s 
[?25hRequirement already up-to-date: setuptools in /usr/local/lib/python3.7/dist-packages (53.0.0)
Requirement already up-to-date: wheel in /usr/local/lib/python3.7/dist-packages (0.36.2)
Installing collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0.1


In [4]:
!pip install -U spacy[cuda101,transformers,lookups]

Collecting spacy[cuda101,lookups,transformers]
  Downloading spacy-3.0.3-cp37-cp37m-manylinux2014_x86_64.whl (12.7 MB)
[K     |████████████████████████████████| 12.7 MB 235 kB/s 
Collecting catalogue<2.1.0,>=2.0.1
  Downloading catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.3-cp37-cp37m-manylinux2014_x86_64.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 51.5 MB/s 
Collecting thinc<8.1.0,>=8.0.0
  Downloading thinc-8.0.1-cp37-cp37m-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 48.1 MB/s 
Collecting spacy-legacy<3.1.0,>=3.0.0
  Downloading spacy_legacy-3.0.1-py2.py3-none-any.whl (7.0 kB)
Collecting pathy
  Downloading pathy-0.4.0-py3-none-any.whl (36 kB)
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Collecting srsly<3.0.0,>=2.4.0
  Downloading srsly-2.4.0-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)
[K     |████████████████████████████████| 4

In [5]:
!python -m spacy download en_core_web_trf

2021-02-24 14:19:03.461397: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting en-core-web-trf==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl (459.7 MB)
[K     |████████████████████████████████| 459.7 MB 17 kB/s 
Installing collected packages: en-core-web-trf
Successfully installed en-core-web-trf-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


In [None]:
!python -m spacy download en_core_web_sm

2021-02-24 11:15:01.470804: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 76 kB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy
from spacy.tokens import Doc, DocBin
spacy.require_gpu(0)

True

In [None]:
import cupy
cupy.dot

<function cupy.linalg.product.dot>

In [None]:
!pip freeze | grep spacy 

en-core-web-trf @ https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.0.0/en_core_web_trf-3.0.0-py3-none-any.whl
spacy==3.0.3
spacy-alignments==0.7.2
spacy-legacy==3.0.1
spacy-lookups-data==1.0.0
spacy-transformers==1.0.1


In [None]:
!pip freeze | grep cupy

cupy-cuda111==8.4.0


In [None]:
!ls ./spacy_ner/

base_config.cfg
config.cfg
export-2021-02-15T09_52_22.918Z_conll2003_dev.spacy
export-2021-02-15T09_52_22.918Z_conll2003_train.spacy
model-best
model-last


In [13]:
!python -m spacy init fill-config ./spacy_ner/base_config.cfg ./spacy_ner/config.cfg

2021-02-24 15:35:18.634786: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
spacy_ner/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
!python -m spacy convert ./datasets/combined_24_02_2021_conll2003_train.conll spacy_ner --converter ner

2021-02-24 15:35:26.326714: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;3m⚠ Document delimiters found, automatic document segmentation with `-n`
disabled.[0m
[38;5;2m✔ Generated output file (1 documents):
spacy_ner/combined_24_02_2021_conll2003_train.spacy[0m


In [15]:
!python -m spacy convert ./datasets/combined_24_02_2021_conll2003_dev.conll spacy_ner --converter ner

2021-02-24 15:35:32.378197: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[38;5;4mℹ Auto-detected token-per-line NER format[0m
[38;5;3m⚠ Document delimiters found, automatic document segmentation with `-n`
disabled.[0m
[38;5;2m✔ Generated output file (1 documents):
spacy_ner/combined_24_02_2021_conll2003_dev.spacy[0m


In [None]:
# !nvidia-smi

In [16]:
!python -m spacy train ./spacy_ner/config.cfg --output ./spacy_ner --paths.train ./spacy_ner/combined_24_02_2021_conll2003_train.spacy \
--paths.dev ./spacy_ner/combined_24_02_2021_conll2003_dev.spacy -g 0

2021-02-24 15:36:21.580874: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
[38;5;4mℹ Using GPU: 0[0m
[1m
Set up nlp object from config
Pipeline: ['transformer', 'ner']
Created vocabulary
Finished initializing nlp object
Initialized pipeline components: ['transformer', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 3.0000000000000004e-05[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0         477.12    962.83    0.33    0.18    1.97    0.00
 13     100       88878.35  43087.48    0.00    0.00    0.00    0.00
 26     200      195700.82  34385.64    1.77    8.57    0.99    0.02
 39     300       19651.58  19178.59   28.57   29.58   27.63    0.29
 53     400       32618.94  13145.15    1.88   20.00    0.99    0.02
 66     500       31

In [None]:
import spacy
from spacy import displacy

nlp = spacy.load("./spacy_ner/model-last/")
normal_npl = spacy.load("en_core_web_sm")

In [None]:
TEST_TEXT = """I'm working in London for the next two weeks.
""" 
all_ents = nlp._meta['labels']['ner']
len(all_ents)

23

In [None]:
base_colors = [
              #  '#fd7b2a',
              #  "#117caa",
              #  "#90c23c",
              #  "#f4db5e",
              #  "#879bbd",
              #  "#189cff",
              #  "#b5b6bb",
              #  "#feb4a9",
              #  "#ffb672"
]
base_color = "#e9f0fb"


# colors = {"a1_present_continuous_act_rn": "#bfe1d9"}
# options = {"ents": ["a1_present_continuous_act_rn"], "colors": colors}

colors = {}
options = {"ents": [], "colors": {}}
options['ents'] = all_ents

for ent in all_ents:
  colors[str(ent)] = base_color

options['colors'] = colors

In [None]:
# options

In [None]:
texts = [
         'There are other ways that I am using self - care to tend to my mental health.',
         "What's more, essential workers are risking their lives so that we can have our necessities.",
         'While all of this is happening, the luxury travel market is growing and becoming even more exclusive than it already was.',
         "That means he's always swinging in a hammock and waiting for his phone to go off.",
         "A stressed-out lady, Lisa, is pulling at her hair.",
         "If you are already blogging, affiliate marketing can complement your blog strategy.",
         "I am having a lesson right now, so I can't give you a sentence",
         "You're eating your braid again, she said, stop it.",
         "Gross, I said, it's like they're growing wings.",
         "The thing is, I might even like what he's saying and maybe even agree with him.",
         "Sometimes his mouth moves like he's talking to someone, but there's no one else there.",
         "He's just sitting there outside her house in normal civilian clothes, all alone.",
         "He doesn't know where he's going; he just goes.",
         "He's still smiling, looking at Sean with kind, knowing eyes.",
         "He follows Joey into his apartment, past a living area where two other guys are watching TV.",
         "Sean accidentally shoulders the elevator wall as he's walking out, but doesn't acknowledge it.",
         "He feels comfortable giving away his address, whereas Sean would never dare give him his.",
         "It's the same address from the police files, just two blocks away.",
         "He's smiling in the picture, even though you're not supposed to do that for ID photos.",
         'When her friends asked her about it, she told them that she was having trouble with some "horse syndicate" people.',
         "In a letter addressed to her husband, Renee wrote that she was unhappy in her marriage and was contemplating getting a divorce.",
         "Terry was coming toward me, a huge grin plastered on his face - as he pumped his legs on his unicycle.",
         "We met at Piedmont Park, where Terry had suggested we could ride bikes.",
         "What I had feared ended up coming true - my hair smelled like Terry's discount ham for weeks.",
         "Perseverance has a large amount of data in its memory banks which it is gradually offloading to Earth.",
         "Nasa is promising more in the next few days.",
         "It shows the robot heading down to the ground on Thursday to make its landing.",
         "It was acquired by the rocket cradle that placed the vehicle on the surface.",
         "Perseverance has been put in a near-equatorial Martian crater known as Jezero where it will search for signs of past microbial life.",
         "You can see the dust kicked up by the engines.",
         "We're probably about 2m or so above the surface of Mars.",
         "And then the curly electrical umbilical that is taking all of the electrical signals from the descent stage down to the computer inside the belly of the rover, the ones and zeros that represent this image.",
         "Engineers report Perseverance to be in good health, as they gradually commission its systems.",
         "Even now, with just this limited first release of pictures, there were fascinating rocks to discuss, she told reporters.",
         "The $2.7bn (£1.9bn) robot is the fifth rover to be put on Mars by Nasa.",
         "As well as searching for signs of life, Perseverance's other key objective is to select and package rock samples that can be brought back to Earth laboratories by later missions."
]

for text in texts:
  doc  = nlp(text)
  
  doc.ents = (ent for ent in doc.ents if len(str(ent)) > 2)
  if len(doc.ents) > 0:
    displacy.render(doc,style='ent',jupyter=True, options=options)
  else:
    print(doc)

It was acquired by the rocket cradle that placed the vehicle on the surface.


In [None]:
doc.ents


In [None]:
 for i, p in enumerate(doc):
    if i == 0:
        settings = p.get("settings", {})
        self.direction = settings.get("direction", DEFAULT_DIR)
        self.lang = settings.get("lang", DEFAULT_LANG)
    render_id = f"{id_prefix}-{i}"
    svg = self.render_svg(render_id, p["words"], p["arcs"])
    rendered.append(svg)

In [None]:
vars(doc)

TypeError: ignored

In [None]:
text = """
It shows the robot heading down to the ground on Thursday to make its landing. It was acquired by the rocket cradle that placed the vehicle on the surface.
Perseverance has a large amount of data in its memory banks which it is gradually offloading to Earth.
Among other pictures is a view from a satellite that captures the rover in the parachute phase of its descent.
This also represents an immense technical achievement because the satellite - the Mars Reconnaissance Orbiter - was approximately 700km from Perseverance at the time and traveling at about 3km/s.
Nasa is promising more in the next few days.
This offering will include short movies shot during the Entry, Descent and Landing (EDL) sequence - with sound.
Perseverance has been put in a near-equatorial Martian crater known as Jezero where it will search for signs of past microbial life.
"""

for item in nlp(text).sents:
  item = str(item)
  doc  = nlp(item)
  
  doc.ents = (ent for ent in doc.ents if len(str(ent)) > 2)
  if len(doc.ents) > 0:
    displacy.render(doc,style='ent',jupyter=True, options=options)
  else:
    print(doc)

ValueError: ignored

In [None]:
import re
def pretty_text(dirty_text):
    bad_chars = {
        "’":"'",
        "‘": "'",
        "“": ' ',
        "”": ' ',
        "—": "-",
        "…": "...",
        "–": "-",
        '"': " ",
        '[': "",
        "]": "",
        '(Laughter)': ' ',
        '(Applause)': ' ',
        '--': '-',
        "&gt;": ''
    }
    new_text = str(dirty_text).strip()
    for bad_char in bad_chars:
        new_text = new_text.replace(bad_char, bad_chars[bad_char])
                
    _replace_whitespace_ = re.compile(r"\s+")
    new_text = _replace_whitespace_.sub(" ", new_text).strip()

    _replace_multiple_whitespaces = re.compile(r'\s{1,}')
    new_text = _replace_multiple_whitespaces.sub(" ", new_text).strip()

    return new_text

In [None]:
!touch ./datasets/medium_article.txt 

In [None]:
medium_article = normal_npl(pretty_text(open("./datasets/medium_article.txt", encoding='utf-8').read()))

In [None]:
some = """<body style="font-size: 16px; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; padding: 4rem 2rem; direction: ltr">"""
my_html = open("index.html", 'w', encoding='utf-8')
my_html.write(str(some))

for item in medium_article.sents:
  item = str(item)
  doc  = nlp(item)
  
  doc.ents = (ent for ent in doc.ents if len(str(ent)) > 2)
  if len(doc.ents) > 0:
    # displacy.render(doc,style='ent',jupyter=True, options=options)
    html = str(displacy.render(doc,style='ent',page=True, options=options)).replace("<!DOCTYPE html>", " ").replace("</html>", " ").replace("</body>", " ").replace('<html lang="en">', " ").replace(some,'')
    my_html.write(html)
  # else:
  #   print(doc)
my_html.close()