# Python Notebook for demonstrating NER using Linguistic Structures

### 1. Import required module from the package

In [1]:
# Import the class from the package
from linguisticstructure_ner import LingusticStructureforNER
import pprint

### 2. Create module object, initialize required files. 
### The project required GloVE embeddings which can be downloaded as pretrained vectors. The code also requires Senna embeddings.

In [2]:
linguistic_ner =  LingusticStructureforNER()

# Give the necessary file inputs, embeddings required by the module
embeddings = {}
senna_hash_path = 'resources'
glove_path = 'resources/test_embeddings.txt'
embeddings['glove_path'] = glove_path
embeddings['senna_hash_path'] = senna_hash_path

#input_file is the input file in the DITK.NER format
input_file = 'resources/ner_test_input.txt' 
file_dict = {"train": input_file, "test": input_file, "dev": input_file}

### 3. Read the dataset from the initialized location. The data should be in DITK.NER format

In [3]:
# Initialize the module by reading the data. Returns a file dict in the same format as above
data = linguistic_ner.read_dataset(file_dict, embeddings=embeddings)

extract_glove_embeddings()... 63 pre-trained words
get_tree_data()... 8 sentences
get_tree_data()... 8 sentences
get_tree_data()... 8 sentences


In [4]:
pprint.pprint(data)

{'dev': [['Yes',
          'UH',
          '(TOP(S(INTJ*)',
          'O',
          'bc/cnn/00/cnn_0003',
          '0',
          '0',
          '-',
          '-',
          '-',
          'Linda_Hamilton',
          '*',
          '-'],
         ['they',
          'PRP',
          '(NP*)',
          'O',
          'bc/cnn/00/cnn_0003',
          '0',
          '1',
          '-',
          '-',
          '-',
          'Linda_Hamilton',
          '*',
          '(15)'],
         ['did',
          'VBD',
          '(VP*)',
          'O',
          'bc/cnn/00/cnn_0003',
          '0',
          '2',
          'do',
          '01',
          '-',
          'Linda_Hamilton',
          '(V*)',
          '-'],
         ['/.',
          '.',
          '*))',
          'O',
          'bc/cnn/00/cnn_0003',
          '0',
          '3',
          '-',
          '-',
          '-',
          'Linda_Hamilton',
          '*',
          '-'],
         [],
         ['and',
          'CC',
       

           'company',
           '-',
           '1',
           'Linda_Hamilton',
           '*',
           '*)',
           '-'],
          ['that',
           'WDT',
           '(SBAR(WHNP*)',
           'O',
           'bc/cnn/00/cnn_0003',
           '0',
           '7',
           '-',
           '-',
           '-',
           'Linda_Hamilton',
           '*',
           '(R-ARG1*)',
           '-'],
          ['approached',
           'VBD',
           '(S(VP*',
           'O',
           'bc/cnn/00/cnn_0003',
           '0',
           '8',
           'approach',
           '01',
           '2',
           'Linda_Hamilton',
           '*',
           '(V*)',
           '-'],
          ['me',
           'PRP',
           '(NP*))))))',
           'O',
           'bc/cnn/00/cnn_0003',
           '0',
           '9',
           '-',
           '-',
           '-',
           'Linda_Hamilton',
           '*)',
           '(ARG2*)',
           '(76)'],
          ['/.',
           '

            'medicine',
            '-',
            '1',
            'Linda_Hamilton',
            '*',
            '(ARG1*',
            '-'],
           ['or',
            'CC',
            '*',
            'O',
            'bc/cnn/00/cnn_0003',
            '0',
            '6',
            '-',
            '-',
            '-',
            'Linda_Hamilton',
            '*',
            '*',
            '-'],
           ['pharmaceuticals',
            'NNS',
            '*))))',
            'O',
            'bc/cnn/00/cnn_0003',
            '0',
            '7',
            '-',
            '-',
            '-',
            'Linda_Hamilton',
            '*',
            '*)',
            '-'],
           ['/.',
            '.',
            '*))',
            'O',
            'bc/cnn/00/cnn_0003',
            '0',
            '8',
            '-',
            '-',
            '-',
            'Linda_Hamilton',
            '*',
            '*',
            '-'],
           [],
       

### 4. Train the model with required epoches (Suggested 100)

In [5]:
# Next step is to train the model
# max_epoches limits the model training to limited epoches.
linguistic_ner.train(data, max_epoches=1)

extract_glove_embeddings()... 63 pre-trained words
get_tree_data()... 8 sentences
get_tree_data()... 8 sentences
get_tree_data()... 8 sentences
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "



<Epoch 1>
[train] average loss 57.007; elapsed 4s                         
[dev] precision=0.0% recall=0.0% f1=0.000%; elapsed 1s;
best

<Best Epoch 1>
[train] average loss 57.007
[dev] precision=0.0% recall=0.0% f1=0.000%


### 5. Get the predictions from the trained model on the test data

In [6]:
# After the model is trained, predict on the test dataset.
# Returns a list of predicted word, entity type pairs
predictions = linguistic_ner.predict(data['test'])

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from models/ontonotes.model
Read ontonotes\pos.txt... 79 lines
Read ontonotes\ne.txt... 18 lines
get_tree_data()... 8 sentences


In [7]:
pprint.pprint(predictions)

[(None, None, 'Yes', 'B-WORK_OF_ART'),
 (None, None, 'they', 'I-WORK_OF_ART'),
 (None, None, 'did', 'I-WORK_OF_ART'),
 (None, None, '/.', 'I-WORK_OF_ART'),
 (),
 (None, None, 'and', 'B-PERSON'),
 (None, None, 'they', 'I-PERSON'),
 (None, None, 'were', 'I-PERSON'),
 (None, None, 'not', 'I-PERSON'),
 (None, None, 'the', 'I-PERSON'),
 (None, None, 'first', 'I-PERSON'),
 (None, None, 'company', 'I-PERSON'),
 (None, None, 'that', 'I-PERSON'),
 (None, None, 'approached', 'I-PERSON'),
 (None, None, 'me', 'I-PERSON'),
 (None, None, '/.', 'I-PERSON'),
 (),
 (None, None, 'but', 'B-WORK_OF_ART'),
 (None, None, 'I', 'I-WORK_OF_ART'),
 (None, None, 'am', 'I-WORK_OF_ART'),
 (None, None, 'not', 'I-WORK_OF_ART'),
 (None, None, 'selling', 'I-WORK_OF_ART'),
 (None, None, 'medicine', 'I-WORK_OF_ART'),
 (None, None, 'or', 'I-WORK_OF_ART'),
 (None, None, 'pharmaceuticals', 'I-WORK_OF_ART'),
 (None, None, '/.', 'I-WORK_OF_ART'),
 (),
 (None, None, 'I', 'B-WORK_OF_ART'),
 (None, None, "'m", 'I-WORK_OF_ART'),

### 6. Evaluate the model using the true labels from the model

In [8]:
# The convert ground truth converts the actual tags into a format required by evaluate.
# The evaluate method takes the predicted tags, converted actual/true tags for evaluation. 
# It returns a precision, recall, f1 for the model.
actual_tags = linguistic_ner.convert_ground_truth(data['test'])
precision, recall, f1 = linguistic_ner.evaluate(predictions, actual_tags)

In [9]:
print('Precision:', precision, 'Recall:', recall, 'F1:', f1)

Precision: 1.3157894736842104 Recall: 100.0 F1: 2.5974025974025974


### 7. Optionally, the model can be stored and reused as follows

    The load and save model take a directory as input which can be used to store and load the  model from external sources.

In [10]:
# Optionally, a model can be stored and restored as well. The location is optional. The model will be stored in internal path if not supplied.
linguistic_ner.save_model('D:/Test')
linguistic_ner.load_model('D:/Test')
# Continue with other options (Predict, Evaluate)

INFO:tensorflow:Restoring parameters from D:/Test/ontonotes.model
