# Named Entity Recognition (NER) by Machine learning 

In the following notebook, you can find a simple implementation of named entity recognition by machine learning

## Import the required tools 

In [26]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline

from itertools import chain
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite  # to install this package in windows with pip please run: 'pip install sklearn-crfsuite'
from sklearn_crfsuite import metrics, scorers
import nltk
from sklearn.model_selection import train_test_split

In [4]:
dfExoplanetsNASA = pd.read_json('./data/dfExoplanetsNASAdetected100rand_v3.json', orient = 'table')

In [5]:
dfExoplanetsNASA.head()

Unnamed: 0,absnum,sent,tag,pos,dep,lemma,label
0,159,This disfavours the possibility of GI-caused s...,"[DT, VBZ, DT, NN, IN, NNP, HYPH, VBN, JJ, NN, ...","[DET, VERB, DET, NOUN, ADP, PROPN, PUNCT, VERB...","[nsubj, ROOT, det, dobj, prep, npadvmod, punct...","[this, disfavour, the, possibility, of, GI, -,...",
1,550,Brigham Young University (BYU) has been assist...,"[NNP, NNP, NNP, -LRB-, NNP, -RRB-, VBZ, VBN, V...","[PROPN, PROPN, PROPN, PUNCT, PROPN, PUNCT, VER...","[compound, compound, nsubj, punct, appos, punc...","[Brigham, Young, University, (, BYU, ), have, ...",
2,999,Shadows have been detected in several protopla...,"[NNS, VBP, VBN, VBN, IN, JJ, JJ, NNS, ,, VBG, ...","[NOUN, VERB, VERB, VERB, ADP, ADJ, ADJ, NOUN, ...","[nsubjpass, aux, auxpass, ROOT, prep, amod, am...","[shadow, have, be, detect, in, several, protop...",
3,1388,We found that about half the stars are too fai...,"[PRP, VBD, IN, IN, PDT, DT, NNS, VBP, RB, JJ, ...","[PRON, VERB, ADP, ADP, DET, DET, NOUN, VERB, A...","[nsubj, ROOT, mark, quantmod, nummod, det, nsu...","[-PRON-, find, that, about, half, the, star, b...",discovery
4,1468,We detected a p.d.f. pattern that represents a...,"[PRP, VBD, DT, NN, ., NN, WDT, VBZ, DT, NN, VB...","[PRON, VERB, DET, NOUN, PUNCT, NOUN, DET, VERB...","[nsubj, ROOT, det, amod, punct, dobj, nsubj, r...","[-PRON-, detect, a, p.d.f, ., pattern, that, r...",discovery


In [28]:
def word2features(lemma, tag, pos, dep, i):
    word = lemma[i]
    postag = pos[i]
    tagtag = tag[i]
    deptag = dep[i]
    

    features = {
        'word': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag': postag,
        'postag[:2]': postag[:2],
        'tag': tagtag,
        'tag[:2]': tagtag[:2],
        'dep': deptag,
        'dep[:2]': deptag[:2],
        
    }
    if i > 0:
        word1 = lemma[i-1]
        postag1 = pos[i-1]
        tagtag1 = tag[i-1]
        deptag1 = dep[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:tag': tagtag1,
            '-1:tag[:2]': tagtag1[:2],
            '-1:dep': deptag1,
            '-1:dep[:2]': deptag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(lemma)-1:
        word1 = lemma[i+1]
        postag1 = pos[i+1]
        tagtag1 = tag[i+1]
        deptag1 = dep[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:tag': tagtag1,
            '+1:tag[:2]': tagtag1[:2],
            '+1:dep': deptag1,
            '+1:dep[:2]': deptag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(j):
    return [word2features(dfExoplanetsNASA.lemma[j], dfExoplanetsNASA.tag[j], dfExoplanetsNASA.pos[j], dfExoplanetsNASA.dep[j], i) for i in range(len(dfExoplanetsNASA.lemma[j]))]

def sent2labels(j):
    return dfExoplanetsNASA.label[j]

In [29]:
sent2features(0)[0]

{'word': 'this',
 'word[-3:]': 'his',
 'word[-2:]': 'is',
 'postag': 'DET',
 'postag[:2]': 'DE',
 'tag': 'DT',
 'tag[:2]': 'DT',
 'dep': 'nsubj',
 'dep[:2]': 'ns',
 'BOS': True,
 '+1:word.lower()': 'disfavour',
 '+1:postag': 'VERB',
 '+1:postag[:2]': 'VE',
 '+1:tag': 'VBZ',
 '+1:tag[:2]': 'VB',
 '+1:dep': 'ROOT',
 '+1:dep[:2]': 'RO'}

In [31]:
%%time
X_train, X_test, y_train, y_test = train_test_split([sent2features(s) for s in range(len(dfExoplanetsNASA))], [sent2labels(s) for s in range(len(dfExoplanetsNASA))], random_state = 0)

Wall time: 223 ms


In [32]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

ValueError: The numbers of items and labels differ: |x| = 11, |y| = 9

In [33]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

NameError: name 'crf' is not defined

In [34]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

NameError: name 'labels' is not defined