# Named Entity Recognition (NER) by Machine learning 

This notebook contains a simple implementation of named entity recognition (NER) using machine learning.

## Import the required tools 

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import collections
import os
from itertools import permutations
from sklearn.metrics import make_scorer,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite  # to install this package in windows with pip please run: 'pip install sklearn-crfsuite'
from sklearn_crfsuite import metrics, scorers
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import eli5  # to install this package in windows with conda please run: 'conda install -c conda-forge eli5'

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gerdgrasshoff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gerdgrasshoff/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


ImportError: cannot import name 'RandomizedLogisticRegression'

Please download the data file '01_df_v013.pickle' from GitHub to your local machine. It is located in this directory: vorlesung2019/notebooks/yeghaneh/data

In [None]:
importVersion = '013'  # Version tag of the pickled data set to load (this notebook works on version 013 of the data).

In [None]:
path= '../data/01_df_v{0}.pickle'.format(importVersion)  # Location of the pickle file on your local machine. If you write a Windows path with backslashes, prefix the string with "r" (raw string).

### Let's use data "01_df_v013" to build a NER system. 
The data set 01_df_v013 is an annotated (labeled) version of the English translation of Kepler's well-known book New Astronomy (Latin: Astronomia nova). You can find it in the repository.


### Read the data 

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a source you trust.
dfAstroNova = pd.read_pickle(path)# The data is saved as a pickle file. We read it back as a pandas DataFrame.
type(dfAstroNova)  

In [None]:
 # Sort the data based on the chapters of the book 
# Map the textual chapter label "appendix b" to NaN so that the column can be
# cast to float and sorted numerically (NaN rows are placed last by default).
dfAstroNova['chapter'] = dfAstroNova['chapter'].replace("appendix b", np.nan).astype(float)
# Rebind the sorted frame instead of sorting in place.
dfAstroNova = dfAstroNova.sort_values(by='chapter')
# Restore the textual label. The result is assigned back explicitly:
# `dfAstroNova.chapter.fillna(..., inplace=True)` works on an intermediate
# Series, which pandas warns about and which does not update the frame under
# Copy-on-Write. Casting to object first lets the string live next to floats.
dfAstroNova['chapter'] = dfAstroNova['chapter'].astype(object).fillna('appendix b')

In [None]:



len(dfAstroNova)   # Number of rows.

In [None]:
dfAstroNova.head(5)  # First five rows of the frame (sorted by 'chapter' above).

In [None]:
dfAstroNova[740:745]  # Positional slice: rows 740-744 of the sorted frame.

In [None]:
dfAstroNova[1585:1590]  # Positional slice: rows 1585-1589 of the sorted frame.

In [None]:
dfAstroNova.tail(5)  # Last five rows of the sorted frame.

### Primary Data Exploration

In [None]:
dfAstroNova["text"][5]    # Raw text of one row. NOTE(review): on an integer index `[5]` is a label lookup, not "the 5th row by position" -- confirm the index type.

In [None]:
dfAstroNova["tagged"][5] # The same row as (token, label) tuples, nested as a list of lists of tuples. A part-of-speech (POS) tag is added to each tuple later on. The tokens were chosen beforehand by some rules.

In [None]:
 # Flatten the outer level: every row of 'tagged' is a list of sentences, so
 # chaining the rows gives one flat list of sentences.
 # A comprehension is linear; sum(lists, []) re-copies the growing result at
 # every step and is quadratic in the number of sentences.
 df = [sentence for row in dfAstroNova['tagged'] for sentence in row]
 len(df)

In [None]:
df[5]  # One element of the flattened list: a single sentence as a list of (token, label) tuples.

In [None]:
# Flatten the second level: chain all sentences into one list of
# (token, label) tuples. The comprehension is linear, whereas sum(df, [])
# copies the accumulated list once per sentence (quadratic).
df_flatten = [pair for sentence in df for pair in sentence]


In [None]:
df_flatten[0:20]  # The first 20 (token, label) pairs.

In [None]:
len(df_flatten) # Number of all tokens (roughly speaking, all words or entities).

In [None]:
tokens = [x[0] for x in df_flatten] # First component of every pair: the tokens, used for the frequency plots below.

In [None]:
# Second component of every pair: the labels, used for the label-distribution plots below.
# NOTE: the name `labels` is rebound to the CRF class list in the evaluation section,
# so re-run this cell before re-running the exploration cells after training.
labels=([x[1] for x in df_flatten])

In [None]:
tokens[0:5]

In [None]:
labels[0:5]

In [None]:
# Frequency plot of the 100 most common tokens. Words such as 'the', 'of', ...
# dominate the text; they are normally considered noise and are dealt with below.
plt.figure(figsize=(25, 5))
freqdist = nltk.FreqDist(tokens)
freqdist.plot(100, title='Frequency plot of words')


In [None]:
stop_words=["the", "of", "is", "to", "and","in", "be", "at", "that", "from", "it", "a", "as","this", "was" ] # Hand-picked stop-word candidates. NOTE: this list is not used by the cells below; they filter with the NLTK stop-word list instead.

In [None]:
# nltk stop words
from nltk.corpus import stopwords
# The 'stopwords' corpus is not among the downloads at the top of the notebook;
# fetch it here so that a fresh environment does not fail with a LookupError.
nltk.download('stopwords', quiet=True)
stop_words_nltk=set(stopwords.words('english'))

# List of nltk stop words {‘ourselves’, ‘hers’, ‘between’, ‘yourself’, ‘but’, ‘again’, ‘there’, ‘about’, ‘once’, ‘during’, ‘out’, ‘very’, ‘having’, ‘with’, ‘they’, ‘own’, ‘an’, ‘be’, ‘some’, ‘for’, ‘do’, ‘its’, ‘yours’, ‘such’, ‘into’, ‘of’, ‘most’, ‘itself’, ‘other’, ‘off’, ‘is’, ‘s’, ‘am’, ‘or’, ‘who’, ‘as’, ‘from’, ‘him’, ‘each’, ‘the’, ‘themselves’, ‘until’, ‘below’, ‘are’, ‘we’, ‘these’, ‘your’, ‘his’, ‘through’, ‘don’, ‘nor’, ‘me’, ‘were’, ‘her’, ‘more’, ‘himself’, ‘this’, ‘down’, ‘should’, ‘our’, ‘their’, ‘while’, ‘above’, ‘both’, ‘up’, ‘to’, ‘ours’, ‘had’, ‘she’, ‘all’, ‘no’, ‘when’, ‘at’, ‘any’, ‘before’, ‘them’, ‘same’, ‘and’, ‘been’, ‘have’, ‘in’, ‘will’, ‘on’, ‘does’, ‘yourselves’, ‘then’, ‘that’, ‘because’, ‘what’, ‘over’, ‘why’, ‘so’, ‘can’, ‘did’, ‘not’, ‘now’, ‘under’, ‘he’, ‘you’, ‘herself’, ‘has’, ‘just’, ‘where’, ‘too’, ‘only’, ‘myself’, ‘which’, ‘those’, ‘i’, ‘after’, ‘few’, ‘whom’, ‘t’, ‘being’, ‘if’, ‘theirs’, ‘my’, ‘against’, ‘a’, ‘by’, ‘doing’, ‘it’, ‘how’, ‘further’, ‘was’, ‘here’, ‘than’} 


In [None]:
# Words like 'the', 'of', ... are treated as noise and left out of the next
# plot; `tokens` itself is not modified.
# The NLTK stop words are all lower-case, so the lower-cased token is compared:
# otherwise capitalised forms such as 'The' would pass the filter.
new_tokens = [w for w in tokens if w.lower() not in stop_words_nltk]
        

  


In [None]:
# Frequency plot of the 100 most common tokens that remain after stop-word removal
plt.figure(figsize=(25, 5))
freqdist = nltk.FreqDist(new_tokens)
freqdist.plot(100, title='Frequency plot of words after removing noise')



In [None]:
# Lexical dispersion plot: for each search word, show at which positions of the
# token sequence it occurs. We concentrate on some possible causal words.
mytext = nltk.Text(tokens) 
mytext.dispersion_plot(["since", "as", "for", "because", "cause", "raise", "affects"])



In [None]:
# A measure of the lexical richness of the text
from __future__ import division  
def lexical_diversity(text):
    """Return the average number of occurrences per distinct item of ``text``.

    The value is ``len(text) / len(set(text))``: 1.0 means that no item is
    repeated, larger values mean more repetition.

    Raises ZeroDivisionError for an empty ``text``.
    """
    distinct_items = set(text)
    return len(text) / len(distinct_items)

In [None]:
# Pass the nltk.Text object itself. The former call passed the string literal
# "mytext", i.e. it measured the six characters m-y-t-e-x-t instead of the corpus.
lexical_diversity(mytext)

In [None]:
unique_tokens=sorted(set(mytext)) # Vocabulary: the distinct tokens of the text in sorted order.

In [None]:
print(unique_tokens[0:5] , unique_tokens[5000:5005])  # Samples of the unique tokens. Note that sometimes two numbers are treated as one token!

In [None]:
len(unique_tokens)  # Vocabulary size; reuses the sorted vocabulary computed above instead of rebuilding and re-sorting it.

In [None]:
# Occurrence count of each candidate causal word, in the order used for the bar chart below.
causal_words = [
    mytext.count(word)
    for word in ("since", "as", "for", "because", "cause", "raise", "affects")
]

In [None]:
# Bar chart of the counts in `causal_words`; the x labels must stay in the
# same order as the counts were collected.
bar_list=plt.bar(["since", "as" , "for", "because", "cause", "raise","affects"],causal_words) 
plt.title('Frequency distribution of some possible causal words')  # fixed the misspelled "Frequeny"
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()

### Distribution of the different labels

In [None]:
from collections import Counter

# Count how often every label occurs among the tokens.
a = Counter(labels)
print(a)
# Dictionary views on the counter; they reflect later changes made to `a`.
keys, values = a.keys(), a.values()

In [None]:
# Frequency distribution of all the labels
bar_list=plt.bar(a.keys(),a.values()) 
bar_list[0].set_color('r')  # highlight the first bar in red
plt.title('Frequency distribution of all labels')  # fixed the misspelled "Frequeny"
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Drop the 'None' class so that the remaining labels become visible.
# pop() with a default keeps this cell re-runnable: `del a['None']` raised a
# KeyError when the cell was executed a second time.
a.pop('None', None)
bar_list=plt.bar(a.keys(),a.values() ) 
bar_list[2].set_color('g')  # highlight the third bar in green
plt.title('Frequency distribution of the known labels')  # fixed the misspelled "Frequeny"
plt.xlabel('Labels')
plt.ylabel('Frequency')
plt.show()

In [None]:
def pre_process_2(s):
    """Run ``nltk.pos_tag`` on the token sequence ``s`` and return its result."""
    return nltk.pos_tag(s)

In [None]:
(keys,values)  # Label names and their counts (views on the counter `a`).

In [None]:
token_tags = pre_process_2(tokens)  # POS-tag the whole token sequence in one call; the result is unpacked as (token, POS tag) pairs below.


In [None]:
data=[(i, j, k) for ((i, j), k) in zip(token_tags, labels)]  # Combine into (token, POS tag, label) triples.

In [None]:
data[1:3]   # This is very similar to the nltk data format, but the CRF classifier needs the data grouped into sentences (a list per sentence).


### Make the data consistent with the NLTK data format

In [None]:
b=df[1500:1503] # Three sample sentences of the data, still without POS tags. They need to be brought into the same format as typical nltk data.
b


In [None]:
# Insert the POS tag of each token as the second component of its tuple:
# (token, label) -> (token, POS tag, label).
position_to_add = 1  # index at which the POS tag is inserted
result = []
for sentence in df:
    # Tag the sentence as a whole. The former code called the tagger once per
    # token on a one-word list, which hides the neighbouring words from the
    # tagger and costs one tagger call per token.
    words = [pair[0] for pair in sentence]
    pos_tags = [tag for _, tag in nltk.pos_tag(words)]
    result.append([
        (*pair[:position_to_add], tag, *pair[position_to_add:])
        for pair, tag in zip(sentence, pos_tags)
    ])

In [None]:
result[0:2] # The data now has almost the same format as an nltk data set: sentences of (token, POS tag, label) tuples.

In [None]:
len(result)  # Number of sentences.

In [None]:
result[0:3]  # The first three sentences.

### Train and test split

In [None]:
# Sequential split: the first 4000 sentences form the training set (intended
# as about two thirds of the data -- compare with len(result) above) and all
# remaining sentences form the test set.
# The test slice starts exactly where the training slice ends; the former
# `result[4001:]` silently dropped sentence 4000 from both sets.
n_train_sents = 4000
train_sents=result[:n_train_sents]
test_sents=result[n_train_sents:]

### Features 

Next, define some features. In this example we use word identity, word suffix, word shape and word POS tag; also, some information from nearby words is used.

This makes a simple baseline, but you certainly can add and remove some features to get (much?) better results - experiment with it.

sklearn-crfsuite (and python-crfsuite) supports several feature formats; here we use feature dicts.

In [None]:
def word2features(sent, i):
    """Build the feature dict for the item at position ``i`` of ``sent``.

    Every item of ``sent`` is indexed as ``item[0]`` (the word, a string) and
    ``item[1]`` (its POS tag, a string).  The dict holds features of the word
    itself plus features of the previous and next word; at the sentence
    boundaries the neighbour features are replaced by ``'BOS'`` / ``'EOS'``.
    """
    word, postag = sent[i][0], sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    # Left context: beginning-of-sentence marker or the previous word.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_word, prev_postag = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_word.lower()
        features['-1:word.istitle()'] = prev_word.istitle()
        features['-1:word.isupper()'] = prev_word.isupper()
        features['-1:postag'] = prev_postag
        features['-1:postag[:2]'] = prev_postag[:2]

    # Right context: end-of-sentence marker or the next word.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_word, next_postag = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_word.lower()
        features['+1:word.istitle()'] = next_word.istitle()
        features['+1:word.isupper()'] = next_word.isupper()
        features['+1:postag'] = next_postag
        features['+1:postag[:2]'] = next_postag[:2]

    return features


def sent2features(sent):
    """Return the list of feature dicts of ``sent``, one per position, built with ``word2features``."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]

def sent2labels(sent):
    """Return the third component (the label) of every (token, postag, label) triple in ``sent``."""
    sentence_labels = []
    for _token, _postag, label in sent:
        sentence_labels.append(label)
    return sentence_labels

def sent2tokens(sent):
    """Return the first component (the token) of every (token, postag, label) triple in ``sent``."""
    sentence_tokens = []
    for token, _postag, _label in sent:
        sentence_tokens.append(token)
    return sentence_tokens

In [None]:
# Convert the training and test sentences into model inputs (one feature dict
# per token) and targets (one label per token).
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))

### Model 

In [None]:
# Baseline CRF model with fixed regularisation; these values are tuned in the
# "Hyperparameter Optimization" section below.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # training algorithm
    c1=0.1,  # L1 regularisation coefficient (per the sklearn-crfsuite documentation)
    c2=0.1,  # L2 regularisation coefficient (per the sklearn-crfsuite documentation)
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

### Prediction and Evaluation

In [None]:
# There are far more "None" entities in the data set than anything else, but we
# are more interested in the other entities. To account for this we use
# averaged F1 scores computed over all labels except "None".
# Filtering (instead of list.remove) also works when the model has no 'None'
# class, where remove() would raise a ValueError.
# NOTE: this rebinds `labels`, which held the per-token label list in the
# exploration section above.
labels = [label for label in crf.classes_ if label != 'None']
labels

In [None]:
# Prediction based on the crf model
y_pred = crf.predict(X_test)
print(y_pred[13], y_test[13]) # For intuition: the predicted and the true labels of one test sentence (index 13). The following cells evaluate the predictions on the whole test set.

In [None]:
# Weighted F1 score restricted to the labels in `labels` (all classes except 'None').
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

In [None]:
# Is accuracy a proper metric here? This call is not restricted to `labels`, so the 'None' tokens are included.
metrics.flat_accuracy_score(y_test, y_pred)

In [None]:
# Result of the classification report for evaluation.
# The labels of this data set are plain names such as 'GEOM', 'NUM' or 'ANG'
# without a B-/I- prefix, so they are sorted alphabetically. The former key
# `(name[1:], name[0])` is meant for prefixed tags and ordered these labels by
# their second character.
sorted_labels = sorted(labels)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

In [None]:
# Flatten the per-sentence label sequences into one list each, for the
# confusion matrix. A comprehension is linear, while sum(lists, []) copies the
# accumulated list once per sentence (quadratic).
y_test_flatten = [label for sentence in y_test for label in sentence]
y_pred_flatten = [label for sentence in y_pred for label in sentence]



In [None]:
# Confusion matrix of true labels (first argument) against predicted labels
# (second argument): it shows how many tokens of each label were confused with
# which other label.
cm=nltk.ConfusionMatrix(y_test_flatten, y_pred_flatten)
print(cm)

### Inspect Model weights


In [None]:
eli5.show_weights(crf, top=30)  # Weights of the baseline model, limited with top=30.

### Hyperparameter Optimization


In [None]:
%%time
# Define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': sp.stats.expon(scale=0.5),
    'c2': sp.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        
                        n_jobs=-1, 
                        n_iter=100, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

## Best result:

In [None]:
# Summary of the search: best sampled parameters, their cross-validation score,
# and the `size_` attribute of the best model divided by one million.
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
# `cv_results_` is a dict with one entry per sampled candidate under each key
# (it is not a list of score objects): 'params' holds the parameter dicts and
# 'mean_test_score' the mean cross-validation scores.
cv_results = rs.cv_results_
_x = [candidate['c1'] for candidate in cv_results['params']]
_y = [candidate['c2'] for candidate in cv_results['params']]
_c = list(cv_results['mean_test_score'])

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

# 'coolwarm' runs from blue (low scores) to red (high scores), which is what
# the legend printed below describes.
ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0], cmap='coolwarm')

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

### Prediction and Evaluation after parameter tuning 

In [None]:
# From here on `crf` is the best model found by the randomized search.
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
# Same report as for the baseline model, so the two can be compared.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

In [None]:
# Flatten the per-sentence label sequences of the tuned model's predictions
# (and of the true labels) for the confusion matrix. A comprehension is linear,
# while sum(lists, []) copies the accumulated list once per sentence.
y_test_flatten = [label for sentence in y_test for label in sentence]
y_pred_flatten = [label for sentence in y_pred for label in sentence]

In [None]:
# Confusion matrix of the tuned model: true labels (first argument) against predictions (second argument).
cm_tuned=nltk.ConfusionMatrix(y_test_flatten,y_pred_flatten)
print(cm_tuned)

In [None]:
dfAstroNova.text.iloc[201]  # Raw text of the row at position 201 of the sorted frame.

In [None]:
eli5.show_weights(rs.best_estimator_, top=30)  # Weights of the tuned model, limited with top=30.

In [None]:
eli5.show_weights(rs.best_estimator_, top=10, targets=['GEOM', 'NUM', 'ANG'])   # The same view restricted to three labels.

### Let's check what the classifier learned


In [None]:
from collections import Counter

def print_transitions(trans_features):
    """Print one line per ``((label_from, label_to), weight)`` entry of ``trans_features``."""
    for transition, weight in trans_features:
        label_from, label_to = transition
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# Rank all learned transitions by weight once, then show both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

 Check the state features:

In [None]:
def print_state_features(state_features):
    """Print one line per ``((attr, label), weight)`` entry of ``state_features``."""
    for feature, weight in state_features:
        attr, label = feature
        print("%0.6f %-8s %s" % (weight, label, attr))

# Rank all learned state features by weight once, then show both ends of the ranking.
ranked_state_features = Counter(crf.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:30])

print("\nTop negative:")
print_state_features(ranked_state_features[-30:])