In [1]:
# I noticed that there is a certain string that is pervasive throughout the dataset
#  ("-DOCSTART- -X- O O") indicating document divisions, so I went through and 
#  removed all instances of it in the txt file so that they would not show up in 
#  the data frame rows when I read it in using read_csv.
with open("DataNer.txt", "r+") as f:
    d = f.readlines()
    f.seek(0)
    for i in d:
        if i != "-DOCSTART- -X- O O":
            f.write(i)
    f.truncate()

# Read in the dataset with correct column names
import pandas as pd
colnames = ['Word', 'Part_of_speech', 'Chunking', 'NER_tag']
data2 = pd.read_csv("DataNer.txt", names = colnames, delimiter = " ")
data2.head()

Unnamed: 0,Word,Part_of_speech,Chunking,NER_tag
0,He,PRP,I-NP,O
1,said,VBD,I-VP,O
2,further,JJ,I-NP,O
3,scientific,JJ,I-NP,O
4,study,NN,I-NP,O


In [2]:
# Parse through txt file to create a list of sentence numbers
import itertools
sentence_num_list = []
curr_sentence_num = 1
with open('DataNer.txt') as f:
    for line in f:
        #print(type(line))
        #print(line)
        if len(line.strip()) == 0:
            curr_sentence_num = curr_sentence_num + 1
        else:
            sentence_num_list.append(curr_sentence_num)
    print("done")


done


In [3]:
# Add the list created above as a column of sentence number in our dataframe
data2.insert(4, "Sentence_num", sentence_num_list)
data2.head(5)

Unnamed: 0,Word,Part_of_speech,Chunking,NER_tag,Sentence_num
0,He,PRP,I-NP,O,1
1,said,VBD,I-VP,O,1
2,further,JJ,I-NP,O,1
3,scientific,JJ,I-NP,O,1
4,study,NN,I-NP,O,1


In [4]:
# Fill any NA entries and vectorize predictor variables in the dataset
# Here I separated the predictors from the y-values and created two different matrices. For
#  the predictors' matrix (X), I used the to_dict function to create a numerical matrix with
#  the columns containing the counts of each word and the rows referring to each of the 
#  individual words in our dataset. This was done because the first class of models that
#  I built below (SGD, Multinomial Naive Bayes, and Perceptron classifier) take in numerical
#  matrices as opposed to characters/strings that we have in our dataframe above.

data2 = data2.fillna(method='ffill')
data2.head()

from sklearn.feature_extraction import DictVectorizer
X1 = data2.drop('NER_tag', axis=1)
DictVector = DictVectorizer(sparse=True)
X = DictVector.fit_transform(X1.to_dict('records'))
y = data2.NER_tag.values   



In [5]:
# Split X and y above into training and testing set:
# As specified in the assignment, I used 70% of the dataset as the training set and 
# 30% for the testing set.  These sets were used directly for the first three models below.
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
#print(X_train)

In [None]:
# Description of the model building/training/testing/evaluation process for the 1st class 
# of models (SGD, MNB, and Perceptron):
#  Since using cross validation and grid search to find the best values for 
#  parameters/hyperparameters for these models would take too long, I used the default
#  values.  I fit each of the models on the training set I created above, and then predicted
#  the target values for the testing set (X_test).  I compared the predicted values 
#  to the test set's actual values to compute the weighted f1 score. I evaluated the models
#  based on this score and also viewed a classification report for each of the models 
#  so that I could see the precision and recall for each of the 8 target value groups.
#  Note that when calculating the f1 scores, I did not remove the 'O' tags, 
#  as they are abundant in real life (and in datasets apart from the testing one 
#  used here), and I wanted to evaluate the model's ability to predict 
#  all types tags (even among noise that that 'O' tags bring).


In [6]:
# SGD (Stochastic Gradient Descent) linear classifier:
from sklearn import linear_model
from sklearn.metrics import f1_score
from sklearn import metrics

sgd = linear_model.SGDClassifier()
sgd.partial_fit(X_train, y_train, classes=np.unique(y))
result_y = sgd.predict(X_test)
# Can remove all 'O' entries before calculating the f-score (Did not do this as explained below)
#labels = list(result_y)
#labels.remove('O')
print("f1 score:", f1_score(y_test, result_y, average='weighted'))

#classes = [i for i in list(data2.NER_tag.unique()) if i != 'O']   
print(metrics.classification_report(y_test, result_y))

# Note:
#  For the first three models, I was getting a dead kernel so I took the posted advice and 
#  used partial_fit instead of fit.  I then got an error and found that I needed to 
#  specify the "classes" parameter in partial_fit, which I set to classes=np.unique(y) ,
#  which specified the number of unique classes that we have.  
#  Then I predicted on the test set and printed the f1 score and got a warning 
#  : "UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no 
#  predicted samples.  'precision', 'predicted', average, warn_for).""
#  I searched on google and found that this is because some of the results in the test 
#  set are not in the predicted set, and thus the f score for these cases is 0, and they 
#  are included in the calculation of the average f1 score.
#  Thus, in the f1_score() function, I could have
#  the parameter 'labels=np.unique(result_y)' to get rid of the labels that were not 
#  predicted set, so that these zero values are not included in the average calculation.  
#  But in lecture it was said that we should not do this, so I did not include this.


# A previous output gave a difference precision for the I-MISC tag.
#f1 score: 0.7534990387203918
#               precision    recall  f1-score   support

#        B-LOC       0.00      0.00      0.00         5
#       B-MISC       0.00      0.00      0.00        10
#        B-ORG       0.00      0.00      0.00        10
#        I-LOC       0.00      0.00      0.00      2565
#       I-MISC       0.50      0.00      0.00      1403
#        I-ORG       0.00      0.00      0.00      3119
#        I-PER       0.00      0.00      0.00      3434
#            O       0.83      1.00      0.91     51635

#    micro avg       0.83      0.83      0.83     62181
#    macro avg       0.17      0.13      0.11     62181
# weighted avg       0.70      0.83      0.75     62181


  'precision', 'predicted', average, warn_for)


f1 score: 0.7558173297713038


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         3
      B-MISC       0.00      0.00      0.00        11
       B-ORG       0.00      0.00      0.00         5
       I-LOC       0.00      0.00      0.00      2574
      I-MISC       0.00      0.00      0.00      1391
       I-ORG       0.00      0.00      0.00      3057
       I-PER       1.00      0.00      0.00      3399
           O       0.83      1.00      0.91     51741

    accuracy                           0.83     62181
   macro avg       0.23      0.12      0.11     62181
weighted avg       0.75      0.83      0.76     62181



In [68]:
# Multinomial Naive Bayes Model Classifier
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.partial_fit(X_train, y_train, classes=np.unique(y))
result_y = mnb.predict(X_test)
print("f1 score:", f1_score(y_test, result_y, average='weighted', labels=np.unique(result_y)))
print(metrics.classification_report(y_test, result_y))

f1 score: 0.9213951759106611


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00         5
      B-MISC       0.00      0.00      0.00        10
       B-ORG       0.00      0.00      0.00         4
       I-LOC       0.89      0.62      0.73      2456
      I-MISC       0.65      0.19      0.30      1490
       I-ORG       0.77      0.63      0.70      3035
       I-PER       0.73      0.82      0.77      3406
           O       0.95      0.99      0.97     51775

    accuracy                           0.93     62181
   macro avg       0.50      0.41      0.43     62181
weighted avg       0.92      0.93      0.92     62181



In [69]:
# Perceptron Classifier
from sklearn.linear_model import Perceptron
perc = Perceptron()
perc.partial_fit(X_train, y_train, classes=np.unique(y))
result_y = perc.predict(X_test)
print("f1 score:", f1_score(y_test, result_y, average='weighted', labels=np.unique(result_y)))
print(metrics.classification_report(y_test, result_y, labels=np.unique(result_y)))

f1 score: 0.8583100132983884
              precision    recall  f1-score   support

       I-ORG       0.05      0.00      0.00      3035
           O       0.83      1.00      0.91     51775

   micro avg       0.83      0.94      0.88     54810
   macro avg       0.44      0.50      0.46     54810
weighted avg       0.79      0.94      0.86     54810



In [None]:
# Comparing the weighted f1 scores for the first three models, we can see that the 
#  Multinomial Naive Bayes performs the best as it has the highest score at about .919. 
#  (We are looking for the highest f1 score:
#  The higher the f-score, the better the precision and recall of the model, or the 
#  higher the proportion of true positives over the combination of true and false positives, 
#  and the proportion of true positives over the combination of true positives and false 
#  negatives, respectively.)
#  The next best performing is the perceptron with an f1 score of .857, and the SGD has an
#  f score of about .758, which is still relatively somewhat strong.
#  In the classification reports, we can see the individual precision and recall for each
#  of the different classes of NER tags. Aside from the 'O' tag which are prevalent and 
#  can cause noise in the dataset, the Multinomial Naive Bayes has the highest precision for
#  'I-LOC' group (.9) and the highest recall for the 'I-PER' group (.79), and so it can 
#  predict these classes of NER tags well.  Though not important, we can see that all three 
#  the models above do well in predicting 'O' tags, mainly because they are so prevalent.

In [7]:
# Conditional Random Fields (CRF)

# Find features to include in the CRF model:
#  CRF models use specific features to predict the probability of the target label 
#  sequence (NER tag).
#  Here I specifically looked at whether a word was lowercase, uppercase, a digit, or a 
#  title, and nearby words. The functions below return a dictionary for each sentence, 
#  where each feature is a dictionary value.

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [8]:
# Create a training and test set:
# len(data2) is equal to 207270 (number of words) and we want to put 207270*(.7) = 145089 
#  or 70% into the training set

training_crf = data2.iloc[0:145089,]
testing_crf = data2.iloc[145089:,]

print(testing_crf.head(200))

           Word Part_of_speech Chunking NER_tag  Sentence_num
145089      the             DT     I-NP       O         11192
145090     Cubs            NNP     I-NP   I-ORG         11192
145091        '            POS     B-NP       O         11192
145092  playoff             NN     I-NP       O         11192
145093    hopes            NNS     I-NP       O         11192
...         ...            ...      ...     ...           ...
145284      1-0             JJ     I-NP       O         11201
145285     down             JJ     I-NP       O         11201
145286    after             IN     I-PP       O         11201
145287     four             CD     I-NP       O         11201
145288  minutes            NNS     I-NP       O         11201

[200 rows x 5 columns]


In [9]:
# Create the right formats for the training and testing sets so they are compatible to be
#  put through the above functions to create features:
#  I am creating a list of list of tuples for the training set and 
#  testing set.  Each tuple contains a row of the original
#  dataframe, and the tuple is an element of a list containing tuples (words) that are all  
#  in the same sentence.  This list is then part of the outer list of all sentences in the
#  original data (split between training and testing data).

sentence_num_counter = 1
outer_list_training = []
curr_same_sentence_list = []
for index, row in training_crf.iterrows():
    curr_sentence_num = row['Sentence_num']
    if (index > 0):
        if (len(curr_list) != 0):
            curr_same_sentence_list.append(curr_list)
    if (curr_sentence_num == sentence_num_counter):
        curr_list = tuple([row['Word'], row['Part_of_speech'], row['NER_tag']])
        curr_same_sentence_list.append(curr_list)
        curr_list = []
    else:
        outer_list_training.append(curr_same_sentence_list)
        sentence_num_counter = sentence_num_counter + 1
        curr_list = tuple([row['Word'], row['Part_of_speech'], row['NER_tag']])
        curr_same_sentence_list=[]
#print(outer_list_training[0:3])
#print("\n")

sentence_num_counter = 11192
outer_list_testing = []
curr_same_sentence_list = []
for index, row in testing_crf.iterrows():
    curr_sentence_num = row['Sentence_num']
    if (index > 0):
        if (len(curr_list) != 0):
            curr_same_sentence_list.append(curr_list)
    if (curr_sentence_num == sentence_num_counter):
        curr_list = tuple([row['Word'], row['Part_of_speech'], row['NER_tag']])
        curr_same_sentence_list.append(curr_list)
        curr_list = []
    else:
        outer_list_testing.append(curr_same_sentence_list)
        curr_list = tuple([row['Word'], row['Part_of_speech'], row['NER_tag']])
        curr_same_sentence_list=[]

print("done")
#print(outer_list_testing[0:3])

done


In [10]:
# Conver our training and testing set into a list of dictionaries using
#  the functions above where each dictionary
#  corresponds to a sentence and the dictionary keys correspond to a feature. 

# Please note that in the above step, I ended up not including the chunking or sentence 
#  number columns as the functions below would not finish when I included them.  Thus,
#  I decided to keep the three columns I thought were most important, mainly the word, 
#  the part of speech, and the NER tag.

X_train = [sent2features(s) for s in outer_list_training]
y_train = [sent2labels(s) for s in outer_list_training]

X_test = [sent2features(s) for s in outer_list_testing]
y_test = [sent2labels(s) for s in outer_list_testing]

print("done")

done


In [75]:
#X_train[0:1]

In [11]:
# Train the CRF model on our training set (X_train):
#  Similar to the process I used when building evaluating the first three models, I
#  fit the CRF model on the training set created above, and then predicted the target
#  values for the testing set.  I compared these predicted values to the test set's 
#  actual values in order to calculate the f1 score, with which I used to evaluate the 
#  CRF's performance among those of the other models. I again used default parameters
#  for the model, which I explicitly wrote out below. To be consistent with the other
#  models above, I did not remove the 'O' labels, for the same reason explained 
#  previously.  

from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
print("done")

done


In [12]:
# Evaluate the CRF model:

y_pred = crf.predict(X_test)
print("f1 score:", (metrics.flat_f1_score(y_test, y_pred, average='weighted')))

f1 score: 0.9355090635291945


In [17]:
# Sort the NER tags and look at the flat classification report
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

# By looking at the f1 score, we can see that this model performs well,
#  with a score of .936.  It outperforms the Multinomial Naive Bayes model, the
#  Perceptron model, and the CRF model.  Looking at the flat classification report, 
#  we can see that the precision for predicting I-LOC (.884) and I-MISC (.894) 
#  are the highest among all NER tags with this CRF model, and the recall is highest 
#  for the I-PER tag (.908).  Thus, the model is able to make the best predidctions for
#  these categories of NER tags.

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

       B-LOC      0.000     0.000     0.000         1
       I-LOC      0.884     0.830     0.856      2444
      B-MISC      0.000     0.000     0.000         8
      I-MISC      0.894     0.735     0.806      1429
       B-ORG      0.000     0.000     0.000         0
       I-ORG      0.802     0.764     0.783      2809
       I-PER      0.842     0.908     0.874      2743

   micro avg      0.847     0.818     0.832      9434
   macro avg      0.489     0.462     0.474      9434
weighted avg      0.848     0.818     0.831      9434



In [None]:
# Citations for sources used:

# “Tutorial¶.” Sklearn, https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html.

# “Depends on the Definition.” Depends on the Definition - It's about Machine Learning, 
#  Data Science and More, https://www.depends-on-the-definition.com/sequence-tagging-lstm-crf/.

# Safdari, Nasir. “Named Entity Recognition (NER), Meeting Industry's Requirement by 
#  Applying State-of-the-Art Deep...” Medium, Towards Data Science, 12 Dec. 2018, 
#  https://towardsdatascience.com/named-entity-recognition-ner-meeting-industrys-requirement-by-applying-state-of-the-art-deep-698d2b3b4ede.