In [7]:
import numpy, json, argparse
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelBinarizer
numpy.random.seed(1337)

In [8]:
# Read in the NE data, with either 2 or 6 classes
def read_corpus(corpus_file, binary_classes):
    print('Reading in data from {0}...'.format(corpus_file))
    words = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            words.append(parts[0])
            if binary_classes:
                if parts[1] in ['GPE', 'LOC']:
                    labels.append('LOCATION')
                else:
                    labels.append('NON-LOCATION')
            else:
                labels.append(parts[1])	
    print('Done!')
    return words, labels

# Read in word embeddings 
def read_embeddings(embeddings_file):
    print('Reading in embeddings from {0}...'.format(embeddings_file))
    embeddings = json.load(open(embeddings_file, 'r'))
    embeddings = {word:numpy.array(embeddings[word]) for word in embeddings}
    print('Done!')
    return embeddings

# Turn words into embeddings, i.e. replace words by their corresponding embeddings
def vectorizer(words, embeddings):
    vectorized_words = []
    for word in words:
        try:
            vectorized_words.append(embeddings[word.lower()])
        except KeyError:
            vectorized_words.append(embeddings['UNK'])
    return numpy.array(vectorized_words)

In [9]:
data = 'named_entity_data.txt'
embeddings = 'embeddings.json'
binary = True

In [10]:
# Read in the data and embeddings
X, Y = read_corpus(data, binary_classes = binary)
embeddings = read_embeddings(embeddings)
# Transform words to embeddings
X = vectorizer(X, embeddings)
# Transform string labels to one-hot encodings
encoder = LabelBinarizer()
Y = encoder.fit_transform(Y) # Use encoder.classes_ to find mapping of one-hot indices to string labels
if binary:
    Y = numpy.where(Y == 1, [0,1], [1,0])
# Split in training and test data
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]
# Define the properties of the perceptron model
model = Sequential()
model.add(Dense(input_dim = X.shape[1], units = Y.shape[1]))
model.add(Activation("linear"))
sgd = SGD(lr = 0.01)
loss_function = 'mean_squared_error'
model.compile(loss = loss_function, optimizer = sgd, metrics=['accuracy'])
# Train the perceptron
model.fit(Xtrain, Ytrain, verbose = 1, epochs = 1, batch_size = 32)
# Get predictions
Yguess = model.predict(Xtest)
# Convert to numerical labels to get scores with sklearn in 6-way setting
Yguess = numpy.argmax(Yguess, axis = 1)
Ytest = numpy.argmax(Ytest, axis = 1)
print('Classification accuracy on test: {0}'.format(accuracy_score(Ytest, Yguess)))
print('Classification report: ')
print(classification_report(Ytest, Yguess))

Reading in data from named_entity_data.txt...
Done!
Reading in embeddings from embeddings.json...
Done!
Epoch 1/1
Classification accuracy on test: 0.9177435666928868
Classification report: 
             precision    recall  f1-score   support

          0       0.85      0.93      0.89      3092
          1       0.96      0.91      0.94      5807

avg / total       0.92      0.92      0.92      8899



In [11]:
# Define a baseline classification by the zero rule algorithm
counts = numpy.zeros(len(Ytrain[0]),dtype=int)
for i in Ytrain:
    idx = numpy.nonzero(i)
    counts[idx] = counts[idx] + 1

print(counts)
most_common = numpy.argmax(counts)
print(most_common)

Yguess = numpy.full(len(Ytest), most_common)

print('Classification accuracy on baseline: {0}'.format(accuracy_score(Ytest, Yguess)))
print('Classification report: ')
print(classification_report(Ytest, Yguess))

[ 8913 17783]
1
Classification accuracy on baseline: 0.6525452298011013
Classification report: 
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      3092
          1       0.65      1.00      0.79      5807

avg / total       0.43      0.65      0.52      8899



  'precision', 'predicted', average, warn_for)


In [12]:
data = 'named_entity_data.txt'
embeddings = 'embeddings.json'
binary = False
# Read in the data and embeddings
X, Y = read_corpus(data, binary_classes = binary)
embeddings = read_embeddings(embeddings)
# Transform words to embeddings
X = vectorizer(X, embeddings)
# Transform string labels to one-hot encodings
encoder = LabelBinarizer()
Y = encoder.fit_transform(Y) # Use encoder.classes_ to find mapping of one-hot indices to string labels
if binary:
    Y = numpy.where(Y == 1, [0,1], [1,0])
# Split in training and test data
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]
# Define the properties of the perceptron model
model = Sequential()
model.add(Dense(input_dim = X.shape[1], units = Y.shape[1]))
model.add(Activation("linear"))
sgd = SGD(lr = 0.01)
loss_function = 'mean_squared_error'
model.compile(loss = loss_function, optimizer = sgd, metrics=['accuracy'])
# Train the perceptron
model.fit(Xtrain, Ytrain, verbose = 1, epochs = 1, batch_size = 32)
# Get predictions
Yguess = model.predict(Xtest)
# Convert to numerical labels to get scores with sklearn in 6-way setting
Yguess = numpy.argmax(Yguess, axis = 1)
Ytest = numpy.argmax(Ytest, axis = 1)
print('Classification accuracy on test: {0}'.format(accuracy_score(Ytest, Yguess)))
print('Classification report: ')
print(classification_report(Ytest, Yguess))

Reading in data from named_entity_data.txt...
Done!
Reading in embeddings from embeddings.json...
Done!
Epoch 1/1
Classification accuracy on test: 0.6580514664569053
Classification report: 
             precision    recall  f1-score   support

          0       0.71      0.83      0.76      1311
          1       0.65      0.68      0.66      1017
          2       0.75      0.82      0.78      2915
          3       0.01      0.01      0.01       177
          4       0.66      0.37      0.48      2072
          5       0.57      0.64      0.61      1407

avg / total       0.67      0.66      0.65      8899



In [13]:

# Define a baseline classification by the zero rule algorithm
counts = numpy.zeros(len(Ytrain[0]),dtype=int)
for i in Ytrain:
    idx = numpy.nonzero(i)
    counts[idx] = counts[idx] + 1

print(counts)
most_common = numpy.argmax(counts)
print(most_common)

Yguess = numpy.full(len(Ytest), most_common)

print('Classification accuracy on baseline: {0}'.format(accuracy_score(Ytest, Yguess)))
print('Classification report: ')
print(classification_report(Ytest, Yguess))

[3980 3196 8477  436 6059 4548]
2
Classification accuracy on baseline: 0.32756489493201485
Classification report: 
             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1311
          1       0.00      0.00      0.00      1017
          2       0.33      1.00      0.49      2915
          3       0.00      0.00      0.00       177
          4       0.00      0.00      0.00      2072
          5       0.00      0.00      0.00      1407

avg / total       0.11      0.33      0.16      8899



  'precision', 'predicted', average, warn_for)


In [14]:
data = 'named_entity_data.txt'
embeddings = 'embeddings.json'
binary = True

# Read in the data and embeddings
X, Y = read_corpus(data, binary_classes = binary)
embeddings = read_embeddings(embeddings)
# Transform words to embeddings
X = vectorizer(X, embeddings)
# Transform string labels to one-hot encodings
encoder = LabelBinarizer()
Y = encoder.fit_transform(Y) # Use encoder.classes_ to find mapping of one-hot indices to string labels
if binary:
    Y = numpy.where(Y == 1, [0,1], [1,0])

epochs = range(1,10)
acc = []
fscore = []
for e in epochs:    
    # Split in training and test data
    split_point = int(0.75*len(X))
    Xtrain = X[:split_point]
    Ytrain = Y[:split_point]
    Xtest = X[split_point:]
    Ytest = Y[split_point:]


    # Define the properties of the perceptron model
    model = Sequential()
    model.add(Dense(input_dim = X.shape[1], units = Y.shape[1]))
    model.add(Activation("linear"))
    sgd = SGD(lr = 0.01)
    loss_function = 'mean_squared_error'
    model.compile(loss = loss_function, optimizer = sgd, metrics=['accuracy'])

    # Train the perceptron
    model.fit(Xtrain, Ytrain, verbose = 1, epochs = e, batch_size = 32)
    # Get predictions
    Yguess = model.predict(Xtest)
    # Convert to numerical labels to get scores with sklearn in 6-way setting
    Yguess = numpy.argmax(Yguess, axis = 1)
    Ytest = numpy.argmax(Ytest, axis = 1)
    acc.append(accuracy_score(Ytest, Yguess))
    fscore.append(f1_score(Ytest, Yguess))

Reading in data from named_entity_data.txt...
Done!
Reading in embeddings from embeddings.json...
Done!
Epoch 1/1
Epoch 1/2
Epoch 2/2
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [18]:
import pyplot as plt
plt.plot(epochs, acc)
plt.plot(epochs, fscore)
plt.ylabel('scores')
plt.xlabel('epochs')
plt.legend(['accuracy', 'f1-score'])
plt.show()

ImportError: No module named 'pyplot'