In [160]:
from nltk.grammar import DependencyGrammar
from nltk.parse.dependencygraph import DependencyGraph

In [161]:
import nltk
nltk.download('dependency_treebank')

[nltk_data] Downloading package dependency_treebank to
[nltk_data]     /Users/simone/nltk_data...
[nltk_data]   Package dependency_treebank is already up-to-date!


True

In [162]:
from nltk.corpus import dependency_treebank
t = dependency_treebank.parsed_sents()
print(t[0].tree())

(will
  (Vinken Pierre , (old (years 61)) ,)
  (join (board the) (as (director a nonexecutive)) (Nov. 29))
  .)


We can download the GloVe embeddings to vectorize our data.

**Uncomment to actually download!**

In [163]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [164]:
# We can now get a list of the input documents
import os
docs = os.listdir('data/dependency_treebank')
docs

['wsj_0095.dp',
 'wsj_0184.dp',
 'wsj_0177.dp',
 'wsj_0037.dp',
 'wsj_0126.dp',
 'wsj_0066.dp',
 'wsj_0052.dp',
 'wsj_0112.dp',
 'wsj_0003.dp',
 'wsj_0143.dp',
 'wsj_0153.dp',
 'wsj_0013.dp',
 'wsj_0102.dp',
 'wsj_0042.dp',
 'wsj_0076.dp',
 'wsj_0136.dp',
 'wsj_0027.dp',
 'wsj_0167.dp',
 'wsj_0194.dp',
 'wsj_0085.dp',
 'wsj_0007.dp',
 'wsj_0147.dp',
 'wsj_0056.dp',
 'wsj_0116.dp',
 'wsj_0180.dp',
 'wsj_0091.dp',
 'wsj_0122.dp',
 'wsj_0062.dp',
 'wsj_0173.dp',
 'wsj_0033.dp',
 'wsj_0023.dp',
 'wsj_0163.dp',
 'wsj_0072.dp',
 'wsj_0132.dp',
 'wsj_0081.dp',
 'wsj_0190.dp',
 'wsj_0106.dp',
 'wsj_0046.dp',
 'wsj_0157.dp',
 'wsj_0017.dp',
 'wsj_0006.dp',
 'wsj_0146.dp',
 'wsj_0057.dp',
 'wsj_0117.dp',
 'wsj_0181.dp',
 'wsj_0090.dp',
 'wsj_0123.dp',
 'wsj_0063.dp',
 'wsj_0172.dp',
 'wsj_0032.dp',
 'wsj_0022.dp',
 'wsj_0162.dp',
 'wsj_0073.dp',
 'wsj_0133.dp',
 'wsj_0080.dp',
 'wsj_0191.dp',
 'wsj_0107.dp',
 'wsj_0047.dp',
 'wsj_0156.dp',
 'wsj_0016.dp',
 'wsj_0094.dp',
 'wsj_0185.dp',
 'wsj_01

In [165]:
# And split train, dev, and test sets
import random
random.shuffle(docs)
train_docs = docs[:int(0.5*len(docs))]
dev_docs = docs[int(0.5*len(docs)):int(0.75*len(docs))]
test_docs = docs[int(0.75*len(docs)):]
len(train_docs), len(dev_docs), len(test_docs)

(99, 50, 50)

In [166]:
import numpy as np
def parse_dataset(docs, dir):
    """
    Parse the dependency treebank dataset.
    """
    X = []
    y = []
    for doc in docs:
        np_doc = np.loadtxt(dir+doc, str, delimiter='\t')
        X.append(" ".join(np_doc[:,0]))
        y.append(" ".join(np_doc[:,1]))
    return np.array(X),np.array(y)

In [167]:
X_train, y_train = parse_dataset(train_docs, 'data/dependency_treebank/')
X_test, y_test = parse_dataset(test_docs, 'data/dependency_treebank/')
X_dev, y_dev = parse_dataset(dev_docs, 'data/dependency_treebank/')

In [168]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf

X_vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
X_train_ds = tf.data.Dataset.from_tensor_slices(X_train)
X_vectorizer.adapt(X_train_ds)
X_vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'of', 'to']

In [169]:
y_vectorizer = TextVectorization(max_tokens=100, output_sequence_length=200)
y_train_ds = tf.data.Dataset.from_tensor_slices(y_train)
y_vectorizer.adapt(y_train_ds)
y_vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'nn', 'in', 'nnp']

In [170]:
import numpy as np
embeddings_index = {}
embedding_dim = 100
with open("glove.6B."+str(embedding_dim) + "d.txt") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
embeddings_index["the"]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [171]:
vocabulary = X_vectorizer.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
embedding_matrix = np.zeros((len(vocabulary)+2, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [172]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import initializers

embedding_layer = Embedding(
    len(vocabulary)+2, # Number of tokens in the vocabulary
    embedding_dim, # Dimensions of the embedding
    embeddings_initializer=initializers.Constant(embedding_matrix),
    trainable=False,
)

In [173]:
# Prepare the training set
X_train = X_vectorizer(np.array([[s] for s in X_train])).numpy()
y_train = y_vectorizer(np.array([[s] for s in y_train])).numpy()
# Transforming y_train into one-hot vectors
one_hot_depth = np.max(y_train) + 1
y_train = tf.one_hot(y_train, one_hot_depth).numpy()


In [174]:
# Prepare the test set
X_test = X_vectorizer(np.array([[s] for s in X_test])).numpy()
y_test = y_vectorizer(np.array([[s] for s in y_test])).numpy()
# Transforming y_test into one-hot vectors
y_test = tf.one_hot(y_test, one_hot_depth).numpy()

In [182]:
from tensorflow.keras import layers, models, Input

model = models.Sequential()
model.add(embedding_layer)
model.add(layers.Bidirectional(layers.LSTM(32, return_sequences=True)))
model.add(layers.Dense(one_hot_depth, activation="softmax"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 100)         709700    
                                                                 
 bidirectional_11 (Bidirecti  (None, None, 64)         34048     
 onal)                                                           
                                                                 
 dense_11 (Dense)            (None, None, 35)          2275      
                                                                 
Total params: 746,023
Trainable params: 36,323
Non-trainable params: 709,700
_________________________________________________________________


In [183]:
# Train model
model.fit(X_train, y_train, epochs=50, batch_size=1024, validation_data=(X_test, y_test))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fb8bbbfab50>

In [181]:
for i, word_tag in enumerate(model.predict(X_test)[0]):
    print(np.argmax(word_tag))
    print(np.argmax(y_test[0][i]))
    print()

6
4

6
3

6
4

28
4

2
4

6
9

6
12

28
9

6
12

6
2

6
3

6
4

2
4

28
4

6
4

6
3

6
3

2
8

6
8

28
4

6
3

6
4

6
21

6
10

6
15

6
8

6
6

6
13

6
8

6
8

6
3

6
6

6
9

6
5

6
2

6
16

6
11

6
5

6
4

6
4

28
2

28
17

6
2

6
3

6
4

6
4

2
4

6
15

6
8

6
8

6
3

6
6

6
13

6
8

6
6

6
5

6
4

2
4

6
2

6
6

6
19

6
11

6
16

6
11

6
3

6
12

6
7

6
6

6
3

27
7

6
8

6
26

6
5

6
19

6
11

6
5

6
4

6
3

6
4

6
2

6
4

6
7

6
4

6
3

6
4

6
9

6
12

6
19

6
11

6
5

6
8

2
2

27
6

6
3

6
5

6
7

6
4

2
4

6
2

6
2

6
4

6
3

6
4

6
9

6
5

6
3

6
5

6
6

6
19

6
11

6
7

6
6

6
3

6
4

6
3

6
4

6
13

6
12

6
9

6
3

6
2

6
6

6
16

6
5

6
6

6
10

6
17

6
3

6
5

6
2

6
19

6
11

6
4

6
3

6
4

28
20

6
2

2
6

6
3

6
8

6
3

6
8

6
8

6
16

6
8

6
8

6
13

6
8

6
6

6
16

6
8

6
6

6
5

27
2

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0

28
0


In [None]:
vocab = X_vectorizer.get_vocabulary()
for word in X_train[0]:
    print(vocab[word], end=" ")

hadson corp said it expects to report a thirdquarter net loss of 17 million to 19 million because of special reserves and continued low naturalgas prices the oklahoma city energy and defense concern said it will record a 75 million reserve for its defense group including a 47 million charge related to problems under a fixedprice development contract and 28 million in overhead costs that wo nt be reimbursed in addition hadson said it will write off about 35 million in costs related to international exploration leases where exploration efforts have been unsuccessful the company also cited interest costs and amortization of goodwill as factors in the loss a year earlier net income was 21 million or six cents a share on revenue of 1699 million                                                                           