In [1]:
from nltk.grammar import DependencyGrammar
from nltk.parse.dependencygraph import DependencyGraph

In [2]:
import nltk

# Fetch the dependency treebank package through NLTK's downloader.
# The call's return value is the cell output.
corpus_package = 'dependency_treebank'
nltk.download(corpus_package)

[nltk_data] Downloading package dependency_treebank to
[nltk_data]     /Users/simone/nltk_data...
[nltk_data]   Package dependency_treebank is already up-to-date!


True

In [3]:
from nltk.corpus import dependency_treebank

# Read the parsed sentences of the corpus and print the tree of the first one.
t = dependency_treebank.parsed_sents()
first_parse = t[0]
print(first_parse.tree())

(will
  (Vinken Pierre , (old (years 61)) ,)
  (join (board the) (as (director a nonexecutive)) (Nov. 29))
  .)


We can download the GloVe embeddings to vectorize our data.

**Uncomment the commands in the next cell to actually download and unzip the archive!**

In [4]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

In [5]:
# Collect the file names of the input documents found in the corpus folder
import os
corpus_dir = 'data/dependency_treebank'
docs = os.listdir(corpus_dir)
docs

['wsj_0095.dp',
 'wsj_0184.dp',
 'wsj_0177.dp',
 'wsj_0037.dp',
 'wsj_0126.dp',
 'wsj_0066.dp',
 'wsj_0052.dp',
 'wsj_0112.dp',
 'wsj_0003.dp',
 'wsj_0143.dp',
 'wsj_0153.dp',
 'wsj_0013.dp',
 'wsj_0102.dp',
 'wsj_0042.dp',
 'wsj_0076.dp',
 'wsj_0136.dp',
 'wsj_0027.dp',
 'wsj_0167.dp',
 'wsj_0194.dp',
 'wsj_0085.dp',
 'wsj_0007.dp',
 'wsj_0147.dp',
 'wsj_0056.dp',
 'wsj_0116.dp',
 'wsj_0180.dp',
 'wsj_0091.dp',
 'wsj_0122.dp',
 'wsj_0062.dp',
 'wsj_0173.dp',
 'wsj_0033.dp',
 'wsj_0023.dp',
 'wsj_0163.dp',
 'wsj_0072.dp',
 'wsj_0132.dp',
 'wsj_0081.dp',
 'wsj_0190.dp',
 'wsj_0106.dp',
 'wsj_0046.dp',
 'wsj_0157.dp',
 'wsj_0017.dp',
 'wsj_0006.dp',
 'wsj_0146.dp',
 'wsj_0057.dp',
 'wsj_0117.dp',
 'wsj_0181.dp',
 'wsj_0090.dp',
 'wsj_0123.dp',
 'wsj_0063.dp',
 'wsj_0172.dp',
 'wsj_0032.dp',
 'wsj_0022.dp',
 'wsj_0162.dp',
 'wsj_0073.dp',
 'wsj_0133.dp',
 'wsj_0080.dp',
 'wsj_0191.dp',
 'wsj_0107.dp',
 'wsj_0047.dp',
 'wsj_0156.dp',
 'wsj_0016.dp',
 'wsj_0094.dp',
 'wsj_0185.dp',
 'wsj_01

In [6]:
# And split train, dev, and test sets (50% / 25% / 25% of the documents).
import random

# Fixed seed so the split is the same on every run.
SPLIT_SEED = 42

# Sort first: the listing order is not guaranteed, and a previous execution of
# this cell may already have shuffled `docs`. Sorting + a dedicated seeded
# generator makes the cell idempotent and leaves the global `random` state alone.
docs.sort()
random.Random(SPLIT_SEED).shuffle(docs)

n_docs = len(docs)
train_end = int(0.5 * n_docs)
dev_end = int(0.75 * n_docs)

train_docs = docs[:train_end]
dev_docs = docs[train_end:dev_end]
test_docs = docs[dev_end:]
len(train_docs), len(dev_docs), len(test_docs)

(99, 50, 50)

In [7]:
import os

import numpy as np


def parse_dataset(docs, dir):
    """
    Parse the dependency treebank dataset.

    Every file is read as a tab-separated table of strings. For each file,
    column 0 and column 1 are each joined with single spaces, producing one
    string per document for X and one for y.

    Parameters
    ----------
    docs : iterable of str
        File names to read, relative to ``dir``.
    dir : str
        Directory holding the files; a trailing path separator is optional.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        ``X`` holds the joined column-0 strings and ``y`` the joined column-1
        strings, one element per document, in the order of ``docs``.
    """
    X = []
    y = []
    for doc in docs:
        np_doc = np.loadtxt(
            os.path.join(dir, doc),
            dtype=str,
            delimiter='\t',
            # loadtxt treats '#' as a comment marker by default, which would
            # silently drop any row whose first field is '#'.
            comments=None,
            # Keep a 2-D table even when the file contains a single row, so
            # the column slices below always work.
            ndmin=2,
        )
        X.append(" ".join(np_doc[:, 0]))
        y.append(" ".join(np_doc[:, 1]))
    return np.array(X), np.array(y)

In [8]:
# Build the (text, tags) arrays for each split from the same corpus folder.
TREEBANK_PREFIX = 'data/dependency_treebank/'
X_train, y_train = parse_dataset(train_docs, TREEBANK_PREFIX)
X_dev, y_dev = parse_dataset(dev_docs, TREEBANK_PREFIX)
X_test, y_test = parse_dataset(test_docs, TREEBANK_PREFIX)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Every document is padded/truncated to this many token ids.
sequence_length = 500

# Token-side vectorizer; its vocabulary is learned from the training texts only.
# NOTE(review): the vocabulary shown below is lowercased and the decoded text at
# the end of the notebook has no punctuation, so the layer's standardization
# rewrites tokens. Confirm that tokens and tags still line up position by
# position after vectorization.
X_vectorizer = TextVectorization(
    output_sequence_length=sequence_length,
    max_tokens=20000,
)
X_train_ds = tf.data.Dataset.from_tensor_slices(X_train)
X_vectorizer.adapt(X_train_ds)
X_vectorizer.get_vocabulary()[:5]

2022-03-11 09:45:27.456615: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


['', '[UNK]', 'the', 'of', 'a']

In [10]:
# Tag-side vectorizer; its vocabulary is learned from the training tag strings only.
# NOTE(review): the vocabulary shown below is lowercased ('nn', 'nnp'), so the
# layer's standardization rewrites tags. Confirm that distinct tags are not
# merged or dropped and that tags still line up with the vectorized tokens.
y_vectorizer = TextVectorization(
    output_sequence_length=sequence_length,
    max_tokens=100,
)
y_train_ds = tf.data.Dataset.from_tensor_slices(y_train)
y_vectorizer.adapt(y_train_ds)
y_vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'nn', 'in', 'nnp']

In [11]:
import numpy as np

# Maps each word of the embeddings file to its vector.
embeddings_index = {}
embedding_dim = 100

# Each line is "<word> <v1> <v2> ...". The encoding is given explicitly so that
# reading does not depend on the platform's default locale encoding.
with open("glove.6B."+str(embedding_dim) + "d.txt", encoding="utf-8") as f:
    for line in f:
        # Split off the word; the rest of the line is the vector text.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
embeddings_index["the"]

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [12]:
# Build the (token id -> vector) table used to seed the embedding layer.
vocabulary = X_vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(vocabulary)}

# Two spare rows are allocated beyond the vocabulary size; rows without a
# matching vector in embeddings_index stay all-zero.
embedding_matrix = np.zeros((len(vocabulary)+2, embedding_dim))
for token, row in word_index.items():
    known_vector = embeddings_index.get(token)
    if known_vector is None:
        continue
    embedding_matrix[row] = known_vector

In [13]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import initializers

# Embedding layer seeded with the pre-trained vectors collected in
# `embedding_matrix`. Without the initializer the layer starts from random
# weights and the matrix built above is never used.
# The layer is left trainable (the Keras default), as before.
embedding_layer = Embedding(
    embedding_matrix.shape[0], # Number of rows in the embedding table (len(vocabulary)+2)
    embedding_matrix.shape[1], # Dimensions of the embedding (embedding_dim)
    embeddings_initializer=initializers.Constant(embedding_matrix),
)

In [14]:
# Prepare the training set
# NOTE: this cell rebinds X_train / y_train from strings to arrays of ids, so it
# must be preceded by the parsing cell whenever it is re-run.
X_train = X_vectorizer(np.array([[s] for s in X_train])).numpy()
y_train = y_vectorizer(np.array([[s] for s in y_train])).numpy()
# Transforming y_train into one-hot vectors.
# The depth comes from the tag vocabulary, not from the largest id seen in the
# (truncated) training sequences: ids at or above the depth would be encoded as
# all-zero rows by tf.one_hot.
one_hot_depth = len(y_vectorizer.get_vocabulary())
y_train = tf.one_hot(y_train, one_hot_depth).numpy()


In [15]:
# Sanity check: expected to be (number of training documents, sequence_length).
X_train.shape

(99, 500)

In [16]:
# Prepare the test set with the vectorizers fitted on the training data
test_texts = np.array([[text] for text in X_test])
test_tag_strings = np.array([[tags] for tags in y_test])
X_test = X_vectorizer(test_texts).numpy()
test_tag_ids = y_vectorizer(test_tag_strings).numpy()
# One-hot encode the tag ids with the same depth used for the training labels
y_test = tf.one_hot(test_tag_ids, one_hot_depth).numpy()

In [17]:
from tensorflow.keras import layers, models, Input
from tensorflow.keras.optimizers import Adam

# Sequence tagger: token ids -> embeddings -> BiLSTM -> per-position softmax
# over the one_hot_depth tag classes.
model = models.Sequential()
model.add(layers.InputLayer(input_shape=(sequence_length,)))
model.add(embedding_layer)
model.add(layers.Bidirectional(layers.LSTM(512, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.001))))
model.add(layers.TimeDistributed(layers.Dense(one_hot_depth)))
model.add(layers.Activation('softmax'))
# The targets are one-hot vectors with exactly one class per position and the
# output is a softmax, so the matching loss is categorical cross-entropy.
# Binary cross-entropy would score every class as an independent yes/no
# decision and make "accuracy" resolve to a per-class binary accuracy.
model.compile(loss="categorical_crossentropy", optimizer=Adam(0.001),  metrics=["accuracy"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          710100    
                                                                 
 bidirectional (Bidirectiona  (None, 500, 1024)        2510848   
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 500, 37)          37925     
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 500, 37)           0         
                                                                 
Total params: 3,258,873
Trainable params: 3,258,873
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train model
# Validate on the dev split so the test split stays untouched until the final
# evaluation. The dev arrays get their own names, which keeps this cell safe to
# re-run (X_dev / y_dev are not overwritten).
X_dev_ids = X_vectorizer(np.array([[s] for s in X_dev])).numpy()
y_dev_ids = y_vectorizer(np.array([[s] for s in y_dev])).numpy()
y_dev_onehot = tf.one_hot(y_dev_ids, one_hot_depth).numpy()

# Stop once the dev loss has not improved for a few epochs instead of relying on
# a manual interrupt, and keep the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=12,
    validation_data=(X_dev_ids, y_dev_onehot),
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100

KeyboardInterrupt: 

In [None]:
# For the first test document, print the predicted tag id, then the gold tag id,
# then a blank line, for every position of the sequence.
first_doc_scores = model.predict(X_test)[0]
for position, tag_scores in enumerate(first_doc_scores):
    predicted_id = np.argmax(tag_scores)
    gold_id = np.argmax(y_test[0][position])
    print(predicted_id, gold_id, "", sep="\n")

2
4

2
4

2
9

2
12

2
9

2
15

2
11

2
8

0
8

0
6

0
7

0
29

0
8

0
8

0
27

0
3

0
8

0
2

0
6

0
7

0
6

0
4

0
8

0
8

0
10

0
3

0
8

0
2

0
2

0
7

0
5

0
6

0
6

0
2

0
2

0
9

0
5

0
2

0
2

0
16

0
5

0
2

0
15

0
11

0
8

0
2

0
2

0
3

0
7

0
3

0
2

0
13

0
14

0
2

0
15

0
5

0
2

0
3

0
2

0
5

0
7

0
18

0
4

0
8

0
8

0
5

0
2

0
19

0
11

0
14

0
3

0
6

0
4

0
7

0
3

0
5

0
2

0
5

0
2

0
9

0
16

0
22

0
3

0
8

0
2

0
5

0
2

0
9

0
15

0
11

0
6

0
7

0
3

0
5

0
2

0
18

0
4

0
4

0
4

0
13

0
4

0
4

0
4

0
4

0
5

0
6

0
2

0
7

0
4

0
9

0
12

0
16

0
15

0
11

0
5

0
2

0
3

0
5

0
2

0
3

0
5

0
2

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0

0
0


In [None]:
# Map the ids of the first training document back to vocabulary entries and
# print them on one line, each entry followed by a single space.
vocab = X_vectorizer.get_vocabulary()
decoded_line = "".join(vocab[token_id] + " " for token_id in X_train[0])
print(decoded_line, end="")

companies listed below reported quarterly profit substantially different from the average of analysts estimates the companies are followed by at least three analysts and had a minimum fivecent change in actual earnings per share estimated and actual results involving losses are omitted the percent difference compares actual profit with the 30day estimate where at least three analysts have issues forecasts in the past 30 days otherwise actual profit is compared with the 300day estimate                                                                                                                               