## Next steps
- Decent preprocessing
- OOV

In [1]:
from nltk.grammar import DependencyGrammar
from nltk.parse.dependencygraph import DependencyGraph

In [2]:
import os, random
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization


class DataInput():
    def __init__(self, data_url, train_size, dev_size, sequence_length=200):
        docs= self.import_data(data_url)
        self.sequence_length = sequence_length
        self.datasets = self.train_dev_test_split(docs, train_size, dev_size)
        self.X_vocabulary, self.y_vocabulary = self.setup_vectorizers(*self.datasets[0])
        for i, (X,y) in enumerate(self.datasets):
            self.datasets[i] = self.vectorize_dataset(X,y)
        
    def import_data(self, data_url):
        """
        Import POS dataset from URL.
        """
        !wget -O data.zip $data_url
        !unzip -o data.zip -d data
        !rm data.zip
        self.data_dir = "data/" + os.listdir('data')[0] + "/"
        docs = os.listdir(self.data_dir)
        return docs

    def parse_dataset(self, docs):
        """
        Parse the dependency treebank dataset.
        """
        X = []
        y = []
        for doc in docs:
            np_doc = np.loadtxt(self.data_dir+doc, str, delimiter='\t')
            X.append(" ".join(np_doc[:,0]))
            y.append(" ".join(np_doc[:,1]))
        return np.array(X),np.array(y)

    def train_dev_test_split(self, docs, train_size, dev_size):
        """
        Split dataset into train and test.

        Args:
            docs: list of documents
            train_size: float, percentage of train data
            dev_size: float, percentage of dev data (note that test size is 1-train_size-dev_size)

        Returns:
            train_docs: list of train documents
        """
        random.shuffle(docs)
        print(int(train_size*len(docs)))
        train_docs = self.parse_dataset(docs[:int(train_size*len(docs))])
        dev_docs = self.parse_dataset(docs[int(train_size*len(docs)):int((train_size+dev_size)*len(docs))])
        test_docs = self.parse_dataset(docs[int((train_size+dev_size)*len(docs)):])
        return [train_docs, dev_docs, test_docs]

    def setup_vectorizers(self, X_train, y_train):
        self.X_vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=self.sequence_length)
        self.y_vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=self.sequence_length)
        self.X_vectorizer.adapt(tf.data.Dataset.from_tensor_slices(X_train))
        self.y_vectorizer.adapt(tf.data.Dataset.from_tensor_slices(y_train))
        self.one_hot_depth = len(self.y_vectorizer.get_vocabulary())+1
        return self.X_vectorizer.get_vocabulary(), self.y_vectorizer.get_vocabulary()
    def vectorize_dataset(self, X, y):
        X = self.X_vectorizer(np.array([[s] for s in X])).numpy()
        y = self.y_vectorizer(np.array([[s] for s in y])).numpy()
        y = tf.one_hot(y, self.one_hot_depth)
        return X,y

In [3]:
import glob 

class Embedder():
    def __init__(self, glove_url="http://nlp.stanford.edu/data/glove.6B.zip", embedding_dim=100):
        self.download_if_needed(glove_url)
        self.embedding_dim = embedding_dim
        self.embedding = self.parse_embeddings()
    def download_if_needed(self, glove_url):
        if not glob.glob("glove*.txt"):
            !wget $glove_url
            !unzip -q glove.6B.zip
    def parse_embeddings(self):
        import numpy as np
        embeddings_index = {}
        with open("glove.6B."+str(self.embedding_dim) + "d.txt") as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, "f", sep=" ")
                embeddings_index[word] = coefs
        return embeddings_index
    def embedding_matrix(self, vocabulary):
        word_index = dict(zip(vocabulary, range(len(vocabulary))))
        embedding_matrix = np.zeros((len(vocabulary)+2, self.embedding_dim))
        for word, i in word_index.items():
            embedding_vector = self.embedding.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    
        

In [13]:
embedding_matrix.shape

(7393, 100)

In [4]:
di = DataInput("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip", 0.5, 0.25)


--2022-03-11 18:06:13--  https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457429 (447K) [application/zip]
Saving to: ‘data.zip’


2022-03-11 18:06:13 (10,6 MB/s) - ‘data.zip’ saved [457429/457429]

Archive:  data.zip
  inflating: data/dependency_treebank/wsj_0093.dp  
  inflating: data/dependency_treebank/wsj_0065.dp  
  inflating: data/dependency_treebank/wsj_0039.dp  
  inflating: data/dependency_treebank/wsj_0182.dp  
  inflating: data/dependency_treebank/wsj_0186.dp  
  inflating: data/dependency_treebank/wsj_0041.dp  
  inflating: data/dependency_treebank/wsj_0018.dp  
  inflating: data/dependency_treebank/wsj_0105.dp  
  inflating: data/dependency_treebank

2022-03-11 18:06:14.958658: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
embedder = Embedder()
embedding_matrix = embedder.embedding_matrix(di.X_vocabulary)
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [16]:
from tensorflow.keras import layers, models, Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam

class NeuralNetwork:
    def __init__(self, X_vocabulary, embedding_dim, one_hot_depth, sequence_length, embedding_matrix):
        embedding_layer = Embedding(
            len(X_vocabulary)+2, # Number of tokens in the vocabulary
            embedding_dim, # Dimensions of the embedding,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False
        )
        self.model = models.Sequential()
        self.model.add(layers.InputLayer(input_shape=(sequence_length,)))
        self.model.add(embedding_layer)
        self.model.add(layers.Bidirectional(layers.LSTM(512, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(0.001))))
        self.model.add(layers.TimeDistributed(layers.Dense(one_hot_depth)))
        self.model.add(layers.Activation('softmax'))
        self.model.compile(loss="categorical_crossentropy", optimizer=Adam(0.001),  metrics=["accuracy"])
        self.model.summary()


In [17]:
nn = NeuralNetwork(di.X_vocabulary, embedder.embedding_dim, di.one_hot_depth, di.sequence_length, embedding_matrix)
X_train, y_train = di.datasets[0]
nn.model.fit(X_train, y_train, epochs=100, batch_size=12, validation_data=di.datasets[1])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 100)          739300    
                                                                 
 bidirectional_3 (Bidirectio  (None, 200, 1024)        2510848   
 nal)                                                            
                                                                 
 time_distributed_3 (TimeDis  (None, 200, 38)          38950     
 tributed)                                                       
                                                                 
 activation_3 (Activation)   (None, 200, 38)           0         
                                                                 
Total params: 3,289,098
Trainable params: 2,549,798
Non-trainable params: 739,300
_________________________________________________________________
Epoch 1/100

KeyboardInterrupt: 