In [0]:
!pip uninstall tensorflow
!pip install tensorflow-gpu==1.15
!pip install git+https://github.com/guillaumegenthial/tf_metrics.git

import tensorflow as tf
print(tf.__version__)
!nvidia-smi

In [0]:
class Config:
    """Container for file locations and hyper-parameters.

    Each setting is exposed as a plain instance attribute
    (``Config().batch_size``, ``Config().model_dir``, ...).
    """

    def __init__(self):
        # Locations of the corpus, the checkpoint directory and the word vectors
        # (raw text format plus its pickled copy).
        locations = {
            'train_data_dir': '/content/drive/My Drive/Colab Notebooks/ezafe/data/bijankhan_corpus.tsv',
            'model_dir': '/content/drive/My Drive/Colab Notebooks/ezafe/model_2',
            'we_model_dir': '/content/drive/My Drive/Colab Notebooks/ezafe/data/cc.fa.300.vec',
            'we_pickled_model_dir': '/content/drive/My Drive/Colab Notebooks/ezafe/data/cc.fa.300.pickle',
        }

        # General data / training settings.
        general = {
            'data_split': 0.1,
            'num_epochs': 25,
            'batch_size': 16,
            'shuffle_buffer': 320000,
            'num_tags': 15,
            'num_pos_tags': 14,
            'word_max_len': 30,
            'learning_rate': 0.001,
            'max_len': 1276,
        }

        # Vocabulary sizes and embedding widths.
        embeddings = {
            'num_words': 100000,
            'word_embed_dim': 300,
            'num_chars': 256,  # how many of the most frequent characters are kept
            'char_embed_dim': 32,
            'pos_embed_dim': 16,
        }

        # Recurrent-layer settings.
        recurrent = {
            'lstm_units': 256,  # hidden units in the RNN
            'dropout': 0.5,  # keeping probability
        }

        for group in (locations, general, embeddings, recurrent):
            for attribute, value in group.items():
                setattr(self, attribute, value)

In [0]:
import re
import random
import pickle
from collections import Counter
import sys

import numpy as np
from gensim.models import KeyedVectors


# Module-level settings object; DataLoader below reads its paths and sizes from
# this global rather than taking them as arguments.
cfg = Config()


class DataLoader:
    """Reads the tagged corpus, splits it, and converts sentences to model inputs.

    Construction loads (or builds and caches) the word-embedding model and the
    symbol-to-index tables, and splits the corpus into test / validation / train
    portions. `data_generator` then yields one sentence at a time as numpy
    arrays. All settings come from the module-level `cfg`.
    """

    # Cache file holding the (char, word, POS-tag, ezafe-tag) -> index tables.
    INDICES_PATH = '/content/drive/My Drive/Colab Notebooks/ezafe/data/indices.pickle'

    def __init__(self):
        # Word-embedding model: use the pickled copy when present, otherwise parse
        # the text vectors once and cache them.
        # NOTE(review): pickle.load can execute arbitrary code; this is only safe
        # while the cache files are produced by this notebook itself.
        try:
            with open(cfg.we_pickled_model_dir, 'rb') as handle:
                self.word_embedding_model = pickle.load(handle)
        except FileNotFoundError:
            self.word_embedding_model = KeyedVectors.load_word2vec_format(cfg.we_model_dir, binary=False)
            with open(cfg.we_pickled_model_dir, 'wb') as handle:
                pickle.dump(self.word_embedding_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

        sents, all_pos_tags, all_ezafe_tags = self._data_reader(cfg.train_data_dir)

        # Shuffle the sentence order reproducibly. The indices must be shuffled
        # BEFORE they are used to reorder the data; previously the shuffle ran
        # afterwards and therefore had no effect on the splits.
        # NOTE(review): cached indices / checkpoints created with the old
        # (unshuffled) split were built on different train data.
        index_shuf = list(range(len(sents)))
        random.seed(17)
        random.shuffle(index_shuf)

        sents_shuf = [sents[i] for i in index_shuf]
        all_pos_tags_shuf = [all_pos_tags[i] for i in index_shuf]
        all_ezafe_tags_shuf = [all_ezafe_tags[i] for i in index_shuf]

        # First `data_split` fraction -> test, second -> validation, rest -> train.
        data_split_1 = int(len(sents_shuf) * cfg.data_split)
        data_split_2 = int(len(sents_shuf) * (2 * cfg.data_split))

        def portion(start, stop):
            return sents_shuf[start:stop], all_pos_tags_shuf[start:stop], all_ezafe_tags_shuf[start:stop]

        self.test_data = portion(None, data_split_1)
        self.valid_data = portion(data_split_1, data_split_2)
        self.train_data = portion(data_split_2, None)

        print('train data:', len(self.train_data[0]))
        print('validation data:', len(self.valid_data[0]))
        print('test data:', len(self.test_data[0]))

        # Symbol-to-index tables: load the cached ones, or build them from the
        # training portion and cache them.
        try:
            with open(self.INDICES_PATH, 'rb') as handle:
                (self.char_to_index, self.word_to_index,
                 self.pos_tag_to_index, self.ezafe_tag_to_index) = pickle.load(handle)
            print(self.pos_tag_to_index)
        except FileNotFoundError:
            print('Building vocabulary...')
            self._build_indices()

            # saving the tokenizers
            with open(self.INDICES_PATH, 'wb') as handle:
                indices = self.char_to_index, self.word_to_index, self.pos_tag_to_index, self.ezafe_tag_to_index
                pickle.dump(indices, handle, protocol=pickle.HIGHEST_PROTOCOL)

        self.index_to_word = {i: key for key, i in self.word_to_index.items()}
        self.index_to_ezafe_tag = {i: key for key, i in self.ezafe_tag_to_index.items()}

    def _build_indices(self):
        """Build the char / word / POS-tag / ezafe-tag index tables from the training portion."""
        train_sents, train_pos_tags, _ = self.train_data
        word_counts = Counter(word for sent in train_sents for word in sent)
        char_counts = Counter(char for sent in train_sents for word in sent for char in word)

        # Words: 0 is never assigned, '<PAD>' takes 1 (the same value
        # `_sent_to_index` falls back to for unseen words), corpus words start at 2.
        self.word_to_index = {'<PAD>': 1}
        for i, (word, _) in enumerate(word_counts.most_common(cfg.num_words)):
            self.word_to_index[word] = i + 2

        # Characters: '<PAD>' = 0 matches the value `_pad` fills with and
        # '<UNK>' = 1 matches the lookup fallback; real characters start at 2, so
        # the largest index is num_chars + 1.
        self.char_to_index = {'<PAD>': 0, '<UNK>': 1}
        for i, (char, _) in enumerate(char_counts.most_common(cfg.num_chars)):
            self.char_to_index[char] = i + 2

        # Sorted so that the tag ids are the same on every rebuild (plain set
        # iteration order is not stable between interpreter runs).
        pos_tags = sorted(set(tag for tags in train_pos_tags for tag in tags))
        self.pos_tag_to_index = {tag: i for i, tag in enumerate(pos_tags)}

        self.ezafe_tag_to_index = {'0': 0, '1': 1}

    def _data_reader(self, directory):
        """Parse a corpus file of ``word<TAB>pos_tag<TAB>ezafe_tag`` lines.

        Blank lines separate sentences. Arabic ya / kaf / ta-marbuta characters
        in words are replaced by their Persian counterparts.

        Args:
            directory: path of the corpus file (despite the parameter name).

        Returns:
            (sents, all_pos_tags, all_ezafe_tags): three parallel lists with one
            list of strings per sentence.

        Raises:
            ValueError: if a non-blank line does not have exactly three fields.
        """
        sents, all_pos_tags, all_ezafe_tags = [], [], []
        sent, pos_tags, ezafe_tags = [], [], []

        with open(directory, encoding='utf-8') as bijankhan_corpus:
            for line in bijankhan_corpus:
                line = line.strip()
                if line:
                    word, pos_tag, ezafe_tag = line.split('\t')
                    sent.append(word.replace('ي', 'ی').replace('ك', 'ک').replace('ة', 'ه'))
                    pos_tags.append(pos_tag)
                    ezafe_tags.append(ezafe_tag)
                elif sent:
                    # Sentence boundary; consecutive blank lines add nothing.
                    sents.append(sent)
                    all_pos_tags.append(pos_tags)
                    all_ezafe_tags.append(ezafe_tags)
                    sent, pos_tags, ezafe_tags = [], [], []

        # Keep the last sentence when the file does not end with a blank line.
        if sent:
            sents.append(sent)
            all_pos_tags.append(pos_tags)
            all_ezafe_tags.append(ezafe_tags)

        return sents, all_pos_tags, all_ezafe_tags

    def _pad(self, word):
        """Return `word` (a list of char indices) as a list of exactly cfg.word_max_len items.

        Shorter inputs are right-padded with 0, longer ones are truncated, so
        every word in a sentence has the same length. The input is not modified.
        """
        padded = list(word[:cfg.word_max_len])
        padded.extend([0] * (cfg.word_max_len - len(padded)))
        return padded

    def _sent_to_index(self, sentence, mode='word'):
        """Convert a sentence (list of words) to indices.

        Args:
            sentence: list of word strings.
            mode: 'word' -> one index per word; 'char' -> one fixed-length list of
                character indices per word.

        Returns:
            A list of ints ('word') or a list of lists of ints ('char'). Unseen
            words / characters map to 1.

        Raises:
            ValueError: if `mode` is neither 'word' nor 'char'.
        """
        if mode == 'word':
            return [self.word_to_index.get(word, 1) for word in sentence]
        if mode == 'char':
            # Indices at or beyond num_chars + 2 (possible with tables cached by an
            # older numbering) are folded into the unknown index.
            table_size = cfg.num_chars + 2
            indexed_sentence = []
            for word in sentence:
                indexed_word = []
                for char in word:
                    index = self.char_to_index.get(char, 1)
                    indexed_word.append(index if index < table_size else 1)
                indexed_sentence.append(self._pad(indexed_word))
            return indexed_sentence
        raise ValueError("Invalid argument. 'mode' must be either 'word' or 'char'.")

    def _sent_to_embed(self, sentence):
        """Return one embedding vector per word; words missing from the model get a zero vector."""
        embed_sent = []
        for word in sentence:
            try:
                embed_sent.append(self.word_embedding_model[word])
            except KeyError:
                embed_sent.append(np.zeros(cfg.word_embed_dim, dtype=np.float32))
        return embed_sent

    def _pos_tags_to_index(self, tags):
        """Map POS-tag strings to their indices (KeyError for a tag not in the table)."""
        return [self.pos_tag_to_index[tag] for tag in tags]

    def _ezafe_tags_to_index(self, tags):
        """Map ezafe-tag strings to their indices (KeyError for a tag not in the table)."""
        return [self.ezafe_tag_to_index[tag] for tag in tags]

    def data_generator(self, mode=None, char=False, pos=None):
        """Yield one example per sentence of the selected portion.

        Args:
            mode: 'train', 'eval' (validation portion) or 'test'.
            char: if true, yield
                ``(word_vectors, char_indices, length), (pos_ids, ezafe_ids)``.
            pos: if 'cposi' (and `char` is false), yield
                ``(word_vectors, char_indices, pos_ids, length, weights), ezafe_ids``.
                Otherwise yield ``word_vectors, ezafe_ids``.

        Raises:
            ValueError: if `mode` is not one of the supported values (raised when
                the generator is first advanced).
        """
        if mode == 'train':
            sents, pos_tags, ezafe_tags = self.train_data
        elif mode == 'eval':
            sents, pos_tags, ezafe_tags = self.valid_data
        elif mode == 'test':
            sents, pos_tags, ezafe_tags = self.test_data
        else:
            raise ValueError("Invalid argument. 'mode' must be either 'train', 'eval', or 'test'.")

        for sent, pos_tag, ezafe_tag in zip(sents, pos_tags, ezafe_tags):
            sent_char = self._sent_to_index(sent, mode='char')
            sent_word = self._sent_to_embed(sent)
            # One 1 per real token; batch padding adds 0s, so this doubles as a mask.
            length = [1] * len(sent)
            pos_tag = self._pos_tags_to_index(pos_tag)
            ezafe_tag = self._ezafe_tags_to_index(ezafe_tag)

            if char:
                yield (np.array(sent_word), np.array(sent_char), np.array(length)), (np.array(pos_tag), np.array(ezafe_tag))
            elif pos == 'cposi':
                # Restored from the formula that was left commented out; it referred
                # to an undefined `tag`, assumed here to be the ezafe tags — TODO confirm.
                weights = [1. if x == 0 else 1.5 for x in ezafe_tag]
                yield (np.array(sent_word), np.array(sent_char), np.array(pos_tag), np.array(length), np.array(weights)), np.array(ezafe_tag)
            else:
                # The original yielded an undefined `tag`; the ezafe tags are used as
                # the label, matching the 'cposi' branch — TODO confirm.
                yield np.array(sent_word), np.array(ezafe_tag)

In [0]:
import os
import sys
import logging
import numpy as np
import tensorflow as tf
print(tf.__version__)

from tensorflow.contrib import layers
from pathlib import Path
from tf_metrics import precision, recall, f1


# Shared loader instance used by input_fn below. Constructing it runs
# DataLoader.__init__, which loads the word embeddings and the corpus from the
# paths configured in `cfg`.
data_loader = DataLoader()


def _configure_tf_logging():
    """Send TensorFlow's log output to results/main.log and stdout, once per definition.

    The estimator calls `model_fn` repeatedly; without the guard every call
    would open another FileHandler on the same log file.
    """
    if getattr(_configure_tf_logging, 'done', False):
        return
    Path('results').mkdir(exist_ok=True)
    tf.logging.set_verbosity(logging.INFO)
    handlers = [logging.FileHandler('results/main.log'),
                logging.StreamHandler(sys.stdout)]
    logging.getLogger('tensorflow').handlers = handlers
    _configure_tf_logging.done = True


def model_fn(mode, features, labels):
    """Estimator model function: char-CNN + word vectors -> BiLSTM -> POS and ezafe heads.

    Args:
        mode: a `tf.estimator.ModeKeys` value.
        features: tuple ``(word_inputs, char_inputs, length)`` as produced by
            `input_fn`: word vectors [batch, time, cfg.word_embed_dim], character
            ids [batch, time, chars] and a per-token 1/0 indicator [batch, time].
        labels: tuple ``(pos_labels, ezafe_labels)``, each [batch, time]; not read
            in PREDICT mode.

    Returns:
        A `tf.estimator.EstimatorSpec` for the given mode. In PREDICT mode the
        predictions are ``{'pos_ids': ..., 'ezafe_ids': ...}``.
    """
    _configure_tf_logging()

    num_ezafe_tags = 2  # width of the ezafe head
    # cfg.dropout is documented in Config as a keeping probability;
    # tf.layers.dropout expects the drop rate.
    dropout_rate = 1. - cfg.dropout

    word_inputs, char_inputs, length = features

    training = (mode == tf.estimator.ModeKeys.TRAIN)

    num_steps = tf.shape(word_inputs)[1]
    # `length` holds a 1 per real token and 0 for batch padding, so counting the
    # non-zeros along the time axis gives each sentence's length.
    input_lengths = tf.count_nonzero(length, 1, dtype=tf.int32)

    # Char Embeddings
    char_embeddings = tf.get_variable('char_embeddings', [cfg.num_chars + 2, cfg.char_embed_dim])
    embedded_chars = tf.nn.embedding_lookup(char_embeddings, char_inputs)

    # Reshaping for CNN: every word becomes one sequence of character embeddings
    output = tf.reshape(embedded_chars, [-1, tf.shape(char_inputs)[2], cfg.char_embed_dim])

    # CNN
    output = tf.layers.conv1d(output, filters=64, kernel_size=2, strides=1, padding="same", activation=tf.nn.relu)
    output = tf.layers.max_pooling1d(output, pool_size=2, strides=2)
    output = tf.layers.conv1d(output, filters=128, kernel_size=2, strides=1, padding="same", activation=tf.nn.relu)
    output = tf.layers.max_pooling1d(output, pool_size=2, strides=2)

    cnn_output = tf.layers.dropout(output, rate=dropout_rate, training=training)
    cnn_output = tf.layers.flatten(cnn_output)

    # Reshaping CNN and concatenating for LSTM. The two stride-2 poolings leave
    # word_max_len // 4 positions of 128 filters per word.
    cnn_output = tf.reshape(cnn_output, [-1, tf.shape(char_inputs)[1], 128 * (cfg.word_max_len // 4)])
    lstm_inputs = tf.concat([word_inputs, cnn_output], axis=-1)

    # LSTM (the fused cells take time-major input, hence the transposes)
    transposed_emb = tf.transpose(lstm_inputs, perm=[1, 0, 2])
    fw_cell = tf.contrib.rnn.LSTMBlockFusedCell(cfg.lstm_units)
    bw_cell = tf.contrib.rnn.TimeReversedFusedRNN(tf.contrib.rnn.LSTMBlockFusedCell(cfg.lstm_units))
    output_fw, _ = fw_cell(transposed_emb, dtype=tf.float32, sequence_length=input_lengths)
    output_bw, _ = bw_cell(transposed_emb, dtype=tf.float32, sequence_length=input_lengths)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    lstm_output = tf.layers.dropout(output, rate=dropout_rate, training=training)

    # Both heads read the same flattened BiLSTM output
    flat_output = tf.reshape(lstm_output, [-1, 2 * cfg.lstm_units])

    # Dense POS
    pos_logits = tf.layers.dense(flat_output, cfg.num_tags)
    pos_pred = tf.reshape(pos_logits, [-1, num_steps, cfg.num_tags])
    pos_pred_ids = tf.cast(tf.argmax(pos_pred, axis=-1), tf.int32)

    # Dense Ezafe. The logits have `num_ezafe_tags` units, so they must be
    # reshaped to that width (reshaping to cfg.num_tags fails at run time).
    ezafe_logits = tf.layers.dense(flat_output, num_ezafe_tags)
    ezafe_pred = tf.reshape(ezafe_logits, [-1, num_steps, num_ezafe_tags])
    ezafe_pred_ids = tf.cast(tf.argmax(ezafe_pred, axis=-1), tf.int32)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # No labels are available here; return the predictions only.
        predictions = {'pos_ids': pos_pred_ids, 'ezafe_ids': ezafe_pred_ids}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Separating labels
    pos_labels = labels[0]
    ezafe_labels = labels[1]

    # 1. for real tokens, 0. for batch padding
    weights = tf.to_float(tf.sign(length))

    def masked_mean(per_token_loss):
        # Average over real tokens only: padded positions carry the filler label 0
        # and must not contribute to the loss.
        return tf.reduce_sum(per_token_loss * weights) / tf.maximum(tf.reduce_sum(weights), 1.)

    # Loss
    pos_loss = masked_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=pos_labels, logits=pos_pred))
    ezafe_loss = masked_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=ezafe_labels, logits=ezafe_pred))
    loss = pos_loss + ezafe_loss

    # Metrics
    metrics = [('acc', tf.metrics.accuracy(pos_labels, pos_pred_ids, weights))]
    # NOTE(review): tag id 4 is left out of the per-tag and macro POS metrics;
    # which tag that is depends on the loader's pos_tag_to_index — confirm.
    tags_ = list(range(cfg.num_tags))
    tags_.remove(4)
    for i in tags_:
        metrics.extend([('precision_' + str(i), precision(pos_labels, pos_pred_ids, cfg.num_tags, [i], weights)),
                        ('recall_' + str(i), recall(pos_labels, pos_pred_ids, cfg.num_tags, [i], weights)),
                        ('f1_' + str(i), f1(pos_labels, pos_pred_ids, cfg.num_tags, [i], weights))])

    metrics.extend([('POS_precision', precision(pos_labels, pos_pred_ids, cfg.num_tags, tags_, weights, average='macro')),
                    ('POS_recall', recall(pos_labels, pos_pred_ids, cfg.num_tags, tags_, weights, average='macro')),
                    ('POS_f1', f1(pos_labels, pos_pred_ids, cfg.num_tags, tags_, weights, average='macro'))])

    # Ezafe metrics: class 1 is the positive class of the 2-way head
    metrics.extend([('ezafe_precision', precision(ezafe_labels, ezafe_pred_ids, num_ezafe_tags, [1], weights)),
                    ('ezafe_recall', recall(ezafe_labels, ezafe_pred_ids, num_ezafe_tags, [1], weights)),
                    ('ezafe_f1', f1(ezafe_labels, ezafe_pred_ids, num_ezafe_tags, [1], weights))])

    metrics = {x: y for x, y in metrics}

    # Each metric is a (value, update_op) pair; the update op is what gets logged.
    for metric_name, op in metrics.items():
        tf.summary.scalar(metric_name, op[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops=metrics)

    # TRAIN
    train_op = tf.train.AdamOptimizer(cfg.learning_rate).minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode,
                                      loss=loss,
                                      train_op=train_op)

def input_fn(mode=None):
    """Build the `tf.data.Dataset` fed to the estimator.

    Args:
        mode: passed through to `data_loader.data_generator`; 'train'
            additionally enables shuffling and repetition for cfg.num_epochs.

    Returns:
        A dataset of padded batches of
        ``((word_vectors, char_indices, length), (pos_ids, ezafe_ids))``.
    """
    data_generator = lambda: data_loader.data_generator(mode=mode, char=True)

    # Per-example shapes; the sentence length (and the character axis) is variable.
    shapes = (([None, cfg.word_embed_dim], [None, None], [None]), ([None], [None]))

    dataset = tf.data.Dataset.from_generator(data_generator,
                                             output_types=((tf.float32, tf.int32, tf.int32), (tf.int32, tf.int32)),
                                             output_shapes=shapes)

    # Compare by value: `is` on strings tests object identity and only works by
    # accident of interning.
    if mode == 'train':
        dataset = dataset.shuffle(cfg.shuffle_buffer).repeat(cfg.num_epochs)

    # Pads every component with zeros up to the longest sentence in the batch.
    dataset = dataset.padded_batch(cfg.batch_size, padded_shapes=shapes)

    return dataset
 

def train():
    """Create the estimator for cfg.model_dir and run interleaved training and evaluation."""
    run_config = tf.estimator.RunConfig(cfg.model_dir, save_checkpoints_secs=120)
    estimator = tf.estimator.Estimator(model_fn, cfg.model_dir, run_config)

    # Make sure the estimator's evaluation directory exists before starting.
    eval_dir = Path(estimator.eval_dir())
    eval_dir.mkdir(parents=True, exist_ok=True)

    def train_input_func():
        return input_fn(mode='train')

    def eval_input_func():
        return input_fn(mode='eval')

    tf.estimator.train_and_evaluate(
        estimator,
        tf.estimator.TrainSpec(input_fn=train_input_func),
        tf.estimator.EvalSpec(input_fn=eval_input_func, throttle_secs=120),
    )


# Start training when the cell / script is executed directly.
if __name__ == '__main__':
    train()