<a href="https://colab.research.google.com/github/diamondspark/TfJs-NER/blob/main/Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so files under "/content/gdrive/My Drive/" are reachable
# from this Colab runtime (later cells read data and write checkpoints there).
from google.colab import drive
drive.mount('/content/gdrive')
# Disabled dependency pins, kept for reference only.
# NOTE(review): these versions differ from the keras/tensorflow versions the
# version-check cell reports — confirm before re-enabling any of them.
# !pip install -r /content/gdrive/My\ Drive/requirements.txt
# !pip install tensorflowjs==0.8
# !pip install keras == 2.2.2
# !pip install tensorflow-gpu==1.15.2
# torch is used in the visible cells only to report whether CUDA is available.
import torch
print(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
# List the Drive root to confirm the mount worked.
# NOTE(review): the captured output exposes personal file names — consider clearing it before sharing.
!ls "/content/gdrive/My Drive/" 


Mounted at /content/gdrive
cuda
 data			   IMG_20150731_144132.jpg   requirements.gdoc
 glove.6B.100d.txt	   IMG_20150731_145814.jpg   requirements.txt
 group1-shard1of3.bin	   IMG_20150731_145816.jpg   trained_models
 group1-shard2of3.bin	   model.json		    'US Diaries'
 group1-shard3of3.bin	   Model_Training.ipynb      vehicles.zip
 IMG_20150731_144102.jpg  'New folder'		     vocabs.js
 IMG_20150731_144118.jpg   non-vehicles.zip


In [None]:
# Display the installed Keras and TensorFlow versions as the cell output.
import keras, tensorflow
keras.__version__, tensorflow.__version__

('2.4.3', '2.3.0')

In [None]:
import os
import re
import numpy as np

# Number of tokens every sentence is padded/truncated to; also the model's input length.
MAX_SEQUENCE_LENGTH = 113
# Width of the GloVe vectors; selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 100 # 50 or 100 or 200 or 300
# Vocabulary id of "<PAD>"; also the value used to pad sequences.
PAD_ID = 0
# Vocabulary id of "<UNK>"; returned for tokens missing from a vocabulary.
UNK_ID = 1


def word_preprocessor(word):
    """Normalize a raw token before vocabulary lookup.

    Runs of the characters ``- | . , ? !`` are deleted, each remaining run of
    digits is collapsed to the single character ``1``, and the result is
    lower-cased. A token that ends up empty is returned as ``'.'``.
    """
    # Note: inside a character class ``|`` is a literal, so pipes are removed too.
    stripped = re.sub(r"[-|.|,|\?|\!]+", '', word)
    normalized = re.sub(r'\d+', '1', stripped).lower()
    return normalized if normalized != '' else '.'

    
def load_data(path, word_preprocessor=word_preprocessor):
    """Read a token-per-line tagged corpus into parallel word/tag sentences.

    The first two lines of the file are skipped. Every following non-blank
    line is split on single spaces: the first field is the word (passed
    through ``word_preprocessor``) and the last field is the tag. A blank
    line closes the current sentence.

    Tags are shortened to their trailing characters: the last four when the
    tag contains ``'MISC'``, otherwise the last three (so a one-character tag
    is kept unchanged).

    Args:
        path: Path of the corpus file.
        word_preprocessor: Callable applied to every raw word.

    Returns:
        dict with keys ``'words'`` and ``'tags'``, each a list of sentences
        (lists of strings) aligned index by index.
    """
    data = {'words': [], 'tags': []}
    words, tags = [], []
    with open(path) as f:
        for line in f.readlines()[2:]:
            if line == '\n':
                # Sentence boundary: store what has been collected so far.
                data['words'].append(words)
                data['tags'].append(tags)
                words, tags = [], []
                continue
            parts = line.replace('\n', '').split(' ')
            words.append(word_preprocessor(parts[0]))
            tag = parts[-1]
            tags.append(tag[-4:] if 'MISC' in tag else tag[-3:])

    # A file that does not end with a blank line would otherwise lose its
    # final sentence, because sentences are only flushed on blank lines.
    if words:
        data['words'].append(words)
        data['tags'].append(tags)

    return data


def make_vocab(sentences, tags=False):
    """Map every distinct token to an integer id, in order of first appearance.

    The vocabulary always starts with ``"<PAD>"`` -> ``PAD_ID`` and
    ``"<UNK>"`` -> ``UNK_ID``; new tokens receive consecutive ids starting
    right after the larger of those two.

    Args:
        sentences: Iterable of token sequences.
        tags: Unused; accepted for call compatibility.

    Returns:
        dict from token to id.
    """
    vocab = {"<PAD>": PAD_ID, "<UNK>": UNK_ID}
    next_id = max(PAD_ID, UNK_ID) + 1
    for token in (tok for sentence in sentences for tok in sentence):
        if token in vocab:
            continue
        vocab[token] = next_id
        next_id += 1

    return vocab


def make_sequences(list_of_words, vocab, word_preprocessor=None):
    """Convert sentences of tokens into sentences of vocabulary ids.

    Args:
        list_of_words: Iterable of token sequences.
        vocab: dict from token to id.
        word_preprocessor: Optional callable applied to each token before
            lookup; skipped when falsy.

    Returns:
        List of id lists, one per input sentence. Tokens missing from
        ``vocab`` are encoded as ``UNK_ID``.
    """
    def encode(token):
        if word_preprocessor:
            token = word_preprocessor(token)
        return vocab.get(token, UNK_ID)

    return [[encode(token) for token in words] for words in list_of_words]


def make_embedding_tensor(glova_path, words_vocab):
    """
        Build the embedding matrix for ``words_vocab`` from GloVe vectors.

        We use GloVe 6B 100d.
        You can download it from: https://nlp.stanford.edu/projects/glove/

        Args:
            glova_path: Directory containing ``glove.6B.<EMBEDDING_DIM>d.txt``.
            words_vocab: dict from word to integer row index.

        Returns:
            Array of shape ``(len(words_vocab) + 1, EMBEDDING_DIM)``. Row ``i``
            holds the GloVe vector of the word whose id is ``i``; rows of
            words absent from the GloVe file stay all-zero.
    """
    embedding_tensor = np.zeros((len(words_vocab) + 1, EMBEDDING_DIM))
    glove_file = os.path.join(glova_path, f"glove.6B.{EMBEDDING_DIM}d.txt")

    # GloVe files are UTF-8; do not rely on the platform default encoding.
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            # Only vocabulary words are parsed and stored, so the full GloVe
            # table never has to be held in memory.
            row = words_vocab.get(values[0])
            if row is not None:
                embedding_tensor[row] = np.asarray(values[1:], dtype='float32')

    return embedding_tensor

In [None]:
from tensorflow.keras.layers import (GRU,Dense, Dropout, Embedding, Flatten,
                                     Input, Multiply, Permute, RepeatVector,
                                     Softmax, Bidirectional)
from tensorflow.keras.models import Model,load_model

# from utils import MAX_SEQUENCE_LENGTH
from tensorflow.keras import backend as K


def make_ner_model(embedding_tensor, words_vocab_size, tags_vocab_size,
                   num_hidden_units=128*2, attention_units=64*2):
    """Build the (uncompiled) per-token tagging model.

    Pipeline: frozen pretrained embedding -> two stacked GRUs -> softmax
    attention over time steps -> two dense layers -> per-token softmax.

    Args:
        embedding_tensor: Pretrained embedding matrix; its second dimension
            is used as the embedding width.
        words_vocab_size: Size of the word vocabulary (the embedding layer
            is created with one extra row).
        tags_vocab_size: Number of output classes per token.
        num_hidden_units: Width of both GRUs and of the first dense layer.
        attention_units: Width of the attention scoring layer.

    Returns:
        A Keras ``Model`` from ``(batch, MAX_SEQUENCE_LENGTH)`` int32 ids to
        ``(batch, MAX_SEQUENCE_LENGTH, tags_vocab_size)`` probabilities.
    """
    embedding_dim = embedding_tensor.shape[1]

    # (batch, T) token ids, with T = MAX_SEQUENCE_LENGTH
    token_ids = Input(dtype='int32', shape=[MAX_SEQUENCE_LENGTH])

    # (batch, T, d): lookup in the frozen pretrained embedding matrix
    embedded = Embedding(
        words_vocab_size + 1,
        embedding_dim,
        weights=[embedding_tensor],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )(token_ids)
    print(embedded.shape)

    # Two stacked GRUs, each returning (batch, T, h).
    # reset_after is disabled explicitly — presumably for TensorFlow.js
    # compatibility; TODO confirm.
    first_rnn = GRU(num_hidden_units, return_sequences=True,
                    reset_after=False, name='RNN_Layer')(embedded)
    second_rnn = GRU(num_hidden_units, return_sequences=True,
                     reset_after=False)(first_rnn)
    print(first_rnn.shape, second_rnn.shape)

    # Simple attention: one scalar score per time step, softmaxed over T.
    scores = Dense(attention_units, activation='tanh')(second_rnn)
    scores = Dropout(0.25)(scores)
    scores = Dense(1, activation=None)(scores)
    scores = Flatten()(scores)
    attention_vector = Softmax(name='attention_vector')(scores)

    # Broadcast the (batch, T) weights to (batch, T, h) and rescale the states.
    attention = RepeatVector(num_hidden_units)(attention_vector)
    attention = Permute([2, 1])(attention)
    # NOTE(review): the scores are computed from the second GRU's output but
    # applied to the first GRU's output — confirm this is intended.
    encoding = Multiply()([first_rnn, attention])

    # Per-token classification head.
    encoding = Dropout(0.25)(encoding)
    hidden = Dense(num_hidden_units)(encoding)
    hidden = Dropout(0.25)(hidden)
    logits = Dense(tags_vocab_size)(hidden)
    tag_probs = Softmax(name='Final_Sofmax')(logits)

    return Model(inputs=token_ids, outputs=tag_probs)

In [None]:
import argparse
import json
import os
# NOTE(review): unpinned install — the tensorflowjs version (and the
# dependencies it pulls in) depends on when this cell is run. Consider pinning.
!pip install tensorflowjs
import tensorflowjs as tfjs
print(tfjs.__version__)
# Helpers used later to pad the id sequences and one-hot encode the tags.
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# The helpers below are defined inline in this notebook instead of being imported:
# from model import make_ner_model
# from utils import (MAX_SEQUENCE_LENGTH, PAD_ID, load_data,
#                    make_embedding_tensor, make_sequences, make_vocab)

Collecting tensorflowjs
[?25l  Downloading https://files.pythonhosted.org/packages/e8/c8/c52e21c49b3baf0845e395241046a993e244dd4b94c9827a8cd2d9b18927/tensorflowjs-2.7.0-py3-none-any.whl (62kB)
[K     |█████▎                          | 10kB 24.5MB/s eta 0:00:01[K     |██████████▌                     | 20kB 30.1MB/s eta 0:00:01[K     |███████████████▊                | 30kB 22.7MB/s eta 0:00:01[K     |█████████████████████           | 40kB 20.7MB/s eta 0:00:01[K     |██████████████████████████▏     | 51kB 21.0MB/s eta 0:00:01[K     |███████████████████████████████▍| 61kB 15.9MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 7.9MB/s 
Collecting tensorflow-hub<0.10,>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/ac/83/a7df82744a794107641dad1decaad017d82e25f0e1f761ac9204829eef96/tensorflow_hub-0.9.0-py2.py3-none-any.whl (103kB)
[K     |████████████████████████████████| 112kB 23.6MB/s 
Installing collected packages: tensorflow-hub, tensorflowjs

In [None]:
# !unzip glove.6B.zip -d /content/gdrive/My Drive/Colab Notebooks/glove.6B
# !pip install tensorflowjs

In [None]:
# Load the training and validation splits from Drive.
train_data = load_data('/content/gdrive/My Drive/data/train.txt')
valid_data = load_data('/content/gdrive/My Drive/data/valid.txt')

# Both vocabularies are built from the training split only.
words_vocab = make_vocab(train_data['words'])
tags_vocab = make_vocab(train_data['tags'])

# Encode words and tags of each split as id sequences.
for dataset in (train_data, valid_data):
    dataset['words_sequences'] = make_sequences(dataset['words'], words_vocab)
    dataset['tags_sequences'] = make_sequences(dataset['tags'], tags_vocab)

In [None]:
def _pad_to_max_length(sequences):
    """Pad/truncate id sequences at the end to MAX_SEQUENCE_LENGTH with PAD_ID."""
    return pad_sequences(sequences,
                         maxlen=MAX_SEQUENCE_LENGTH,
                         value=PAD_ID,
                         padding='post',
                         truncating='post')


train_X = _pad_to_max_length(train_data['words_sequences'])
valid_X = _pad_to_max_length(valid_data['words_sequences'])

train_y = _pad_to_max_length(train_data['tags_sequences'])
valid_y = _pad_to_max_length(valid_data['tags_sequences'])

# Fix the one-hot width to the tag vocabulary size. Without num_classes,
# to_categorical infers the width from the largest id present, so a split
# that lacks the highest tag id would get fewer columns than the model emits.
num_tag_classes = len(tags_vocab)
train_y = to_categorical(train_y, num_classes=num_tag_classes)
valid_y = to_categorical(valid_y, num_classes=num_tag_classes)


# def recall_m(y_true, y_pred):
#     y_true = K.ones_like(y_true) 
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
#     recall = true_positives / (all_positives + K.epsilon())
#     return recall

# def precision_m(y_true, y_pred):
#     y_true = K.ones_like(y_true) 
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    
#     predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
#     precision = true_positives / (predicted_positives + K.epsilon())
#     return precision

# def f1_score(y_true, y_pred):
#     precision = precision_m(y_true, y_pred)
#     recall = recall_m(y_true, y_pred)
#     return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Pretrained vectors for the word vocabulary, read from the Drive root.
embedding_tensor = make_embedding_tensor('/content/gdrive/My Drive/', words_vocab)

# Build the tagger and compile it for one-hot targets.
model = make_ner_model(
    embedding_tensor,
    words_vocab_size=len(words_vocab),
    tags_vocab_size=len(tags_vocab),
)
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

(None, 113, 100)
(None, 113, 256) (None, 113, 256)


In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the compiled model.
model.summary()
 

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 113)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 113, 100)     1719900     input_1[0][0]                    
__________________________________________________________________________________________________
RNN_Layer (GRU)                 (None, 113, 256)     274176      embedding[0][0]                  
__________________________________________________________________________________________________
gru (GRU)                       (None, 113, 256)     393984      RNN_Layer[0][0]                  
_______________________________________________________________________________________

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Write a checkpoint to Drive each time the validation loss improves; the
# epoch number and validation loss are embedded in the file name.
checkpoint = ModelCheckpoint(
    '/content/gdrive/My Drive/trained_models/n_weights.{epoch:02d}-{val_loss:.2f}.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
# K.clear_session()
model.fit(
    train_X,
    train_y,
    batch_size=1024,
    epochs=500,
    validation_data=(valid_X, valid_y),
    callbacks=callbacks_list,
)

Epoch 1/500
Epoch 00001: val_loss improved from inf to 1.50358, saving model to /content/gdrive/My Drive/trained_models/n_weights.01-1.50.h5
Epoch 2/500
Epoch 00002: val_loss improved from 1.50358 to 0.64909, saving model to /content/gdrive/My Drive/trained_models/n_weights.02-0.65.h5
Epoch 3/500
Epoch 00003: val_loss improved from 0.64909 to 0.35251, saving model to /content/gdrive/My Drive/trained_models/n_weights.03-0.35.h5
Epoch 4/500
Epoch 00004: val_loss improved from 0.35251 to 0.32187, saving model to /content/gdrive/My Drive/trained_models/n_weights.04-0.32.h5
Epoch 5/500
Epoch 00005: val_loss improved from 0.32187 to 0.29667, saving model to /content/gdrive/My Drive/trained_models/n_weights.05-0.30.h5
Epoch 6/500
Epoch 00006: val_loss improved from 0.29667 to 0.27685, saving model to /content/gdrive/My Drive/trained_models/n_weights.06-0.28.h5
Epoch 7/500
Epoch 00007: val_loss improved from 0.27685 to 0.26238, saving model to /content/gdrive/My Drive/trained_models/n_weights.

KeyboardInterrupt: ignored

In [None]:
def _write_js_const(handle, const_name, mapping, terminator):
    """Write ``mapping`` to ``handle`` as a JavaScript ``const`` object literal."""
    handle.write('const ' + const_name + ' = {\n')
    for key, value in mapping.items():
        # Serialize each entry on its own so keys containing commas stay intact.
        handle.write('\t' + json.dumps(key) + ': ' + json.dumps(value) + ',\n')
    handle.write(terminator)


def export_model(model, words_vocab, tags_vocab, site_path):
    """Export the model and both vocabularies for use from TensorFlow.js.

    The model is converted with ``tfjs.converters.save_keras_model`` into
    ``site_path``, and ``vocabs.js`` is written next to it, declaring the
    JavaScript constants ``words_vocab`` and ``tags_vocab``.

    Args:
        model: Keras model to convert.
        words_vocab: dict from word to id.
        tags_vocab: dict from tag to id.
        site_path: Destination directory for the model artifacts and
            ``vocabs.js``.
    """
    # The destination is site_path itself. Joining it with an absolute path
    # (as was done before) makes os.path.join discard site_path entirely.
    tfjs.converters.save_keras_model(model, site_path)

    with open(os.path.join(site_path, "vocabs.js"), 'w') as f:
        _write_js_const(f, 'words_vocab', words_vocab, '};\n')
        _write_js_const(f, 'tags_vocab', tags_vocab, '};')
    print('model exported to ', site_path)
# load_model('./trained_models/weights.91-0.03.h5')    
# export_model(model, words_vocab, tags_vocab, './')



In [None]:
import tensorflow as tf
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices
# is the supported replacement. bool() keeps the cell output a True/False flag.
bool(tf.config.list_physical_devices('GPU'))

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


False

In [None]:
# Report whether PyTorch sees a CUDA device (relies on `torch` imported in the first cell).
print(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))


cuda


In [None]:
# Show each field of the training split together with its number of sentences.
for field, sentences in train_data.items():
    print(field)
    print(len(sentences))

In [None]:
# Spot-check one training sentence: its tokens, word ids and tag ids, plus the tag vocabulary.
train_data['words'][21], train_data['words_sequences'][21], train_data['tags_sequences'][21],tags_vocab

In [None]:
# Reload a saved checkpoint from Drive and print its architecture.
# NOTE(review): the training log kept in this notebook stops at epoch 7, so this
# epoch-89 file presumably comes from a different run — confirm it exists.
model1 = load_model('/content/gdrive/My Drive/trained_models/n_weights.89-0.03.h5')
model1.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 113)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 113, 100)     1719900     input_1[0][0]                    
__________________________________________________________________________________________________
RNN_Layer (GRU)                 (None, 113, 256)     274176      embedding[0][0]                  
__________________________________________________________________________________________________
gru (GRU)                       (None, 113, 256)     393984      RNN_Layer[0][0]                  
_______________________________________________________________________________________

In [None]:
# Export to the Drive root, where the TF.js artifacts (model.json, weight
# shards, vocabs.js) are kept, so the reported destination is the real one.
export_model(model1, words_vocab, tags_vocab, '/content/gdrive/My Drive/')

  return h5py.File(h5file)


model exported to  ./
