In [1]:
# Inspired by Dr. Scannell codes
# I trained the model with 10000 records, saved it. Then I load the trained model and evaluate test dataset.

In [2]:
# Loading libraries

import nltk
import pandas as pd
import numpy as np
import csv

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load data
# Rename columns

df = pd.read_csv(r"/content/drive/MyDrive/NLP/CELTIC MUTATION/train.tsv", sep="\t", header=None, quoting = csv.QUOTE_NONE)
df.rename(columns={0:"Word", 1:"Label"}, inplace=True)

In [5]:
# Inspecting Null values on train

df.isnull().sum()

Word     27
Label     0
dtype: int64

In [6]:
# Removing null records

df.dropna(inplace=True)

In [7]:
# Slicing data for train

print(len(df))
df = df[:10000]

9999973


In [8]:
# Getting words with labels in the shape of List of List of Tuples

words = [[(df.loc[i, 'Word'], df.loc[i, 'Label'])] for i in df.index]

In [9]:
# list of words in data

vocabs = df['Word'].values.tolist()

In [10]:
# list of tags

tags = df['Label'].unique().tolist()
tags

['N', 'S', 'U', 'H', 'T']

In [11]:
# pre-processing
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 60
word2index = {w: i for i, w in enumerate(vocabs)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in words]
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocabs)-1)

In [12]:
# pre-processing
from tensorflow.keras.utils import to_categorical
onehot_y = [[tag2index[w[1]] for w in s] for s in words]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post")
y = to_categorical(y, num_classes=len(tags))

In [13]:
# runned on training process

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [14]:
# building model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
model = Sequential()
model.add(Embedding(input_dim=len(vocabs), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 50)            500000    
                                                                 
 bidirectional (Bidirectiona  (None, 60, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 60, 5)            1005      
 ibuted)                                                         
                                                                 
Total params: 621,805
Trainable params: 621,805
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
# fit
import time
start = time.time()
n_epochs=5
history = model.fit(X_train, y_train, batch_size=16, epochs=n_epochs, validation_split=0.1, verbose=1)
end = time.time()
print('Done in:', end-start, 'seconds')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Done in: 996.841911315918 seconds


In [16]:
# accuracy on train and test (splitted from data)

start = time.time()
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
end = time.time()
print('Done in:', end-start, 'seconds')

Training Accuracy: 0.9989
Testing Accuracy: 0.9982
Done in: 11.293855428695679 seconds


In [17]:
# save trained model

model.save(r"/content/drive/MyDrive/NLP/CELTIC MUTATION/RNN_model.h5")

In [18]:
#Loading and run on original test dataset

from tensorflow.keras.models import load_model

RNN_model = load_model(r"/content/drive/MyDrive/NLP/CELTIC MUTATION/RNN_model.h5")



In [19]:
RNN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 60, 50)            500000    
                                                                 
 bidirectional (Bidirectiona  (None, 60, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 60, 5)            1005      
 ibuted)                                                         
                                                                 
Total params: 621,805
Trainable params: 621,805
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Load test data
# Rename columns

test = pd.read_csv(r"/content/drive/MyDrive/NLP/CELTIC MUTATION/test.tsv", sep="\t", header=None, quoting = csv.QUOTE_NONE)
test.rename(columns={0:"Word", 1:"Label"}, inplace=True)

In [21]:
# Inspecting Null values on test

test.isnull().sum()

Word     5
Label    0
dtype: int64

In [22]:
# Removing null records

test.dropna(inplace=True)

In [23]:
len(test)

999995

In [24]:
# Getting words with labels in the shape of List of List of Tuples

words_test = [[(test.loc[i, 'Word'], test.loc[i, 'Label'])] for i in test.index]

In [25]:
# list of words in test

test_vocabs = test['Word'].values.tolist()

In [26]:
# pre-processing on test

from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 60
word2index = {w: i for i, w in enumerate(test_vocabs)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in words_test]
X_test = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(test_vocabs)-1)

In [27]:
# pre-processing on test

from tensorflow.keras.utils import to_categorical
onehot_y = [[tag2index[w[1]] for w in s] for s in words_test]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post")
y_test = to_categorical(y, num_classes=len(tags))

In [28]:
# evaluate test dataset

start = time.time()
loss, accuracy = RNN_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
end = time.time()
print('Done in:', end-start, 'seconds')

Testing Accuracy: 0.9976
Done in: 943.3639192581177 seconds
