In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import StratifiedKFold

import seaborn as sns

from gensim.models import KeyedVectors

from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.layers import TimeDistributed
from keras.layers import LSTM, GRU, Bidirectional, SimpleRNN, RNN
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle
!pip install seqeval
import seqeval
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 15.8MB/s eta 0:00:01[K     |███████████████                 | 20kB 9.3MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 7.9MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 7.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.7MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=156d9d3bcf70ce3abc864274a57a7b00aa4114d0c6b21d240b2d8842400432be
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
!pip install gensim

from gensim.models import KeyedVectors
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz' # from above
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# https://drive.google.com/drive/folders/1NYeUaJkhv5LpvUafTgYtZBErMyTka8aq?usp=sh
# Download link for datasets

--2021-02-21 22:17:58--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.12.94
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.12.94|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2021-02-21 22:18:17 (80.6 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



## HMM 

Define global variables

In [3]:
P_given = {}
P_given_freq = {} # (Tag, Tag)

tag_freq = {} # Tag

word_freq = {} # Word
word_tag_freq = {} # (Word, Tag)
word_tag_prob = {}

vocab = set() # Words

Process dataframe

In [4]:
classes = ["person" ,"product", "company", "geolocation", "movie", "music artist", "tvshow", "facility", "sports team", "other"]
tmp_df = pd.read_table('NER-Dataset-10Types-Train.txt', names=["words", "tags"], skip_blank_lines=False)
tag_set = list(set(tmp_df["tags"]))[1:]
n_tags = len(tag_set)

def process(dataset):
  print("\n", dataset)
  tmp_df = pd.read_table(dataset, names=["words", "tags"], skip_blank_lines=False)
  print(tmp_df.head(10))

  tag_set = list(set(tmp_df["tags"]))[1:]
  n_tags = len(tag_set)

  X = []
  Y = []

  sentence = []
  tags = []

  for word, t in zip(tmp_df["words"], tmp_df["tags"]):
    if word != word: # is NaN
      X.append(sentence)
      Y.append(tags)
      sentence = []
      tags = []
    else:
      sentence.append(word)
      tags.append(t)
  return X,Y

Xt,Yt = process("NER-Dataset--TestSet.txt")
datat = pd.DataFrame({
    'tokenized_sentences': Xt,
    'tags': Yt
})




 NER-Dataset--TestSet.txt
             words  tags
0  @SammieLynnsMom   NaN
1         @tg1.781   NaN
2             they   NaN
3             will   NaN
4               be   NaN
5              all   NaN
6             done   NaN
7               by   NaN
8           Sunday   NaN
9            trust   NaN


Utility functions

In [5]:
def process_sentence(sent, tags):
  for word in sent:
    vocab.add(word)
    word_freq[word] = word_freq.get(word, 0) + 1

  for idx, (word, tag) in enumerate(zip(sent, tags)):
    tag_freq[tag] = tag_freq.get(tag, 0) + 1
    word_tag_freq[(word,tag)] = word_tag_freq.get((word,tag), 0) + 1
    if (idx > 0):
      prev_tag = tags[idx - 1]
      P_given_freq[(tag, prev_tag)] = P_given_freq.get( (tag, prev_tag), 0) + 1

def get_word_tag_prob(word, tag):
	if word not in vocab:
		for tag in tag_set:
			word_tag_prob[word, tag] = 1.0 / len(tag_set)
		vocab.add(word)

	return word_tag_prob.get((word, tag), 0)

Training function

In [6]:
def train(data):
  for sent, tags in zip(data['tokenized_sentences'], data['tags']):
    process_sentence(sent, tags)

  for prev_tag in tag_set:
    for cur_tag in tag_set:
      if prev_tag not in tag_freq:
        P_given[cur_tag, prev_tag] = 1.0 / len(tag_set)
      else:
        try:
          P_given[cur_tag, prev_tag] = P_given_freq[cur_tag, prev_tag] / tag_freq[prev_tag]
        except KeyError:
          P_given[cur_tag, prev_tag] = 0

  for word in vocab:
    for tag in tag_set:
      try:
        word_tag_prob[word, tag] = word_tag_freq[word, tag] / word_freq[word]
      except KeyError:
        word_tag_prob[word, tag] = 0

Prediction function

In [7]:
def predict(sent):
	prev_state = {}
	P = {}
	for idx, word in enumerate(sent):
		P_new = {}
		if idx == 0:
			for cur_tag in tag_set:
				P_new[cur_tag] = get_word_tag_prob(word, cur_tag)
		else:
			for prev_tag in tag_set:
				for cur_tag in tag_set:
					# How may cur_tag occur after prev_tag?
					prob = P[prev_tag]
					prob *= P_given.get( (cur_tag, prev_tag), 0)  # get_P_given(cur_tag, prev_tag)
					prob *= get_word_tag_prob(word, cur_tag)

					if cur_tag not in P_new or prob > P_new[cur_tag]:
						P_new[cur_tag] = prob
						prev_state[cur_tag, idx] = prev_tag
		P = P_new
	
	final_tag = None
	for tag in tag_set:
		if final_tag is None or P[tag] > P[final_tag]:
			final_tag = tag
      
	pred = []
	pred.append(final_tag)
	cur_tag = final_tag
	for idx in range(len(sent) - 1, 0, -1):
		cur_tag = prev_state[cur_tag, idx]
		pred.append(cur_tag)
	
	pred.reverse()
	return pred

Main 5-fold training

## Train on NER-Dataset-Train.txt

In [8]:
max_accuracy = 0
classification_repo = {}
predt = []
X, Y = process('NER-Dataset-Train.txt')
data = pd.DataFrame({
    'tokenized_sentences': X,
    'tags': Y
})
print(data.head())
# prepare cross validation
kfold = KFold(5, True, 1)
final_prediction = []

# enumerate splits
for trn, test in kfold.split(data):
  
  # clear previous runs
  word_freq.clear()
  tag_freq.clear()
  word_tag_freq.clear()
  vocab.clear()
  P_given_freq.clear()
  word_tag_prob.clear()
  P_given.clear()

  pred_labels = [] 
  true_labels = [] 

  # train model for current fold
  train(data.iloc[trn])
  
  # validation for current fold
  for idx, row in data.iloc[test].iterrows():
    pred_labels.append( list( predict(row['tokenized_sentences'] )) )
    true_labels.append(list(row['tags']))
  
  accuracy = accuracy_score(true_labels, pred_labels)
  print("\n\n####################################################################################################################################")
  print("Train Sample Size \t: ", len(trn))
  print("Test Sample Size \t: ", len(test))
  # print("Classification Report:\n", classification_report(true_labels, pred_labels))
  print("Accuracy: ", accuracy)
  print("F1 Score: ", f1_score(true_labels, pred_labels))
  print("Precision Score:", precision_score(true_labels, pred_labels))
  print("Recall Score:", recall_score(true_labels, pred_labels))

  print("\nTransition probability\n")
  trans_prob = np.zeros( (n_tags, n_tags))
  for i in range(n_tags):
    for j in range(n_tags):
      trans_prob[i][j] = P_given.get((tag_set[i], tag_set[j]), 0)

  trans_prob_df = pd.DataFrame( trans_prob, columns=tag_set)
  trans_prob_df.index = tag_set
  pd.set_option("display.precision", 2)
  print(trans_prob_df)


  print("\nEmission probability\n")
  emission_prob = np.zeros( (len(vocab), n_tags))
  vocab_list = list(vocab)
  for i in range(len(vocab_list)):
    for j in range(n_tags):
      emission_prob[i][j] = word_tag_prob.get(( vocab_list[i], tag_set[j]), 0)

  emission_prob_df = pd.DataFrame( emission_prob, columns=tag_set)
  emission_prob_df.index = vocab_list
  pd.set_option("display.precision", 2)
  print(emission_prob_df)

  print("####################################################################################################################################\n")

  if accuracy > max_accuracy:
    max_accuracy = accuracy    
    classification_repo= classification_report(true_labels, pred_labels)
    predt = []
    for idx, row in datat.iterrows():
      predt.append( list( predict(row['tokenized_sentences'] )) )
    
f = open("output-HMM-NER-Dataset-Train.txt", 'w')
for pred in predt:
  for x in pred:
    f.write(x+"\n")
  f.write("\n")
f.close()

print("Max Accuracy : ", max_accuracy, ")")
print("Classification report", classification_repo)


 NER-Dataset-Train.txt
         words tags
0  @LewisDixon    O
1        Trust    O
2           me    O
3            !    O
4           im    O
5        gonna    O
6           be    O
7     bringing    O
8          out    O
9        music    O
                                 tokenized_sentences                                               tags
0  [@LewisDixon, Trust, me, !, im, gonna, be, bri...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
1  [@joshHnumber1fan, its, okay, then, .., make, ...                  [O, O, O, O, O, O, O, O, O, O, O]
2  [Asprin, ,, check, ,, cup, of, tea, ,, check, ...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3  [@angelportugues, LMAO, !, When, is, tht, one,...                     [O, O, O, O, O, O, O, O, O, O]
4  [The, Basic, Step, Before, You, Even, Start, T...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...


##################################################################################################################################

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Accuracy:  0.8529411764705882
F1 Score:  0.0
Precision Score: 0.0
Recall Score: 0.0

Transition probability

               I-geo-loc  I-person  ...  B-musicartist  I-tvshow
I-geo-loc           0.05      0.05  ...           0.05      0.05
I-person            0.05      0.05  ...           0.05      0.05
I-musicartist       0.05      0.05  ...           0.05      0.05
B-tvshow            0.05      0.05  ...           0.05      0.05
B-person            0.05      0.05  ...           0.05      0.05
I-other             0.05      0.05  ...           0.05      0.05
B-company           0.05      0.05  ...           0.05      0.05
B-movie             0.05      0.05  ...           0.05      0.05
O                   0.05      0.05  ...           0.05      0.05
B-facility          0.05      0.05  ...           0.05 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Accuracy:  0.8315040047463661
F1 Score:  0.0
Precision Score: 0.0
Recall Score: 0.0

Transition probability

               I-geo-loc  I-person  ...  B-musicartist  I-tvshow
I-geo-loc           0.05      0.05  ...           0.05      0.05
I-person            0.05      0.05  ...           0.05      0.05
I-musicartist       0.05      0.05  ...           0.05      0.05
B-tvshow            0.05      0.05  ...           0.05      0.05
B-person            0.05      0.05  ...           0.05      0.05
I-other             0.05      0.05  ...           0.05      0.05
B-company           0.05      0.05  ...           0.05      0.05
B-movie             0.05      0.05  ...           0.05      0.05
O                   0.05      0.05  ...           0.05      0.05
B-facility          0.05      0.05  ...           0.05 

## Train on NER-Dataset-10Types-Train.txt

In [9]:
max_accuracy = 0
classification_repo = {}
predt = []

X10, Y10 = process("NER-Dataset-10Types-Train.txt")
data10 = pd.DataFrame({
    'tokenized_sentences': X10,
    'tags': Y10
})
print(data10.head())
# prepare cross validation
kfold = KFold(5, True, 1)
final_prediction = []

# enumerate splits
for trn, test in kfold.split(data10):
  
  # clear previous runs
  word_freq.clear()
  tag_freq.clear()
  word_tag_freq.clear()
  vocab.clear()
  P_given_freq.clear()
  word_tag_prob.clear()
  P_given.clear()

  pred_labels = [] 
  true_labels = [] 

  # train model for current fold
  train(data10.iloc[trn])
  
  # validation for current fold
  for idx, row in data10.iloc[test].iterrows():
    pred_labels.append( list( predict(row['tokenized_sentences'] )) )
    true_labels.append(list(row['tags']))
  
  accuracy = accuracy_score(true_labels, pred_labels)
  print("\n\n####################################################################################################################################")
  print("Train Sample Size \t: ", len(trn))
  print("Test Sample Size \t: ", len(test))
  # print("Classification Report:\n", classification_report(true_labels, pred_labels))
  print("Accuracy: ", accuracy)
  print("F1 Score: ", f1_score(true_labels, pred_labels))
  print("Precision Score:", precision_score(true_labels, pred_labels))
  print("Recall Score:", recall_score(true_labels, pred_labels))

  print("\nTransition probability\n")
  trans_prob = np.zeros( (n_tags, n_tags))
  for i in range(n_tags):
    for j in range(n_tags):
      trans_prob[i][j] = P_given.get((tag_set[i], tag_set[j]), 0)

  trans_prob_df = pd.DataFrame( trans_prob, columns=tag_set)
  trans_prob_df.index = tag_set
  pd.set_option("display.precision", 2)
  print(trans_prob_df)


  print("\nEmission probability\n")
  emission_prob = np.zeros( (len(vocab), n_tags))
  vocab_list = list(vocab)
  for i in range(len(vocab_list)):
    for j in range(n_tags):
      emission_prob[i][j] = word_tag_prob.get(( vocab_list[i], tag_set[j]), 0)

  emission_prob_df = pd.DataFrame( emission_prob, columns=tag_set)
  emission_prob_df.index = vocab_list
  pd.set_option("display.precision", 2)
  print(emission_prob_df)

  print("####################################################################################################################################\n")

  if accuracy > max_accuracy:
    max_accuracy = accuracy    
    classification_repo= classification_report(true_labels, pred_labels)
    predt = []
    for idx, row in datat.iterrows():
      predt.append( list( predict(row['tokenized_sentences'] )) )
    
f = open("output-HMM-NER-Dataset-Train10types.txt", 'w')
for pred in predt:
  for x in pred:
    f.write(x+"\n")
  f.write("\n")
f.close()

print("Max Accuracy : ", max_accuracy, ")")
print("Classification report", classification_repo)


 NER-Dataset-10Types-Train.txt
         words tags
0  @LewisDixon    O
1        Trust    O
2           me    O
3            !    O
4           im    O
5        gonna    O
6           be    O
7     bringing    O
8          out    O
9        music    O
                                 tokenized_sentences                                               tags
0  [@LewisDixon, Trust, me, !, im, gonna, be, bri...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
1  [@joshHnumber1fan, its, okay, then, .., make, ...                  [O, O, O, O, O, O, O, O, O, O, O]
2  [Asprin, ,, check, ,, cup, of, tea, ,, check, ...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
3  [@angelportugues, LMAO, !, When, is, tht, one,...                     [O, O, O, O, O, O, O, O, O, O]
4  [The, Basic, Step, Before, You, Even, Start, T...  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...


##########################################################################################################################

  _warn_prf(average, modifier, msg_start, len(result))




####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Accuracy:  0.9276710684273709
F1 Score:  0.1627906976744186
Precision Score: 0.14893617021276595
Recall Score: 0.1794871794871795

Transition probability

               I-geo-loc  I-person  ...  B-musicartist  I-tvshow
I-geo-loc           0.14      0.00  ...           0.00      0.00
I-person            0.00      0.03  ...           0.00      0.00
I-musicartist       0.00      0.00  ...           0.65      0.00
B-tvshow            0.00      0.00  ...           0.00      0.00
B-person            0.00      0.00  ...           0.00      0.00
I-other             0.00      0.00  ...           0.00      0.00
B-company           0.00      0.00  ...           0.00      0.00
B-movie             0.00      0.00  ...           0.00      0.00
O                   0.86      0.90  ...           0.35      0.75
B-facilit

  _warn_prf(average, modifier, msg_start, len(result))




####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Accuracy:  0.9279145654108573
F1 Score:  0.10426540284360189
Precision Score: 0.09166666666666666
Recall Score: 0.12087912087912088

Transition probability

               I-geo-loc  I-person  ...  B-musicartist  I-tvshow
I-geo-loc           0.26      0.00  ...           0.00      0.00
I-person            0.00      0.10  ...           0.00      0.00
I-musicartist       0.00      0.00  ...           0.75      0.00
B-tvshow            0.00      0.00  ...           0.00      0.00
B-person            0.00      0.00  ...           0.00      0.00
I-other             0.00      0.00  ...           0.00      0.00
B-company           0.00      0.00  ...           0.00      0.00
B-movie             0.00      0.00  ...           0.00      0.00
O                   0.74      0.85  ...           0.25      0.71
B-facil

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      I-geo-loc  I-person  ...  B-musicartist  I-tvshow
@dolemite4                 0.00      0.00  ...           0.00      0.00
Engineers                  0.00      0.00  ...           0.00      0.00
moments                    0.00      0.00  ...           0.00      0.00
Brothers                   0.00      0.00  ...           0.00      0.00
December                   0.00      0.00  ...           0.00      0.00
...                         ...       ...  ...            ...       ...
Jaybilizer                 0.00      0.00  ...           0.00      0.00
*kisses                    0.00      0.00  ...           0.00      0.00
http://bit.ly/94sBNr       0.00      0.00  ...           0.00      0.00
jail                       0.00      0.00  ...           0.00      0.00
http://bit.ly/9XQgSr       0.05      0.05  ...           0.05      0.05

[5080 rows x 21 columns]
##############################################################################################################

# RNN-Vanilla, GRU, LSTM

In [19]:

def func1(dataset, metrics=False, variant="LSTM"):
  max_accuracy = 0
  df = pd.read_table(f"/content/{dataset}", names=["words", "tags"], skip_blank_lines=False)
  sentences = []
  tags = []

  sen = []
  ta = []

  for word, t in zip(df["words"], df["tags"]):
    if word != word: # is NaN
      sentences.append(sen)
      tags.append(ta)
      sen = []
      ta = []
    else:
      sen.append(word)
      ta.append(t)


  MAX_SEQ_LENGTH = 8
  X_padded = pad_sequences(sentences, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post", value="O", dtype=object)
  Y_padded = pad_sequences(tags, maxlen=MAX_SEQ_LENGTH, padding="post", truncating="post", value="O", dtype=object)

  X = []
  for sen in X_padded:
    sentence = []
    for word in sen:
      if word in word2vec:
        sentence.append(np.asarray(word2vec[word]).astype(np.float32))
      else:
        sentence.append(np.zeros(300).astype(np.float32))
    X.append(np.array(sentence).tolist())


  tag_encode = {}
  tag_decode = {}
  all_tags = list(set(df["tags"]))[1:]
  all_tags.append("O") # padded tag
  all_tags = list(set(all_tags))
  number_of_tags = len(all_tags)
  for i, t in enumerate(all_tags):
    tag_encode[t] = i
    tag_decode[i] = t

  Y = []
  for seq in Y_padded:
    tag_seq = []
    for t in seq:
      one_hot = [0] * number_of_tags
      one_hot[tag_encode.setdefault(t, tag_encode["O"])] = 1
      tag_seq.append(np.array(one_hot, dtype=np.float))
    Y.append(np.array(tag_seq).tolist())

  n_samples = len(X)

  data = pd.DataFrame({
      'tokenized_sentences': X,
      'tags': Y
  })
  fold = 1
  kfold = KFold(5, True, 1)
  for trn, test in kfold.split(data):
    X_train = []
    X_test = []
    Y_train = []
    Y_test = []
    
    for idx,row in  data.iloc[trn].iterrows():
      X_train.append( np.array(row["tokenized_sentences"]))
      Y_train.append( np.array(row["tags"]))
    
    for idx,row in  data.iloc[test].iterrows():
      X_test.append( np.array(row["tokenized_sentences"]))
      Y_test.append( np.array(row["tags"]))

    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    X_test = np.asarray(X_test)
    Y_test = np.asarray(Y_test)

    VALID_SIZE = 0.15
    X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, test_size=VALID_SIZE, random_state=4)

    # create architecture
    model = Sequential()
    # add an RNN layer
    num_cells = 16
    if variant == "LSTM":
      model.add(LSTM(num_cells, 
                    return_sequences=True,
                    input_shape=(MAX_SEQ_LENGTH, 300) # True - return whole sequence; False - return single output of the end of the sequence
      ))
    elif variant == "Vanilla RNN":
      model.add(SimpleRNN(num_cells, 
                    return_sequences=True,
                    input_shape=(MAX_SEQ_LENGTH, 300) # True - return whole sequence; False - return single output of the end of the sequence
      ))
    elif variant == "GRU":
      model.add(GRU(num_cells, 
                    return_sequences=True,
                    input_shape=(MAX_SEQ_LENGTH, 300) # True - return whole sequence; False - return single output of the end of the sequence
      ))
    # add time distributed (output aStart probability (π).t each sequence) layer
    model.add(TimeDistributed(Dense(number_of_tags, activation='softmax')))
    model.compile(loss = 'categorical_crossentropy', optimizer =  'adam', metrics = ['acc'])


    training = model.fit(X_train, Y_train, batch_size=4, epochs=50, validation_data=(X_validation, Y_validation), verbose=0)
    Y_pred = model.predict(X_test)

    true_labels = []
    for i in range(Y_test.shape[0]):
      labels = []
      for j in range(Y_test.shape[1]):
        tag = np.argmax(Y_test[i][j])
        assert tag <= number_of_tags
        labels.append( tag_decode[tag] )
      true_labels.append(labels)
    
    pred_labels = []
    unique_pred = set()
    for i in range(Y_pred.shape[0]):
      labels = []
      for j in range(Y_pred.shape[1]):
        tag = np.argmax(Y_pred[i][j])
        assert tag <= number_of_tags
        labels.append( tag_decode[tag] )
        unique_pred.add( tag_decode[tag] )
      pred_labels.append(labels)
    
    print("\n\n####################################################################################################################################")
    print("Train Sample Size \t: ", len(trn))
    print("Test Sample Size \t: ", len(test))
    print("Fold", fold, "for", variant)
    # print("Classification Report:\n", classification_report(true_labels, pred_labels))
    print("Accuracy: ", accuracy_score(true_labels, pred_labels))
    print("F1 Score: ", f1_score(true_labels, pred_labels))
    print("Precision Score:", precision_score(true_labels, pred_labels))
    print("Recall Score:", recall_score(true_labels, pred_labels))
    fold += 1
    print("####################################################################################################################################")

    if max_accuracy > accuracy_score(true_labels, pred_labels):
      max_accuracy = accuracy_score(true_labels, pred_labels)

      # Evaluate the test set
      for test_file in ["NER-Dataset--TestSet.txt"]:
        df_test = pd.read_table(test_file, names=["words"], skip_blank_lines=False)
        sentences = []
        tags = []

        sen = []
        ta = []

        for word in df_test["words"]:
          if word != word: # is NaN
            sentences.append(sen)
            sen = []
          else:
            sen.append(word)

        X_padded = pad_sequences(sentences, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post", value="O", dtype=object)
        X = []
        for sen in X_padded:
          sentence = []
          for word in sen:
            if word in word2vec:
              sentence.append(np.asarray(word2vec[word]).astype(np.float32))
            else:
              sentence.append(np.zeros(300).astype(np.float32))
          X.append(np.array(sentence))

        with open("output-{}-{}-trained_on_{}".format(variant, test_file, dataset ), "w") as f:
          Y_pred = model.predict(np.array(X))
          for y, sen in zip(Y_pred, sentences):
            for t, word in zip(y, sen):
              f.write(str(word))
              f.write(" ")
              index = np.argmax(t)
              f.write(all_tags[index])
              f.write("\n")
          f.write("\n")

# Train on NER-Dataset-Train.txt

In [20]:
types_of_RNN = ["LSTM","Vanilla RNN", "GRU"]
for rnn in range(3):
  func1("NER-Dataset-Train.txt", True, types_of_RNN[rnn])



####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Fold 1 for LSTM
Accuracy:  0.9417613636363636
F1 Score:  0.4273504273504274
Precision Score: 0.4807692307692308
Recall Score: 0.38461538461538464
####################################################################################################################################


####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Fold 2 for LSTM
Accuracy:  0.9630681818181818
F1 Score:  0.4523809523809524
Precision Score: 0.4523809523809524
Recall Score: 0.4523809523809524
####################################################################################################################################


#####################################################################

# Train on NER-Dataset-10Types-Train.txt

In [21]:
types_of_RNN = ["LSTM","Vanilla RNN", "GRU"]
for rnn in range(3):
  func1("NER-Dataset-10Types-Train.txt", True, types_of_RNN[rnn])



####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Fold 1 for LSTM
Accuracy:  0.9438920454545454
F1 Score:  0.375
Precision Score: 0.44680851063829785
Recall Score: 0.3230769230769231
####################################################################################################################################


####################################################################################################################################
Train Sample Size 	:  704
Test Sample Size 	:  176
Fold 2 for LSTM
Accuracy:  0.9517045454545454
F1 Score:  0.26506024096385544
Precision Score: 0.2682926829268293
Recall Score: 0.2619047619047619
####################################################################################################################################


#################################################################################

In [13]:
!pwd

/content
