In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
%matplotlib inline
import os
import pandas as pd

In [None]:
# Load the corpus: one row per sentence, with the token list and the tag list
# both stored as the string repr of a Python list.
df = pd.read_csv('WSJ_treebank_corpus.csv')
df.head(10)

Unnamed: 0,tokenized_sentences,tags
0,"['Pierre', 'Vinken', ',', '61', 'years', 'old'...","['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'M..."
1,"['Mr.', 'Vinken', 'is', 'chairman', 'of', 'Els...","['NNP', 'NNP', 'VBZ', 'NN', 'IN', 'NNP', 'NNP'..."
2,"['Rudolph', 'Agnew', ',', '55', 'years', 'old'...","['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', 'CC', '..."
3,"['A', 'form', 'of', 'asbestos', 'once', 'used'...","['DT', 'NN', 'IN', 'NN', 'RB', 'VBN', '-NONE-'..."
4,"['The', 'asbestos', 'fiber', ',', 'crocidolite...","['DT', 'NN', 'NN', ',', 'NN', ',', 'VBZ', 'RB'..."
5,"['Lorillard', 'Inc.', ',', 'the', 'unit', 'of'...","['NNP', 'NNP', ',', 'DT', 'NN', 'IN', 'JJ', 'J..."
6,"['Although', 'preliminary', 'findings', 'were'...","['IN', 'JJ', 'NNS', 'VBD', 'VBN', '-NONE-', 'R..."
7,"['A', 'Lorillard', 'spokewoman', 'said', ',', ...","['DT', 'NNP', 'NN', 'VBD', ',', '``', 'DT', 'V..."
8,"['We', ""'re"", 'talking', 'about', 'years', 'ag...","['PRP', 'VBP', 'VBG', 'IN', 'NNS', 'IN', 'IN',..."
9,"['There', 'is', 'no', 'asbestos', 'in', 'our',...","['EX', 'VBZ', 'DT', 'NN', 'IN', 'PRP$', 'NNS',..."


In [None]:
def strToVec(row):
    """Parse the string repr of a Python list of strings into a NumPy array.

    The CSV stores every sentence as text such as "['Pierre', ',', '61']".
    Parsing it as a Python literal (rather than slicing off brackets and
    quotes by hand) keeps escaped characters intact, copes with tokens that
    contain ", ", and turns "[]" into an empty array instead of [''].

    Parameters
    ----------
    row : str
        Text of a Python list literal whose items are strings.

    Returns
    -------
    numpy.ndarray
        1-D string array with one entry per list item.

    Raises
    ------
    ValueError, SyntaxError
        If `row` is not a valid Python literal.
    """
    import ast  # local import keeps this cell self-contained

    return np.array(ast.literal_eval(row), dtype=str)

# Both list-valued columns are stored as text; convert each cell to an array.
for column in ('tokenized_sentences', 'tags'):
    df[column] = df[column].apply(strToVec)

In [None]:
# Tags (punctuation, symbols, cardinal numbers) that are dropped from the dataset.
remove_labels = ['``', '.', '$', ',', ':', "''", '#', 'CD']

# Collect every tag that occurs in the corpus, minus the removed ones.
# The result is sorted because set iteration order for strings changes between
# interpreter sessions; without sorting, the label order (and therefore the ids
# the tokenizer assigns later) would differ from run to run.
all_labels = {tag for row in df['tags'] for tag in row}
unique_labels = np.array(sorted(all_labels - set(remove_labels)))

In [None]:
def removePuncLabels(x, y, labels_to_remove=None):
    """Drop the (token, tag) pairs of one sentence whose tag is unwanted.

    Parameters
    ----------
    x : sequence
        Tokens of the sentence.
    y : sequence
        Tags of the sentence, aligned with `x`.
    labels_to_remove : collection, optional
        Tags to filter out. Defaults to the notebook-level `remove_labels`,
        which is what the original two-argument call used.

    Returns
    -------
    list
        `[token, tag]` lists for every position whose tag is kept, in order.

    Raises
    ------
    IndexError
        If `x` is shorter than `y` at a kept position.
    """
    if labels_to_remove is None:
        labels_to_remove = remove_labels
    return [[x[pos], tag] for pos, tag in enumerate(y) if tag not in labels_to_remove]

# Remove the tags listed in remove_labels, and their tokens, from every sentence.
# From here on `df` is a plain list of sentences, each a list of [token, tag] pairs.
df = [removePuncLabels(x, y) for x, y in zip(df['tokenized_sentences'], df['tags'])]
# Sentences differ in length, so request an object array explicitly: letting
# NumPy infer a dtype for ragged input only warns on older releases and raises
# ValueError on recent ones.
df_back = np.array(df, dtype=object)

  return array(a, order=order, subok=subok, copy=True)


In [None]:
# Location of the pretrained GoogleNews vectors. Set the WORD2VEC_PATH
# environment variable to override the original author's local path.
word2vec_path = os.environ.get('WORD2VEC_PATH', '/home/rg99/GoogleNews-vectors-negative300.bin')
model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Vector used for out-of-vocabulary tokens: 300 entries, 1 in the first slot, 0 elsewhere
UNK = np.eye(1, 300)[0]

In [None]:
# Replace every token by its word2vec vector, falling back to UNK for tokens
# the pretrained model does not know. Membership is tested on the KeyedVectors
# object itself: the `.vocab` attribute used previously no longer exists in gensim 4.
df = [
    [(model[token], tag) if token in model else (UNK, tag) for token, tag in row]
    for row in df
]
len(df)

3914

In [None]:
# Drop the reference to the word2vec vectors; every token has already been
# converted above. The name `model` is re-bound to a Keras network further down.
del model

In [None]:
# Longest sentence in tokens; the padding cell below extends every sentence to this length
max_sequence_len = max(len(sentence) for sentence in df)
max_sequence_len

178

In [None]:
# Average sentence length, for comparison with the maximum above
sentence_lengths = [len(sentence) for sentence in df]
mean_sequence_len = np.mean(sentence_lengths)
mean_sequence_len

21.885794583546243

In [None]:
# Padding entry: an all-zero vector labelled 'Padding'. The same object is
# appended repeatedly, in place, until each sentence reaches max_sequence_len.
pdd = [np.zeros(300), 'Padding']
for sentence in df:
    sentence.extend([pdd] * (max_sequence_len - len(sentence)))

In [None]:
# Report (sentences, positions, fields per position) without materialising an
# array: each position pairs a 300-d vector with a string, which NumPy treats
# as ragged input (an error on recent releases), and the temporary would be huge.
assert all(len(sentence) == max_sequence_len for sentence in df), "padding incomplete"
len(df), max_sequence_len, len(df[0][0])

(3914, 178, 2)

In [None]:
# Positional slices of the padded dataset (first 2600 sentences / the rest).
train_data = df[:2600]
test_data  = df[2600:]

# Inputs: one 300-d vector per position. Targets: the tag string per position,
# with a trailing axis of size 1. The shape is derived from the data rather
# than hard-coded as (3914, 178, 1).
data_in = np.array([[word[0] for word in row] for row in df])
data_out = np.array([[word[1] for word in row] for row in df])[..., np.newaxis]
data = zip(data_in, data_out)
x_data = tf.constant(data_in)

# Map tag strings to integer ids; 'Padding' is added as an extra label.
# NOTE(review): with its default settings the Tokenizer lower-cases text and
# strips punctuation, so tags that differ only by punctuation (e.g. 'PRP$' vs
# 'PRP', 'WP$' vs 'WP') appear to be merged into a single id. The per-class
# report at the end of the notebook lists no '$' variants. Passing
# filters='' and lower=False would keep them apart, but downstream code that
# sizes arrays by label count must be checked first.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(np.append(unique_labels, 'Padding'))
tokenizer.texts_to_sequences(["Padding", "PRP"])

[[37], [2]]

In [None]:
# Convert tag strings to tokenizer ids. Each distinct tag is tokenised once and
# then looked up, instead of calling the tokenizer for every position of every
# sentence. The resulting values are the same.
label_to_ids = {
    label: tokenizer.texts_to_sequences([label])[0]
    for label in np.unique(data_out)
}
y_data = tf.constant([[label_to_ids[word[0]] for word in row] for row in data_out])
data_out.shape

(3914, 178, 1)

## 1st fold: train on sentences 0–2599, test on sentences 2600–3913

In [None]:
# Fold 1: everything before the boundary is training data, the rest is test data.
fold_boundary = 2600
X_train, y_train = x_data[:fold_boundary], y_data[:fold_boundary]
X_test, y_test = x_data[fold_boundary:], y_data[fold_boundary:]
X_train.shape, y_train.shape

(TensorShape([2600, 178, 300]), TensorShape([2600, 178, 1]))

In [None]:
# The softmax needs one unit per label id, not one per time step. Tokenizer ids
# start at 1 (0 is never assigned), so the largest id is len(word_index).
num_classes = len(tokenizer.word_index) + 1

# Two stacked GRUs emitting one output per position, followed by a per-position
# softmax over the label ids.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[max_sequence_len, 300], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(num_classes, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Returns [loss, accuracy]. The model has no masking layer, so these figures
# are computed over all positions, including the ones labelled 'Padding'.
model.evaluate(X_test, y_test)



[0.06628849357366562, 0.9798668026924133]

In [None]:
# `predict_classes` is deprecated; take the arg-max over the softmax axis
# instead, which yields the same class ids.
Y_pred_1 = np.argmax(model.predict(X_test), axis=-1)
Y_pred_1

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([[21, 10, 36, ..., 37, 37, 37],
       [24,  5, 21, ..., 37, 37, 37],
       [24,  2,  5, ..., 37, 37, 37],
       ...,
       [ 9,  9,  9, ..., 37, 37, 37],
       [19, 35, 24, ..., 37, 37, 37],
       [ 9, 35, 23, ..., 37, 37, 37]])

In [None]:
# One predicted label id per position of every test sentence
Y_pred_1.shape

(1314, 178)

In [None]:
# Integer id the tokenizer assigned to the 'Padding' label
(padding_ids,) = tokenizer.texts_to_sequences(["Padding"])
pad_marker = padding_ids[0]
pad_marker

37

In [None]:
def evaluate_accuracy(y_pred, y_true, y_original, pad_value=None):
    """Mean per-sentence tagging accuracy, ignoring the padded tail.

    Each sentence is cut at the first position whose true label equals the
    padding id. The fraction of matching labels is computed per sentence and
    those fractions are averaged.

    Parameters
    ----------
    y_pred : array-like, shape (sentences, positions)
        Predicted label ids.
    y_true : array-like, shape (sentences, positions) or (sentences, positions, 1)
        True label ids; NumPy arrays and eager tensors are both accepted.
    y_original : any
        Unused; kept so existing three-argument calls keep working.
    pad_value : int, optional
        Label id that marks padding. Defaults to the notebook-level `pad_marker`.

    Returns
    -------
    float
        Average of the per-sentence accuracies. Sentences with no real tokens
        are skipped (they previously caused a ZeroDivisionError). NaN is
        returned when no sentence can be scored.
    """
    if pad_value is None:
        pad_value = pad_marker

    # Convert once up front; indexing tensors element by element is very slow.
    per_sentence = []
    for pred_row, true_row in zip(np.asarray(y_pred), np.asarray(y_true)):
        true_row = true_row.reshape(-1)
        pad_positions = np.flatnonzero(true_row == pad_value)
        length = pad_positions[0] if pad_positions.size else len(true_row)
        if length == 0:
            continue  # sentence consists of padding only
        per_sentence.append(np.mean(pred_row[:length] == true_row[:length]))

    return np.mean(per_sentence) if per_sentence else float("nan")

In [None]:
# Mean per-sentence accuracy over the positions before the first padding label.
# The third argument (y_original) is not read by evaluate_accuracy, hence None.
evaluate_accuracy(Y_pred_1, y_test, None) # accuracy without considering the padding label

0.8220390922548225

In [None]:
# Keep fold 1's test labels under their own name; `y_test` is re-bound by the next fold
y_test_1 = y_test

In [None]:
# Fold 2: the first 1300 sentences are the test set, the rest is training data.
X_train, X_test = x_data[1300:], x_data[:1300]
y_train, y_test = y_data[1300:], y_data[:1300]

# The softmax needs one unit per label id, not one per time step. Tokenizer ids
# start at 1 (0 is never assigned), so the largest id is len(word_index).
num_classes = len(tokenizer.word_index) + 1

# Same architecture as the other folds, trained from scratch.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[max_sequence_len, 300], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(num_classes, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## 2nd fold: evaluation (test on sentences 0–1299; the model was trained in the previous cell)

In [None]:
# Returns [loss, accuracy]. The model has no masking layer, so these figures
# are computed over all positions, including the ones labelled 'Padding'.
model.evaluate(X_test, y_test)



[0.07621990889310837, 0.9774935245513916]

In [None]:
# `predict_classes` is deprecated; take the arg-max over the softmax axis
# instead, which yields the same class ids.
Y_pred_2 = np.argmax(model.predict(X_test), axis=-1)
Y_pred_2

array([[ 9,  9, 19, ..., 37, 37, 37],
       [ 9,  9,  5, ..., 37, 37, 37],
       [ 9,  9, 19, ..., 37, 37, 37],
       ...,
       [36, 21, 14, ..., 37, 37, 37],
       [36,  9,  2, ..., 37, 37, 37],
       [ 9,  9,  5, ..., 37, 37, 37]])

In [None]:
# One predicted label id per position of every test sentence
Y_pred_2.shape

(1300, 178)

In [None]:
# Mean per-sentence accuracy over the positions before the first padding label.
# The third argument (y_original) is not read by evaluate_accuracy, hence None.
evaluate_accuracy(Y_pred_2, y_test, None) # accuracy without considering the padding label

0.8124230519056513

In [None]:
# Keep fold 2's test labels under their own name; `y_test` is re-bound by the next fold
y_test_2 = y_test

In [None]:
# Fold 3: the middle block is the test set; the two outer blocks are joined for training.
X_test, y_test = x_data[1300:2600], y_data[1300:2600]
X_train = np.concatenate([x_data[:1300], x_data[2600:]], axis=0)
y_train = np.concatenate([y_data[:1300], y_data[2600:]], axis=0)

## 3rd fold: test on sentences 1300–2599, train on the rest (split in the previous cell)

In [None]:
# Sanity check: inputs and targets of the training split have matching leading dimensions
X_train.shape, y_train.shape

((2614, 178, 300), (2614, 178, 1))

In [None]:
# The softmax needs one unit per label id, not one per time step. Tokenizer ids
# start at 1 (0 is never assigned), so the largest id is len(word_index).
num_classes = len(tokenizer.word_index) + 1

# Same architecture as the other folds, trained from scratch.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[max_sequence_len, 300], dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(num_classes, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Returns [loss, accuracy]. The model has no masking layer, so these figures
# are computed over all positions, including the ones labelled 'Padding'.
model.evaluate(X_test, y_test)



[0.06878931075334549, 0.9792351126670837]

In [None]:
# `predict_classes` is deprecated; take the arg-max over the softmax axis
# instead, which yields the same class ids.
Y_pred_3 = np.argmax(model.predict(X_test), axis=-1)
Y_pred_3

array([[19, 23,  6, ..., 37, 37, 37],
       [21, 10, 36, ..., 37, 37, 37],
       [21, 10,  5, ..., 37, 37, 37],
       ...,
       [23,  5, 36, ..., 37, 37, 37],
       [21, 14, 10, ..., 37, 37, 37],
       [10, 24, 19, ..., 37, 37, 37]])

In [None]:
# Predictions have one id per position; the true labels carry an extra trailing axis of size 1
Y_pred_3.shape, y_test.shape

((1300, 178), TensorShape([1300, 178, 1]))

In [None]:
# Mean per-sentence accuracy over the positions before the first padding label.
# The third argument (y_original) is not read by evaluate_accuracy, hence None.
evaluate_accuracy(Y_pred_3, y_test, None) # accuracy without considering the padding label

0.8386474426992111

In [None]:
# Keep fold 3's test labels under their own name; they feed the per-class report below
y_test_3 = y_test

In [None]:
def class_wise_accuracy(y_pred, y_true, num_classes=None, pad_value=None):
    """Per-label accuracy over the non-padded part of every sentence.

    For each label id, counts how often it occurs as the true label and how
    often the prediction at those positions is correct. Both count lists are
    printed, then the per-label ratios are returned.

    Parameters
    ----------
    y_pred : array-like, shape (sentences, positions)
        Predicted label ids.
    y_true : array-like, shape (sentences, positions) or (sentences, positions, 1)
        True label ids; NumPy arrays and eager tensors are both accepted.
    num_classes : int, optional
        Length of the returned list (largest label id + 1). Defaults to the
        id range of the notebook-level `tokenizer`. The counters used to be
        sized by `len(unique_labels)`, which matches that range only by
        coincidence.
    pad_value : int, optional
        Label id that marks padding. Defaults to the notebook-level `pad_marker`.

    Returns
    -------
    list
        Entry i is correct/total for label id i, or 0 when the label never
        occurs as a true label.
    """
    if pad_value is None:
        pad_value = pad_marker
    if num_classes is None:
        num_classes = len(tokenizer.word_index) + 1  # ids start at 1; slot 0 stays empty

    match_count = [0] * num_classes
    total_count = [0] * num_classes

    for pred_row, true_row in zip(np.asarray(y_pred), np.asarray(y_true)):
        true_row = true_row.reshape(-1)

        # Everything from the first padding label onwards is ignored.
        pad_positions = np.flatnonzero(true_row == pad_value)
        length = pad_positions[0] if pad_positions.size else len(true_row)

        for predicted, actual in zip(pred_row[:length], true_row[:length]):
            total_count[actual] += 1
            if predicted == actual:
                match_count[actual] += 1

    print(match_count)
    print(total_count)
    return [hits / total if total != 0 else 0 for hits, total in zip(match_count, total_count)]

In [None]:
# Per-label accuracy for fold 3; the function also prints the raw match/total counts
class_wise_count = class_wise_accuracy(Y_pred_3, y_test_3)

[0, 79, 927, 0, 560, 686, 408, 0, 0, 3078, 4081, 818, 0, 0, 1500, 429, 0, 0, 0, 1968, 322, 2412, 400, 1828, 901, 0, 0, 356, 11, 2, 113, 101, 0, 131, 0, 796, 2893, 0]
[0, 104, 931, 0, 728, 771, 782, 59, 2, 3220, 4443, 953, 40, 9, 1968, 531, 1, 37, 17, 2106, 324, 2918, 532, 2261, 1087, 34, 47, 817, 74, 56, 225, 128, 8, 156, 30, 868, 3331, 0]


In [None]:
# Translate every label id back to its (tokenizer-normalised) name in one call,
# then print one "name ---> accuracy" line per id.
class_names = tokenizer.sequences_to_texts([[class_id] for class_id in range(len(class_wise_count))])
for class_name, score in zip(class_names, class_wise_count):
    print(class_name, '\t\t--->', score)

 		---> 0
wp 		---> 0.7596153846153846
prp 		---> 0.9957035445757251
sym 		---> 0
vbn 		---> 0.7692307692307693
vbz 		---> 0.8897535667963683
to 		---> 0.5217391304347826
wrb 		---> 0.0
uh 		---> 0.0
nnp 		---> 0.9559006211180124
nn 		---> 0.9185235201440468
vb 		---> 0.8583420776495279
rbr 		---> 0.0
pdt 		---> 0.0
jj 		---> 0.7621951219512195
vbg 		---> 0.807909604519774
fw 		---> 0.0
rrb 		---> 0.0
rbs 		---> 0.0
nns 		---> 0.9344729344729344
md 		---> 0.9938271604938271
dt 		---> 0.8265935572309802
vbp 		---> 0.7518796992481203
none 		---> 0.8084918177797434
rb 		---> 0.828886844526219
lrb 		---> 0.0
jjs 		---> 0.0
cc 		---> 0.4357405140758874
rp 		---> 0.14864864864864866
nnps 		---> 0.03571428571428571
pos 		---> 0.5022222222222222
jjr 		---> 0.7890625
ls 		---> 0.0
wdt 		---> 0.8397435897435898
ex 		---> 0.0
vbd 		---> 0.9170506912442397
in 		---> 0.8685079555688983
padding 		---> 0
