In [1]:
# !pip install git+https://github.com/howl-anderson/tf_crf_layer.git\


In [26]:
from copy import deepcopy
import csv
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Activation, Flatten, Input, Concatenate
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa 
# from tf_crf_layer.layer import CRF
# from tf_crf_layer.loss import crf_loss
# from tf_crf_layer.metrics import crf_accuracy
# from tf_crf_layer.crf_static_constraint_helper import allowed_transitions

In [7]:
threshold = 0.2
hidden_size = 100
word_embedding_dim = 300
class_labels_number = 5
num_text = 5

labeled_dataset_size = 1830
train_dataset_size = 900
validation_dataset_size = 100
test_dataset_size = 830
unlabeled_dataset_size = 0

pos_list = np.char.lower(["ADJ","ADP","ADV","AUX","CONJ","DET","INTJ","NOUN","NUM","PART","PRON","PROPN","PUNCT","SCONJ","SYM","VERB","X"])
dep_list = np.char.lower(["ROOT", "acl", "acomp", "advcl", "advmod", "agent", "amod", "appos", "attr", "aux", "auxpass", "case", "cc", "ccomp", "compound", "conj", "csubj", "csubjpass", "dative", "dep", "det", "dobj", "expl", "intj", "mark", "meta", "neg", "nmod", "npadvmod", "nsubj", "nsubjpass", "nummod", "oprd", "parataxis", "pcomp", "pobj", "poss", "preconj", "predet", "prep", "prt", "punct", "quantmod", "relcl", "xcomp"])

pos_dim = len(pos_list)
dep_dim = len(dep_list)

In [8]:
def one_hot(vec, dic):
    vec = np.char.lower(vec)
    return np.array([dic == row for row in vec], dtype='i1')

In [29]:
labeled_dataset = []
unlabeled_dataset = []

train_dataset = []
validation_dataset = []
test_dataset = []

for i in range(1, labeled_dataset_size + 1):
    filename = "processed-labeled-tweet-{}.csv".format(i)
    if os.path.exists(filename):
        with open(filename, newline='') as csvfile:
            spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            data = [tuple(x) for x in spamreader]
            data = np.array(data, dtype=([("text", 'U20'),("simplified_text", 'U20'), ("best_match", 'U20'), ("index", int), ("pos", 'U20'), ("dep", 'U20'), ("stop", 'U5'), ("label", 'i1')]))
            labeled_dataset.append(data)


for i in range(len(labeled_dataset)):
    tweet = labeled_dataset[i]
    text = tf.reshape(tweet["index"], (1, -1, 1))
    pos = tf.reshape(one_hot(tweet["pos"], pos_list), (1, -1, pos_dim))
    dep = tf.reshape(one_hot(tweet["dep"], dep_list), (1, -1, dep_dim))
    label = tf.reshape(tf.one_hot(tweet["label"], 2), (1, -1, 2))
    train_dataset.append((np.concatenate((text, pos, dep), axis=-1), label))

validation_dataset = train_dataset[901:1001]
test_dataset = train_dataset[1001:]
train_dataset = train_dataset[:901]

# for i in range(1, 5):
#     filename = "processed-tweet-{}.csv".format(i)
#     with open(filename, newline='') as csvfile:
#         spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#         unlabeled_dataset.append(np.array(list(spamreader)))

In [30]:
# tag_decoded_labels = {0:"B", 1:"I", 2:"O", 3:"U", 4:"L"}
# constraints = allowed_transitions("BIOUL", tag_decoded_labels)

inputs = Input(shape=(None, pos_dim+dep_dim+1))
x = Embedding(380000, word_embedding_dim)(inputs[:,:,0])
x = Concatenate(axis=-1)([inputs[:,:,1:], x])
x = Bidirectional(LSTM(100, return_sequences=True))(x)
outputs = Dense(2, activation=tf.nn.sigmoid)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# model = Sequential()
# model.add(Embedding(380000, word_embedding_dim, input_shape=(None,)))
# model.add(Bidirectional(LSTM(100, return_sequences=True)))
# model.add(Dense(2))
# model.add(Activation('sigmoid'))
# TODO: add constraints to CRF layer
# model.add(CRF(class_labels_number))
model.summary()

opt = tf.keras.optimizers.Adam(
    learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False
)
# model.compile(loss=BinaryCrossentropy(), optimizer=opt)
model.compile(loss=BinaryCrossentropy(),metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()], optimizer=opt)
# model.fit(np.array([[0,1], [3,2]]), np.array([[[0,1], [1,0]], [[1,0], [1,0]]]))

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None, 63)]   0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice_4 (Te [(None, None)]       0           input_3[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_5 (Te [(None, None, 62)]   0           input_3[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    114000000   tf_op_layer_strided_slice_4[0][0]
_______________________________________________________________________________________

In [31]:
# for i in range(1, 5):
#   filename = "processed-labeled-tweet-{}.csv".format(i)
#   with open(filename, newline='') as csvfile:
#       spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#       processed_text = np.array(list(spamreader))
#   labels = np.array(processed_text[:,-1], dtype=float)
#   print(tf.reshape(tf.one_hot(labels, 2), (-1, 1, 2)))
#   c = tf.one_hot(labels, 2)
#   model.fit(np.array([0,1,100,3,1,4,2,4,1,5]), tf.reshape(tf.one_hot(labels, 2), (-1, 1, 2)))

In [32]:
def train_step(tweet):
    x,y = tweet[0], tweet[1]
    model.fit(x, y)

In [45]:
def eval_step(tweet):
    x,y = tweet[0], tweet[1]
    # model.evaluate(np.array([0,1,100,3,1,4,2,4,1,5]), tf.reshape(tf.one_hot(labels, 2), (-1, 1, 2)))
    return model.evaluate(x, y)

In [34]:
def predict_step(tweet):
    return model.predict(tweet)

In [35]:
def neg_log(prediction):
    return -np.mean(np.log(np.amax(prediction, axis=-1)))

In [36]:
def semi_supervised():
    for tweet in train_dataset:
        train_step(tweet)
    for tweet in validation_dataset:
        eval_step(tweet)
    for tweet in unlabeled_dataset:
        prediction = predict_step(tweet)
        if neg_log(prediction) < threshold:
            print("here")
            # label = 
    # for tweet in validation_dataset:
    #     eval_step(tweet)

In [37]:
semi_supervised()



In [48]:
sum = 0
p,r=0,0
for tweet in test_dataset:
    eval = eval_step(tweet)
    precision, recall = eval[0], eval[1]
    p+=precision
    r+=recall
print(p/len(test_dataset))
print(r/len(test_dataset))

0.5083945582790131
0.7677753240039403


In [49]:
2/(1/0.508 + 1/0.768)

0.6115109717868339