<a href="https://colab.research.google.com/github/g95wang/twitter_keyword_extraction/blob/main/rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install git+https://github.com/howl-anderson/tf_crf_layer.git


In [2]:
import zipfile
# Unpack the labeled-tweet archive into the current working directory.
with zipfile.ZipFile("processed-data-labeled.zip", mode="r") as archive:
    archive.extractall(path="./")

In [6]:
from copy import deepcopy
import csv
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Activation, Flatten, Input, Concatenate
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow_addons as tfa 
# from tf_crf_layer.layer import CRF
# from tf_crf_layer.loss import crf_loss
# from tf_crf_layer.metrics import crf_accuracy
# from tf_crf_layer.crf_static_constraint_helper import allowed_transitions

In [7]:
# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
threshold = 0.2            # cut-off compared against neg_log() of a prediction
hidden_size = 100          # recurrent hidden width (presumably for the LSTM -- confirm)
word_embedding_dim = 300   # output width of the word Embedding layer
class_labels_number = 5    # not referenced in the visible cells
num_text = 5               # not referenced in the visible cells
epochs = 10                # passes over the training set in semi_supervised()

# ---------------------------------------------------------------------------
# Dataset sizes (train + validation + test add up to the labeled total)
# ---------------------------------------------------------------------------
labeled_dataset_size = 1830      # highest labeled-tweet file index that is probed
train_dataset_size = 900
validation_dataset_size = 100
test_dataset_size = 830
unlabeled_dataset_size = 0

# ---------------------------------------------------------------------------
# Tag vocabularies, lower-cased so lookups can be case-insensitive
# ---------------------------------------------------------------------------
# Part-of-speech tags.
pos_list = np.char.lower([
    "ADJ",
    "ADP",
    "ADV",
    "AUX",
    "CONJ",
    "DET",
    "INTJ",
    "NOUN",
    "NUM",
    "PART",
    "PRON",
    "PROPN",
    "PUNCT",
    "SCONJ",
    "SYM",
    "VERB",
    "X",
])

# Dependency-relation tags.
dep_list = np.char.lower([
    "ROOT",
    "acl",
    "acomp",
    "advcl",
    "advmod",
    "agent",
    "amod",
    "appos",
    "attr",
    "aux",
    "auxpass",
    "case",
    "cc",
    "ccomp",
    "compound",
    "conj",
    "csubj",
    "csubjpass",
    "dative",
    "dep",
    "det",
    "dobj",
    "expl",
    "intj",
    "mark",
    "meta",
    "neg",
    "nmod",
    "npadvmod",
    "nsubj",
    "nsubjpass",
    "nummod",
    "oprd",
    "parataxis",
    "pcomp",
    "pobj",
    "poss",
    "preconj",
    "predet",
    "prep",
    "prt",
    "punct",
    "quantmod",
    "relcl",
    "xcomp",
])

# Widths of the one-hot encodings built from the two vocabularies above.
pos_dim = len(pos_list)
dep_dim = len(dep_list)

In [8]:
def one_hot(vec, dic):
    """One-hot encode a sequence of tags against a tag vocabulary.

    Args:
        vec: sequence of tag strings; they are lower-cased before matching.
        dic: NumPy array of vocabulary tags to compare against (compared
            as-is, so it should already be lower-case).

    Returns:
        An int8 array with one row per element of ``vec`` and one column per
        entry of ``dic``; a cell is 1 where the tag equals the vocabulary
        entry and 0 otherwise. Tags missing from ``dic`` give an all-zero row.
    """
    lowered = np.char.lower(vec)
    matches = []
    for tag in lowered:
        # Element-wise comparison marks the vocabulary slot(s) equal to this tag.
        matches.append(dic == tag)
    return np.array(matches, dtype='i1')

In [9]:
labeled_dataset = []
unlabeled_dataset = []

train_dataset = []
validation_dataset = []
test_dataset = []

# Column layout of a processed tweet CSV: one row per token.
tweet_dtype = [
    ("text", 'U20'),
    ("simplified_text", 'U20'),
    ("best_match", 'U20'),
    ("index", int),
    ("pos", 'U20'),
    ("dep", 'U20'),
    ("stop", 'U5'),
    ("label", 'i1'),
]

# Read every labeled tweet file that exists; missing indices are skipped.
for i in range(1, labeled_dataset_size + 1):
    filename = "processed-data-labeled/processed-labeled-tweet-{}.csv".format(i)
    if not os.path.exists(filename):
        continue
    with open(filename, newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        data = np.array([tuple(x) for x in spamreader], dtype=tweet_dtype)
    # Files without any rows are dropped.
    if len(data):
        labeled_dataset.append(data)

# Turn each tweet into a (features, label) pair with a leading batch axis of 1.
# Feature layout per token: [word index | one-hot POS | one-hot dependency].
examples = []
for tweet in labeled_dataset:
    text = tf.reshape(tweet["index"], (1, -1, 1))
    pos = tf.reshape(one_hot(tweet["pos"], pos_list), (1, -1, pos_dim))
    dep = tf.reshape(one_hot(tweet["dep"], dep_list), (1, -1, dep_dim))
    label = tf.reshape(tf.one_hot(tweet["label"], 2), (1, -1, 2))
    examples.append((np.concatenate((text, pos, dep), axis=-1), label))

# Split using the sizes declared in the config cell. The previous hard-coded
# boundaries (901 / 1001) were off by one relative to train_dataset_size and
# validation_dataset_size. Everything after the validation slice is test data.
train_end = train_dataset_size
validation_end = train_end + validation_dataset_size
train_dataset = examples[:train_end]
validation_dataset = examples[train_end:validation_end]
test_dataset = examples[validation_end:]

# TODO: loading of the unlabeled tweets is not enabled yet.
# for i in range(1, 5):
#     filename = "processed-tweet-{}.csv".format(i)
#     with open(filename, newline='') as csvfile:
#         spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
#         unlabeled_dataset.append(np.array(list(spamreader)))

In [10]:
# Token-level tagger: word embedding + one-hot syntactic features -> BiLSTM -> 2 sigmoid units.
# Each input row is [word index | one-hot POS | one-hot dependency], so the
# feature width is 1 + pos_dim + dep_dim; the sequence length is left variable.
vocab_size = 380000  # NOTE(review): must exceed the largest value in the "index" column -- confirm

inputs = Input(shape=(None, pos_dim+dep_dim+1))
word_ids = inputs[:, :, 0]             # column 0 carries the word index
syntactic_features = inputs[:, :, 1:]  # remaining columns are the one-hot tags

x = Embedding(vocab_size, word_embedding_dim)(word_ids)
x = Concatenate(axis=-1)([syntactic_features, x])
# Use the configured hidden size instead of a hard-coded literal so the
# config cell is the single place to tune the recurrent width.
x = Bidirectional(LSTM(hidden_size, return_sequences=True))(x)
outputs = Dense(2, activation=tf.nn.sigmoid)(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.summary()

opt = tf.keras.optimizers.Adam(
    learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False
)
# NOTE(review): Precision()/Recall() without class_id score both output units
# together; pass class_id if only one class should be measured -- confirm intent.
model.compile(
    loss=BinaryCrossentropy(),
    metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
    optimizer=opt,
)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 63)]   0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, None)]       0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, None, 62)]   0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    114000000   tf_op_layer_strided_slice[0][0]  
_______________________________________________________________________________________

In [12]:
def train_step(tweet):
    """Fit the global ``model`` on a single example.

    Args:
        tweet: indexable pair whose element 0 is the feature array and
            element 1 is the label array.
    """
    features = tweet[0]
    labels = tweet[1]
    model.fit(features, labels, verbose=1)

In [13]:
def eval_step(tweet):
    """Evaluate the global ``model`` on a single example.

    Args:
        tweet: indexable pair whose element 0 is the feature array and
            element 1 is the label array.

    Returns:
        Whatever ``model.evaluate`` returns for this example (progress output
        is silenced).
    """
    features = tweet[0]
    labels = tweet[1]
    scores = model.evaluate(features, labels, verbose=0)
    return scores

In [14]:
def predict_step(tweet):
    """Run the global ``model`` forward on ``tweet`` and return its predictions."""
    prediction = model.predict(tweet)
    return prediction

In [15]:
def neg_log(prediction):
    """Return the mean negative log of the highest score along the last axis.

    Args:
        prediction: array-like of scores; the maximum is taken over the last
            axis, so lower return values mean higher top scores.

    Returns:
        Scalar: the negated mean of the natural log of each maximum.
    """
    top_scores = np.amax(prediction, axis=-1)
    log_top_scores = np.log(top_scores)
    return -np.mean(log_top_scores)

In [16]:
# Per-epoch validation precision, recall and F1, appended by semi_supervised().
p, r, f = [], [], []

def semi_supervised():
    """Train for ``epochs`` epochs, validating after each one.

    Every epoch:
      1. fits the model on each example of ``train_dataset`` in turn;
      2. evaluates each example of ``validation_dataset`` and appends the
         token-weighted mean precision, recall and the resulting F1 to the
         module-level lists ``p``, ``r`` and ``f``;
      3. scans ``unlabeled_dataset`` for predictions whose ``neg_log`` score
         is below ``threshold`` (pseudo-labelling itself is not implemented).

    Returns:
        None. Results are reported through ``p``, ``r``, ``f`` and stdout.
    """
    for epoch in range(epochs):

        print("Training epoch {}".format(epoch+1))
        for count, tweet in enumerate(train_dataset):
            if count % 100 == 0:
                print("Training iter {}".format(count))
            train_step(tweet)

        print("Validation")
        total = 0
        precision, recall = 0, 0
        for tweet in validation_dataset:
            res = eval_step(tweet)
            # Each example is a (features, label) tuple; the features have a
            # leading batch axis, so axis 1 is the number of tokens.
            tokens = tweet[0].shape[1]
            total += tokens
            # model.evaluate returns the loss first, then the compiled metrics
            # (Precision, Recall), so the metrics live at indices 1 and 2.
            # Weight by token count so dividing by `total` is a true average.
            precision += res[1] * tokens
            recall += res[2] * tokens

        # Guard against an empty validation set and against P + R == 0.
        mean_precision = precision / total if total else 0.0
        mean_recall = recall / total if total else 0.0
        pr_sum = mean_precision + mean_recall
        f1 = 2 * mean_precision * mean_recall / pr_sum if pr_sum else 0.0

        p.append(mean_precision)
        r.append(mean_recall)
        f.append(f1)
        print(p,r,f)

        print("Enlarging training set")
        for tweet in unlabeled_dataset:
            prediction = predict_step(tweet)
            if neg_log(prediction) < threshold:
                print("here")
                # TODO: derive a pseudo-label from `prediction` and add the
                # example to the training set.

In [None]:
# Run the training/validation loop; per-epoch scores are appended to p, r and f.
semi_supervised()

In [None]:
# Mean per-tweet precision and recall on the held-out test set.
# Dedicated names are used so the builtins `sum`/`eval` are not shadowed and
# the per-epoch history lists `p`/`r` filled by semi_supervised() survive.
test_precision_sum, test_recall_sum = 0, 0
for tweet in test_dataset:
    test_scores = eval_step(tweet)
    # model.evaluate returns the loss first, then the compiled metrics
    # (Precision, Recall), so index 0 must be skipped.
    test_precision_sum += test_scores[1]
    test_recall_sum += test_scores[2]

test_count = len(test_dataset)
if test_count:
    print(test_precision_sum / test_count)
    print(test_recall_sum / test_count)
else:
    # Avoid a ZeroDivisionError when the split left no test examples.
    print("test_dataset is empty; nothing to evaluate")