In [1]:
from google.colab import drive
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers import Embedding, LSTM
from keras.initializers import Constant
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import TextVectorization
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dropout
import numpy as np
import tensorflow as tf
import random
import csv





In [2]:
# Mount Google Drive so the embedding and data files below can be opened
drive.mount('/content/gdrive')


# paths
# Pre-trained word-embedding text file, consumed by read_embeddings
embeddings_file = '/content/gdrive/MyDrive/embeddings/glove.6B.100d.txt'

# "OD" train / dev / test splits (presumably out-of-domain — TODO confirm)
# NOTE(review): the test split is 'test.csv' while every other split follows
# the '<split>OD.csv' / '<split>ID.csv' pattern — confirm this is the intended file.
train_file_OD = '/content/gdrive/MyDrive/LFD_DATA/trainOD.csv'
dev_file_OD = '/content/gdrive/MyDrive/LFD_DATA/devOD.csv'
test_file_OD = '/content/gdrive/MyDrive/LFD_DATA/test.csv'

# "ID" train / dev / test splits (presumably in-domain — TODO confirm)
train_file_ID = '/content/gdrive/MyDrive/LFD_DATA/trainID.csv'
dev_file_ID = '/content/gdrive/MyDrive/LFD_DATA/devID.csv'
test_file_ID = '/content/gdrive/MyDrive/LFD_DATA/testID.csv'


Mounted at /content/gdrive


In [3]:
def split_data(X_full, Y_full, test_percentage, shuffle):
    '''Cut instances and labels into a leading train part and a trailing test part.

    The cut index is int((1.0 - test_percentage) * len(X_full)) and is computed
    before any shuffling. When shuffle is truthy, both sequences are first
    permuted together via shuffle_dependent_lists so every instance keeps its label.

    Returns X_train, Y_train, X_test, Y_test (in that order).
    '''
    cut = int((1.0 - test_percentage) * len(X_full))
    if shuffle:
        # Same permutation for both sequences, so instance/label pairs stay aligned
        X_full, Y_full = shuffle_dependent_lists(X_full, Y_full)
    # Everything before the cut is training material, the remainder is test material
    train_part = slice(None, cut)
    test_part = slice(cut, None)
    return X_full[train_part], Y_full[train_part], X_full[test_part], Y_full[test_part]

In [4]:
def read_csv(filepath):
  '''Read a comma-separated file and return its first two columns.

  Column 0 of every row is collected as the document, column 1 as the label.
  Completely empty rows (blank lines) are skipped.

  Returns a (documents, labels) pair of equally long lists.
  Raises IndexError if a non-empty row has fewer than two columns.
  '''
  documents = []
  labels = []
  # encoding='utf-8' matches read_corpus and avoids the platform-dependent default;
  # newline='' lets the csv module itself handle line breaks inside quoted fields
  with open(filepath, encoding='utf-8', newline='') as csvfile:
    for row in csv.reader(csvfile, delimiter=','):
      if not row:
        # csv.reader yields [] for a blank line; there is nothing to index
        continue
      documents.append(row[0])
      labels.append(row[1])
  return documents, labels


In [5]:
def shuffle_dependent_lists(l1, l2):
    '''Shuffle two lists with one shared permutation so paired items stay together.

    Items are paired by position with zip, so if the inputs differ in length the
    surplus items of the longer one are dropped.

    Returns a 2-tuple (shuffled_l1, shuffled_l2) whose members are tuples.
    Empty input gives two empty tuples, so `a, b = shuffle_dependent_lists(...)`
    works for every input.
    '''
    paired = list(zip(l1, l2))
    if not paired:
        # zip(*[]) yields nothing, which would break two-name unpacking at the call site
        return (), ()
    # Seed the random generator so results are consistent between runs
    random.Random(123).shuffle(paired)
    # Materialise the result instead of handing back a one-shot zip iterator
    return tuple(zip(*paired))

In [6]:
def read_embeddings(embeddings_file):
    '''Read word embeddings from a text file into a dict of numeric numpy vectors.

    Every non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are the vector components.

    Returns a dict mapping word -> 1-D float32 numpy array.
    Raises ValueError if a component cannot be parsed as a number.
    '''
    embeddingsdict = dict()
    # Explicit encoding so the result does not depend on the platform default
    with open(embeddings_file, 'r', encoding='utf-8') as embeddings:
        for line in embeddings:
            fields = line.split()
            if not fields:
                # Blank line: no word to index
                continue
            # Convert to floats here; np.array on the raw tokens would store strings
            embeddingsdict[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddingsdict

In [7]:
def read_corpus(corpus_file):
    '''Read a comma-separated corpus file and return cleaned documents and labels.

    For every non-empty row the first column is the document and the last column
    is the label. A fixed set of noise substrings is removed from each document.

    Returns a (documents, labels) pair of equally long lists.
    '''
    # Substrings stripped from every document, applied in this order.
    # 'â€¢' must come before its prefix 'â€': removing the prefix first would
    # leave a stray '¢' behind and the longer sequence could never match.
    noise = ('timescontent.com', 'MATP', 'Reprint', 'â€¢', 'â€', 'Â', '™️', 'Herald')
    documents = []
    labels = []
    # newline='' lets the csv module itself handle line breaks inside quoted fields
    with open(corpus_file, encoding='utf-8', newline='') as f:
        for line in csv.reader(f, delimiter=','):
            if not line:
                # csv.reader yields [] for a blank line; there is nothing to index
                continue
            text = line[0]
            for fragment in noise:
                text = text.replace(fragment, '')
            documents.append(text)
            labels.append(line[-1])
    return documents, labels

In [8]:
def get_emb_matrix(voc, emb):
    '''Build the embedding matrix for a vocabulary from a word -> vector mapping.

    Row i holds the vector stored in emb for voc[i]; words that are missing from
    emb keep an all-zero row. The matrix has len(voc) + 2 rows, so the last two
    rows are always zero.

    Returns a numpy array of shape (len(voc) + 2, embedding_dim).
    Raises ValueError if emb is empty, because the dimension cannot be inferred.
    '''
    if not emb:
        raise ValueError('emb must contain at least one embedding vector')
    num_tokens = len(voc) + 2
    word_index = dict(zip(voc, range(len(voc))))
    # Infer the dimension from any stored vector rather than requiring the word "the"
    embedding_dim = len(next(iter(emb.values())))
    # Prepare embedding matrix to the correct size
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = emb.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        # else: the row stays all-zero for words without a pretrained vector
    # Final matrix with pretrained embeddings that we can feed to embedding layer
    return embedding_matrix

In [9]:
def create_model(Y_train, emb_matrix):
    '''Build and compile the LSTM classifier on top of frozen pre-trained embeddings.

    Layers: Embedding (initialised from emb_matrix, not trainable) -> LSTM with
    64 units -> Dense with a single sigmoid unit. The model is compiled with
    binary cross-entropy loss, the "adam" optimizer and the accuracy metric.

    Args:
        Y_train: not used by the model definition; kept so existing callers
            keep working.
        emb_matrix: 2-D embedding matrix with one row per token index.

    Returns the compiled Sequential model.
    '''
    loss_function = 'binary_crossentropy'
    # Take embedding dim and size from emb_matrix
    embedding_dim = len(emb_matrix[0])
    num_tokens = len(emb_matrix)
    # Now build the model
    model = Sequential()
    model.add(Embedding(num_tokens, embedding_dim, embeddings_initializer=Constant(emb_matrix), trainable=False))
    model.add(LSTM(units=64))
    # End with a single sigmoid unit: one probability per document
    model.add(Dense(1, activation='sigmoid'))
    # Compile model using our settings, check for accuracy.
    # The previous SGD instance was built but never handed to compile, so it was dropped.
    model.compile(loss=loss_function, optimizer="adam", metrics=['accuracy'])
    return model

In [10]:
def train_model(model, X_train, Y_train, X_dev, Y_dev):
    '''Fit the model on the training data, validating on the dev data each epoch.

    Training runs for at most 50 epochs with batch size 16 and stops early once
    the validation loss has not improved for 5 consecutive epochs.

    Returns the same model object after fitting.
    '''
    # Potentially change these to cmd line args again
    verbose = 1
    batch_size = 16
    epochs = 50
    # Early stopping: stop training after 5 consecutive epochs without val_loss improvement
    # It's also possible to monitor the training loss with monitor="loss"
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    # Fit the model; the callback must be passed to fit, otherwise it has no effect
    model.fit(
        X_train,
        Y_train,
        verbose=verbose,
        epochs=epochs,
        callbacks=[callback],
        batch_size=batch_size,
        validation_data=(X_dev, Y_dev),
    )
    return model

In [11]:
def test_set_predict(model, X_test, Y_test, ident, labels):
    '''Do predictions and measure accuracy on our own test set (that we split off train)'''
    # Get predictions using the trained model
    Y_pred = model.predict(X_test)
    class_one = Y_pred > 0.5
    digits = 3

    ac = accuracy_score(Y_test, class_one)
    pr = precision_score(Y_test, class_one, average='macro', zero_division=0)
    re = recall_score(Y_test, class_one, average='macro', zero_division=0)
    f1 = f1_score(Y_test, class_one, average='macro', zero_division=0)
    
    msg = f'''
    {classification_report(Y_test, class_one, digits=digits, zero_division=0, target_names=labels)}
    Accuracy:   {round(ac, digits)}
    Precision:  {round(pr, digits)}
    Recall:     {round(re, digits)}
    F-score:    {round(f1, digits)}
    '''
    return msg

In [12]:
# Load the pre-trained word vectors (word -> vector dict)
embeddings = read_embeddings(embeddings_file)

# Read documents and labels for each split.
# NOTE(review): training data comes from the OD file while dev and test come
# from the ID files — confirm this mix of splits is intended.
X_train, Y_train = read_corpus(train_file_OD)
X_dev, Y_dev = read_corpus(dev_file_ID)
X_test, Y_test = read_corpus(test_file_ID)


# Earlier setting with shorter sequences, kept for reference:
#vectorizer = TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=100)
# Text is lower-cased, stripped of punctuation, and mapped to 1000 token ids per document
vectorizer = TextVectorization(standardize="lower_and_strip_punctuation", output_sequence_length=1000)
# NOTE(review): the vocabulary is adapted on train + dev + test text, so test
# documents influence the vocabulary — confirm this is acceptable for the evaluation.
text_ds = tf.data.Dataset.from_tensor_slices(X_train + X_dev + X_test)
vectorizer.adapt(text_ds)
# Vocabulary list; its positions are used as row indices by get_emb_matrix
voc = vectorizer.get_vocabulary()




In [13]:
# Embedding matrix whose rows follow the vectorizer vocabulary order
emb_matrix = get_emb_matrix(voc, embeddings)
# Fit the label encoding on the training labels only ...
encoder = LabelBinarizer()
Y_train_bin = encoder.fit_transform(Y_train)
# ... and reuse that fitted mapping for dev. Refitting on the dev labels could
# assign different codes whenever the dev label set differs from the training one.
Y_dev_bin = encoder.transform(Y_dev)
# Turn each document into a fixed-length sequence of token ids
X_train_vect = vectorizer(np.array([[s] for s in X_train])).numpy()
X_dev_vect = vectorizer(np.array([[s] for s in X_dev])).numpy()


In [None]:
# Build the compiled network from the embedding matrix
model = create_model(Y_train=Y_train, emb_matrix=emb_matrix)
# Fit on the vectorised training data, validating on the dev data
model = train_model(
    model=model,
    X_train=X_train_vect,
    Y_train=Y_train_bin,
    X_dev=X_dev_vect,
    Y_dev=Y_dev_bin,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Encode the test labels with the encoder as it was already fitted; refitting on
# the test labels could change the label-to-code mapping the model was trained with
Y_test_bin = encoder.transform(Y_test)
X_test_vect = vectorizer(np.array([[s] for s in X_test])).numpy()

# scores[1] is the accuracy metric the model was compiled with
scores = model.evaluate(X_test_vect, Y_test_bin, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

# Detailed report, with the encoder's classes used as target names
results = test_set_predict(model, X_test_vect, Y_test_bin, "test", labels=encoder.classes_)
print(results)