In [15]:
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score

from keras.preprocessing.text import text_to_word_sequence

from keras.layers import Embedding, Input, Dense, Dropout, Concatenate, Activation, Conv1D, GlobalMaxPooling1D
from keras.models import Model
from functions import balance_data, tokenize_and_transform, embed_matrix

In [27]:
#Params

num_words = 100000

max_len = 10514

dim = 200

#Either 'glove' or 'w2v'
embedding_type = 'w2v'

In [20]:
with open('data/task-1/task1.train.txt') as f:
        lines = f.readlines()
        
docs = []
labels = []
for line in lines:
    line = re.split(r'\t+', line)
    docs.append(line[0])
    label = 0
    if line[2].strip() == 'non-propaganda':
        label = 0
    else:
        label = 1
    labels.append(label)
    
docs, labels = balance_data(docs, labels)
  
docs = [text_to_word_sequence(doc) for doc in docs]
lens = [len(doc) for doc in docs]

In [21]:
data, word_index = tokenize_and_transform(docs, num_words, max_len)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.15)

In [28]:
#Prep NN Input

matrix = embed_matrix(embedding_type, word_index, dim)

seq_input = Input(shape = (max_len,), dtype = 'int32')

embedding = Embedding(len(word_index) + 1, 
                      dim, 
                      weights = [matrix], 
                      input_length = max_len, 
                      trainable = True)

embedded_seq = embedding(seq_input)

Generating W2V Embedding


In [29]:
x = Conv1D(filters = 200, 
            kernel_size = 2, 
            padding = 'valid', 
            activation = 'relu', 
            strides = 1)(embedded_seq)

x = GlobalMaxPooling1D()(x)

x = Dropout(0.5)(x)

x = Dense(64, activation = 'relu')(x)

preds = Dense(1, activation = 'sigmoid',
              kernel_initializer = 'normal')(x)

model = Model(seq_input, preds)

model.compile(loss = 'binary_crossentropy', 
               optimizer = 'adam', 
               metrics = ['acc'])

history = model.fit(X_train, y_train, 
           epochs = 2, 
           validation_split = 0.3,
           shuffle = True, 
           batch_size = 50, 
           verbose = 1)

Train on 4760 samples, validate on 2040 samples
Epoch 1/2
Epoch 2/2


In [31]:
bigram = Conv1D(filters = 200, 
                kernel_size = 2, 
                padding = 'valid', 
                activation = 'relu', 
                strides = 1)(embedded_seq)

bigram = GlobalMaxPooling1D()(bigram)

trigram = Conv1D(filters = 200, 
                 kernel_size = 3, 
                 padding = 'valid', 
                 activation = 'relu', 
                 strides = 1)(embedded_seq)

trigram = GlobalMaxPooling1D()(trigram)

x_combo = Concatenate(axis = 1)([bigram, trigram])

x_combo = Dense(64, activation = 'relu')(x_combo)

x_combo = Dropout(0.5)(x_combo)

preds_combo = Dense(1, activation = 'sigmoid',
              kernel_initializer = 'normal')(x_combo)

model_combo = Model(seq_input, preds_combo)

model_combo.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam', 
              metrics = ['acc'])

combo_history = model_combo.fit(X_train, y_train, 
          epochs = 2, 
          validation_data = (X_test, y_test), 
          shuffle = True, 
          batch_size = 50, 
          verbose = 1)

Train on 6800 samples, validate on 1200 samples
Epoch 1/2
Epoch 2/2


In [30]:
y_pred = model.predict(X_test)

y_pred = [0 if y < 0.5 else 1 for y in y_pred]

print("Simple Model:")
print("Precision: %s" % precision_score(y_test, y_pred))
print("Recall: %s" % recall_score(y_test, y_pred))
print("F1 Score: %s" % f1_score(y_test, y_pred))

Simple Model:
Precision: 0.777947932618683
Recall: 0.8424543946932007
F1 Score: 0.8089171974522293


In [32]:
y_pred = model_combo.predict(X_test)

y_pred = [0 if y < 0.5 else 1 for y in y_pred]

print("Complex Model:")
print("Precision: %s" % precision_score(y_test, y_pred))
print("Recall: %s" % recall_score(y_test, y_pred))
print("F1 Score: %s" % f1_score(y_test, y_pred))

Complex Model:
Precision: 0.8833333333333333
Recall: 0.87893864013267
F1 Score: 0.8811305070656691
