In [1]:
import pandas as pd
import numpy as np
import pickle

import keras
import keras.utils
import spacy

from keras.callbacks import TensorBoard
from keras import optimizers

from keras.models import Sequential
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dense,Flatten,LSTM,Conv1D,GlobalMaxPool1D,Dropout,Bidirectional
from keras.utils import pad_sequences
from keras_preprocessing import sequence

In [4]:
# Load data from csv to pandas dataframe
# train_data = pd.read_csv('processed_train_data.tsv', sep='\t', index_col=0)
# val_data = pd.read_csv('processed_val_data.tsv', sep='\t', index_col=0)
# test_data = pd.read_csv('processed_test_data.tsv', sep='\t', index_col=0)

# APR 5 TODO
- [ ] finalise experiment setup
- [ ] run experiments
- [ ] go back to preprocessing to add dependency parse
- [ ] include dependency parse in experiment setup
- [ ] 2nd run

In [2]:
# Alternative way to load data - pickle to pandas dataframe.
# The three splits are read in train / val / test order from their pickle files.
train_data, val_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'processed_train_data.p',
        'processed_val_data.p',
        'processed_test_data.p',
    )
)

In [5]:
def format_string2np(string_numpy):
    """
    Convert the string representation of an integer array into a numpy array.

    Needed when the dataframe has been round-tripped through a text format,
    where array-valued columns are stored as strings such as "[1, 2, 3]".

    :param string_numpy: {str} string representation of an array, e.g.
        "[3, 5440, 717]". Elements may be separated by commas and/or
        whitespace. A value that is not a string (list, tuple, numpy array)
        is converted directly, so re-running the conversion cell on already
        converted columns does not fail.
    :return: {np.ndarray} 1-D integer array; empty for "[]" or ""
    :raises ValueError: if an element cannot be parsed as an integer
    """
    if not isinstance(string_numpy, str):
        # Already a sequence of ids: just normalise to an integer array.
        return np.asarray(string_numpy).astype(int)

    # Drop surrounding brackets, then treat commas as whitespace so that
    # "[1, 2]", "[1,2]" and "[1 2]" all tokenise the same way. split() with
    # no argument yields [] for an empty string (the original produced ['']
    # and then failed the int conversion).
    tokens = string_numpy.strip().strip('[]').replace(',', ' ').split()
    return np.array(tokens).astype(int)

In [27]:
# Reformat dataframe columns from string to numpy arrays.
# Every id column of every split goes through format_string2np; the frames
# are updated in place, one split at a time.
columns = ['word_id_custom', 'word_id_spacy', 'pos_id', 'pos_id_DEFAULT']

for split_frame in (train_data, val_data, test_data):
    for col in columns:
        split_frame[col] = split_frame[col].apply(format_string2np)

In [3]:
# Load POS dictionaries: the pickle holds a pair (custom tag dict, default tag dict).
with open('pos_dicts.pickle', 'rb') as pos_file:
    pos_dicts = pickle.load(pos_file)
pos_dict_custom, pos_dict_default = pos_dicts

In [5]:
# Load embedding matrixes, grouped by vocabulary (custom vs. spaCy preprocessing).
embedding_matrix_custom_100d = np.load('embedding_matrix_custom_100d.npy')
embedding_matrix_custom_300d = np.load('embedding_matrix_custom_300d.npy')

embedding_matrix_spacy_100d = np.load('embedding_matrix_spacy_100d.npy')
embedding_matrix_spacy_300d = np.load('embedding_matrix_spacy_300d.npy')

# Only two of the four shapes are reported: custom/100d and spaCy/300d.
print(
    "Shape of the embedding matrixes:",
    embedding_matrix_custom_100d.shape,
    embedding_matrix_spacy_300d.shape,
)

Shape of the embedding matrixes: (9607, 100) (9496, 300)


In [6]:
# Load vocabulary dictionaries.
# Context managers close the pickle files as soon as they are read; the
# previous `pickle.load(open(...))` form left both handles open.
with open('vocabulary_statement_custom.p', 'rb') as f:
    vocabulary_dict_custom = pickle.load(f)
with open('vocabulary_statement_spacy.p', 'rb') as f:
    vocabulary_dict_spacy = pickle.load(f)

# Only the custom vocabulary size is kept in `vocab_length`.
vocab_length = len(vocabulary_dict_custom)

print("Length of the vocabulary dictionary:", vocab_length)

Length of the vocabulary dictionary: 9606


In [7]:
# Preview the first rows of the training split to sanity-check the loaded columns.
train_data.head()

Unnamed: 0,index,id,label,statement,subject,speaker,job_title,state_info,party,barely true,...,job_id,state_id,party_id,context_id,pos_id,pos_id_DEFAULT,statement_custom,statement_spacy,word_id_custom,word_id_spacy
0,0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,...,1,1,0,2,"[1, 14, 8, 8, 2, 0, 1, 2, 10, 0, 0, 4, 0, 10]","[16, 5, 11, 11, 0, 7, 16, 0, 12, 7, 7, 1, 7, 12]",say annies list political group support third ...,say annies list political group support trimes...,"[3, 5440, 717, 493, 396, 54, 274, 4039, 155, 1...","[1, 5315, 633, 423, 332, 37, 3919, 120, 936]"
1,1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,...,6,7,1,2,"[7, 1, 14, 0, 4, 0, 0, 10, 5, 1, 7, 2, 0, 1, 4...","[14, 16, 5, 7, 1, 7, 7, 12, 10, 16, 14, 0, 7, ...",when do decline coal start start when natural ...,decline coal start start natural gas take star...,"[37, 9, 804, 861, 308, 308, 37, 981, 254, 39, ...","[720, 773, 249, 249, 891, 204, 46, 249, 527, 1..."
2,2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,...,2,3,1,8,"[8, 8, 1, 4, 8, 8, 10, 4, 1, 12, 1, 8, 8, 14, ...","[11, 11, 16, 1, 11, 11, 12, 1, 16, 9, 16, 11, ...",hillary clinton agree john mccain vote give ge...,hillary clinton agree john mccain vote george ...,"[104, 69, 734, 160, 201, 18, 89, 262, 137, 258...","[74, 49, 649, 125, 157, 12, 212, 103, 208, 274..."
3,3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,...,0,0,2,0,"[0, 0, 0, 0, 11, 2, 12, 1, 2, 0, 0, 0, 10]","[7, 7, 7, 7, 3, 0, 9, 16, 0, 7, 7, 7, 12]",health care reform legislation be likely manda...,health care reform legislation likely mandate ...,"[19, 22, 209, 252, 1, 592, 406, 361, 439, 176,...","[13, 16, 165, 202, 514, 342, 301, 372, 140, 2747]"
4,4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,...,0,2,1,1,"[14, 2, 0, 1, 4, 14, 0, 4, 5, 0, 10]","[5, 0, 7, 16, 1, 5, 7, 1, 10, 7, 12]",economic turnaround start end term,economic turnaround start end term,"[282, 3331, 308, 247, 248]","[224, 3208, 249, 198, 199]"


## To test the different preprocessing methods, the following experiments will be run:
1. siddarth - baseline
   - ogdict
   - nltk stopword removal only

****
****

## todo
****
### INITIALISE INPUT/OUTPUT
1. [x] word_id custom AS X_train_custom
  - [x] embedding_matrix_custom
  - [x] vocabulary_dict_custom
2. [x] word_id spacy AS X_train_spacy
  - [x] embedding_matrix_spacy
  - [x] vocabulary_dict_spacy
  
3. [x] pos_id custom AS X_train_pos_custom
4. [x] pos_id spacy AS X_train_pos_spacy
    
#### Input variables to be processed: 
- [ ] meta
- [ ] dep parse

****
### GENERAL
- [x] change everything to python 3

### variables, init, etc.
- [ ] pass in vocabulary.p
****
### BILSTM MODEL
- [ ] functions: train(), etc
        (CODE CELLS COULD BE BETTER, INVESTIGATE)
- [ ] black box everything to understand la
- [ ]

****
*decide which word id to use*

1. fathan - pos tag check [ ]
    - custom pos tag dict
    - spacy preprocess - spacy word id
    - glove
 2. fathan - preprocess [ ]
    - custom pos tag dict
    - custom preprocess - custom word id
    - glove
    * check best results with default pos tag [ ]
****

# HYPERPARAMS

In [27]:
# 100d GLOVE
EMBED_DIM = 100

# vocab_length = len(vocabulary_dict.keys())
custom_vocabLen = len(vocabulary_dict_custom)
spacy_vocabLen = len(vocabulary_dict_spacy)

# Sequence-model hyper-parameters
hidden_size = EMBED_DIM  # Has to be same as EMBED_DIM
lstm_size = 100
num_steps = 15   # passed as maxlen when padding/truncating the id sequences
num_epochs = 30
batch_size = 40

# Pretrained matrix used for the statement embedding layer
embedding_matrix = embedding_matrix_custom_100d

## TODO: preprocess metadata
# Meta data related hyper params: number of distinct ids seen in the training
# split for each metadata column.
num_party = len(train_data['party_id'].unique())  # APRIL 2 todo 1 - process party id in preprocessing.ipynb
num_state = len(train_data['state_id'].unique())
num_context = len(train_data['context_id'].unique())
num_job = len(train_data['job_id'].unique())
num_sub = len(train_data['subject_id'].unique())
num_speaker = len(train_data['speaker_id'].unique())

for meta_cardinality in (num_party, num_state, num_context, num_job, num_sub, num_speaker):
    print(meta_cardinality)
print(train_data.columns)

# print num_party
# print num_state
# print num_venue
# print num_job
# print num_sub
# print num_speaker
# print train_data.columns

9
30
18
19
25
26
Index(['index', 'id', 'label', 'statement', 'subject', 'speaker', 'job_title',
       'state_info', 'party', 'barely true', 'false', 'half-true',
       'mostly-true', 'pants-on-fire', 'context', 'output', 'subject_id',
       'speaker_id', 'job_id', 'state_id', 'party_id', 'context_id', 'pos_id',
       'pos_id_DEFAULT', 'statement_custom', 'statement_spacy',
       'word_id_custom', 'word_id_spacy'],
      dtype='object')


In [9]:
# Inspect the custom word-id column of the training split.
print(train_data['word_id_custom'])

0        [3, 5440, 717, 493, 396, 54, 274, 4039, 155, 1...
1        [37, 9, 804, 861, 308, 308, 37, 981, 254, 39, ...
2        [104, 69, 734, 160, 201, 18, 89, 262, 137, 258...
3        [19, 22, 209, 252, 1, 592, 406, 361, 439, 176,...
4                               [282, 3331, 308, 247, 248]
                               ...                        
10235    [1, 161, 133, 5206, 314, 84, 86, 1, 408, 212, ...
10236       [157, 2, 52, 283, 213, 464, 1399, 653, 4, 389]
10237    [3, 2024, 154, 110, 1205, 3983, 124, 38, 2, 42...
10238       [1604, 28, 13, 2857, 3115, 4, 153, 1391, 1473]
10239    [216, 1429, 2087, 2, 5142, 826, 494, 327, 441,...
Name: word_id_custom, Length: 10240, dtype: object


# Input/Output Processing

In [10]:
def input_pad_sequences(data, maxlen=None):
    """
    Pad / truncate a collection of id sequences to a fixed length.

    Padding and truncation are both applied at the end of each sequence
    ('post').

    :param data: iterable of id sequences (e.g. a dataframe column of arrays)
    :param maxlen: {int} target sequence length; defaults to the notebook-level
        `num_steps` when omitted, which is what every existing call relies on
    :return: the array returned by `sequence.pad_sequences`
    """
    if maxlen is None:
        # Resolved at call time so the hyper-parameter cell can be changed
        # without redefining this function.
        maxlen = num_steps
    return sequence.pad_sequences(data, maxlen=maxlen, padding='post', truncating='post')

# Pad every id column for the three splits; each line unpacks the results in
# (train, val, test) order.
_splits = (train_data, val_data, test_data)

X_train_custom, X_val_custom, X_test_custom = (
    input_pad_sequences(split['word_id_custom']) for split in _splits
)

X_train_spacy, X_val_spacy, X_test_spacy = (
    input_pad_sequences(split['word_id_spacy']) for split in _splits
)

X_train_pos, X_val_pos, X_test_pos = (
    input_pad_sequences(split['pos_id']) for split in _splits
)

X_train_pos_DEFAULT, X_val_pos_DEFAULT, X_test_pos_DEFAULT = (
    input_pad_sequences(split['pos_id_DEFAULT']) for split in _splits
)

#TODO: preprocess dependency parse
# X_train_dep = input_pad_sequences(train_data['dep_id'])
# X_val_dep = input_pad_sequences(val_data['dep_id'])
# X_test_dep = input_pad_sequences(test_data['dep_id'])

# Meta data preparation: one-hot encode each metadata id column and stack the
# encodings side by side into one feature matrix per split.
def _one_hot_meta(frame):
    """Return the (party, state, context, job, subject, speaker) one-hot matrices for one split."""
    return (
        keras.utils.to_categorical(frame['party_id'], num_classes=num_party),
        keras.utils.to_categorical(frame['state_id'], num_classes=num_state),
        keras.utils.to_categorical(frame['context_id'], num_classes=num_context),
        keras.utils.to_categorical(frame['job_id'], num_classes=num_job),
        keras.utils.to_categorical(frame['subject_id'], num_classes=num_sub),
        keras.utils.to_categorical(frame['speaker_id'], num_classes=num_speaker),
    )

_meta_train = _one_hot_meta(train_data)
party_train, state_train, context_train, job_train, subject_train, speaker_train = _meta_train
#X_train_meta = party_train
X_train_meta = np.hstack(_meta_train)

_meta_val = _one_hot_meta(val_data)
party_val, state_val, context_val, job_val, subject_val, speaker_val = _meta_val
#X_val_meta = party_val
X_val_meta = np.hstack(_meta_val)

_meta_test = _one_hot_meta(test_data)
party_test, state_test, context_test, job_test, subject_test, speaker_test = _meta_test
#X_test_meta = party_test
X_test_meta = np.hstack(_meta_test)

In [13]:
# Inspect the padded custom word-id matrix for the training split.
print(X_train_custom)

[[   3 5440  717 ...    0    0    0]
 [  37    9  804 ...   16  262  517]
 [ 104   69  734 ...    0    0    0]
 ...
 [   3 2024  154 ...   24 1311 1173]
 [1604   28   13 ...    0    0    0]
 [ 216 1429 2087 ... 1195  430  184]]


In [11]:
# Initialise input/output

## OUTPUT
# One-hot encode the 'output' label column of each split into 6 classes.
Y_train = keras.utils.to_categorical(train_data['output'], num_classes=6)
Y_val = keras.utils.to_categorical(val_data['output'], num_classes=6)

In [12]:
# Print each padded test matrix in turn: custom ids, spaCy ids, POS ids, default POS ids.
for test_matrix in (X_test_custom, X_test_spacy, X_test_pos, X_test_pos_DEFAULT):
    print(test_matrix)

[[ 350  425   28 ...    0    0    0]
 [  64    1 1930 ...    0    0    0]
 [   3  160  201 ...    0    0    0]
 ...
 [ 472 1547  191 ...  142  524  372]
 [   3 1406  928 ...   78    0    0]
 [   3   93    1 ...  258   43  107]]
[[ 291  358   19 ...    0    0    0]
 [  45 1829  234 ...    0    0    0]
 [   1  125  157 ...    0    0    0]
 ...
 [ 405 1448  153 ...  106  451  310]
 [   1 1306  836 ...    0    0    0]
 [   1   64   34 ... 1571   89   77]]
[[ 1 14  0 ...  0  0  0]
 [ 8 11  4 ... 10  0  0]
 [ 1  8  8 ...  0  0  0]
 ...
 [ 4 14  2 ...  2  0  1]
 [ 1 14  8 ...  1  5  1]
 [ 1 14  0 ... 14  5 11]]
[[16  5  7 ...  0  0  0]
 [11  3  1 ... 12  0  0]
 [16 11 11 ...  0  0  0]
 ...
 [ 1  5  0 ...  0  7 16]
 [16  5 11 ... 16 10 16]
 [16  5  7 ... 13 10  3]]


In [14]:
# Shape check: each input family is printed as (train, val, test); the last
# line prints the one-hot label shapes for train and val.
print(X_train_custom.shape, X_val_custom.shape, X_test_custom.shape)
print(X_train_spacy.shape, X_val_spacy.shape, X_test_spacy.shape)
print(X_train_pos.shape, X_val_pos.shape, X_test_pos.shape)
print(X_train_pos_DEFAULT.shape, X_val_pos_DEFAULT.shape, X_test_pos_DEFAULT.shape)
print(Y_train.shape, Y_val.shape)

(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)
(10240, 15) (1284, 15) (1267, 15)
(10240, 6) (1284, 6)


#### 1. fathan - pos tag check [ ]
- custom word id [ ]
- custom pos id [ ]
- glove [ ]

In [23]:
# def train(model, name, *args, **kwargs):
#     # sgd = optimizers.SGD(lr=0.025, clipvalue=0.3, nesterov=True)
#     # adam = optimizers.Adam(lr=0.000075, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
#     # model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
#     # tb = TensorBoard()
#     # csv_logger = keras.callbacks.CSVLogger('training.log')
#     # filepath = name + "_weights_best.hdf5"
#     # checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy',
#     #                                               verbose=1, save_best_only=True, mode='max')
#     # inputs = {}
#     # for arg in args:
#     #     inputs.update(arg)
#     # inputs.update(kwargs)
#     #
#     # validation_inputs = {}
#     # for arg in args:
#     #     validation_inputs.update(arg)
#     # validation_inputs.update(kwargs)
#     #
#     # model.fit(
#     #     inputs,
#     #     {'main_output': Y_train},
#     #     epochs=num_epochs,
#     #     batch_size=batch_size,
#     #     validation_data=(
#     #         validation_inputs,
#     #         {'main_output': Y_val}
#     #     ),
#     #     callbacks=[tb, csv_logger, checkpoint]
#     # )
#

In [15]:
def compile_model(model):
    """
    Compiles the given model with SGD optimizer, categorical_crossentropy loss, and categorical_accuracy metrics.

    The model is compiled in place; nothing is returned.

    :param model: {tf.keras.Model} the model to be compiled
    """
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by recent Keras releases.
    # NOTE(review): `momentum` is left at its default, so `nesterov=True`
    # presumably has no effect here - confirm whether a momentum was intended.
    sgd = optimizers.SGD(learning_rate=0.025, clipvalue=0.3, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

def create_callbacks(name):
    """
    Build the callbacks used during training.

    The list contains, in order: a TensorBoard logger, a CSV logger writing to
    'training.log', and a checkpoint that keeps only the weights with the best
    validation categorical accuracy in '<name>_weights_best.hdf5'.

    :param name: {str} model name used as prefix of the best-weights file

    :return: {list} [TensorBoard, CSVLogger, ModelCheckpoint]
    """
    best_weights_path = name + "_weights_best.hdf5"
    return [
        TensorBoard(),
        keras.callbacks.CSVLogger('training.log'),
        keras.callbacks.ModelCheckpoint(
            best_weights_path,
            monitor='val_categorical_accuracy',
            verbose=1,
            save_best_only=True,
            mode='max',
        ),
    ]

def merge_dicts(*args, **kwargs):
    """
    Combine several dictionaries into a single new one.

    Positional dictionaries are applied left to right and the keyword
    arguments last, so on duplicate keys the latest value wins.

    :param *args: {tuple} dictionaries to merge, in order
    :param **kwargs: {dict} extra key/value pairs, applied after *args

    :return: {dict} merged dictionary
    """
    merged = {}
    for mapping in (*args, kwargs):
        merged.update(mapping)
    return merged

def train(model, name, *args, validation_data=None, **kwargs):
    """
    Trains the given model with the provided input arguments and keyword arguments.

    Training targets are the notebook-level `Y_train`; validation targets are
    `Y_val`. The model is compiled with `compile_model` and trained with the
    callbacks from `create_callbacks(name)`.

    :param model: {tf.keras.Model} the model to be trained
    :param name: {str} model name used for saving best weights
    :param *args: {tuple} variable length argument list of training input dictionaries
    :param validation_data: {dict | tuple | list} validation inputs, either one
        dictionary or a sequence of dictionaries that are merged together.
        Previously this keyword was merged into the *training* inputs and the
        training inputs were reused for validation.
    :param **kwargs: {dict} arbitrary keyword arguments for training input dictionaries
    :raises ValueError: if `validation_data` is not provided
    """
    if validation_data is None:
        # The checkpoint callback monitors a validation metric, and validating
        # on the training inputs against Y_val is never what is wanted.
        raise ValueError(
            "train() needs validation_data: a dict (or a tuple of dicts) of validation inputs"
        )

    compile_model(model)
    callbacks = create_callbacks(name)
    inputs = merge_dicts(*args, **kwargs)

    if isinstance(validation_data, dict):
        validation_inputs = merge_dicts(validation_data)
    else:
        validation_inputs = merge_dicts(*validation_data)

    model.fit(
        inputs,
        {'main_output': Y_train},
        epochs=num_epochs,
        batch_size=batch_size,
        validation_data=(
            validation_inputs,
            {'main_output': Y_val}
        ),
        callbacks=callbacks
    )


In [16]:
def load_and_predict(name, inputs):
    """
    Load the best checkpoint saved for a model and run prediction with it.

    :param name: {str} model name; the file '<name>_weights_best.hdf5' is loaded
    :param inputs: {dict} inputs to be passed to the model's predict call

    :return: preds {np.array} array of predicted values
    """
    best_model = load_model(name + "_weights_best.hdf5")
    return best_model.predict(inputs, batch_size=batch_size, verbose=1)

In [17]:
def calculate_accuracy(predictions, Y_test_gt):
    """
    Calculate the accuracy of the predictions

    :param predictions: {np.array} array of predicted values
    :param Y_test_gt: {np.array | list} ground truth values; must have at
        least as many entries as `predictions`

    :return: accuracy {float} fraction of predictions equal to the ground
        truth at the same position; 0.0 when there are no predictions
    """
    total = len(predictions)
    if total == 0:
        # Avoid ZeroDivisionError on an empty prediction set.
        return 0.0
    correct = sum(1 for i in range(total) if predictions[i] == Y_test_gt[i])
    return correct / total

In [18]:
def extract_fw_tb(preds):
    """
    Collect the predictions whose top class is the first or the last label.

    For every row of `preds`, the row is recorded under its position when the
    arg-max class is 0 (stored with the class-0 score) or 5 (stored with the
    class-5 score); all other rows are ignored.

    :param preds: {np.array} array of predicted values, one row per sample

    :return: false_worst: {dict} position -> class-0 score for rows predicted as class 0
    :return: true_best: {dict} position -> class-5 score for rows predicted as class 5
    """
    false_worst, true_best = {}, {}

    for position, scores in enumerate(preds):
        top_class = np.argmax(scores)
        if top_class == 0:
            false_worst[position] = scores[0]
        elif top_class == 5:
            true_best[position] = scores[5]

    return false_worst, true_best

In [19]:
def evaluate(model_name, *args, **kwargs):
    """
    Evaluates the given model with provided input configurations

    Loads '<model_name>_weights_best.hdf5', predicts on the given inputs,
    prints the number of correct predictions and the accuracy against
    `test_data['output']`, and pickles the predicted class ids to
    '<model_name>_predictions.p'.

    :param model_name: {str} used to load model weights
    :param args: input dictionaries (input name -> array), merged left to
        right, OR plain input arrays passed in the model's input order
    :param kwargs: extra named inputs, merged after the dictionaries in args

    :return: false_worst: {dict} of the worst false predictions
    :return: true_best: {dict} of the best true predictions
    :raises ValueError: if named inputs and positional arrays are mixed
    """
    named_inputs = {}
    positional_inputs = []
    for arg in args:
        if isinstance(arg, dict):
            named_inputs.update(arg)
        else:
            # A bare array cannot go through dict.update(); keep it as a
            # positional model input instead.
            positional_inputs.append(arg)
    named_inputs.update(kwargs)

    if named_inputs and positional_inputs:
        raise ValueError(
            "Pass model inputs either as name->array dictionaries/keywords "
            "or as positional arrays, not both."
        )
    inputs = positional_inputs if positional_inputs else named_inputs

    preds = load_and_predict(model_name, inputs)

    Y_test_groundtruth = list(test_data['output'])
    predictions = np.array([np.argmax(pred) for pred in preds])

    accuracy = calculate_accuracy(predictions, Y_test_groundtruth)
    print("Correctly Predicted: ", np.sum(predictions == Y_test_groundtruth), "/", len(Y_test_groundtruth))
    print("Accuracy: ", accuracy)

    false_worst, true_best = extract_fw_tb(preds)

    # Close the predictions file deterministically instead of leaking the handle.
    with open(model_name + "_predictions.p", "wb") as predictions_file:
        pickle.dump(predictions, predictions_file)
    return false_worst, true_best


In [None]:
# def evaluate(name, *args, **kwargs):
#     model = load_model(name + "_weights_best.hdf5")
#     inputs = {}
#     for arg in args:
#         inputs.update(arg)
#     inputs.update(kwargs)
#
#     preds = model.predict(inputs,
#                           batch_size=batch_size,
#                           verbose=1)
#
#     false_worst = {}
#     true_best = {}
#
#     Y_test_gt = list(test_data['output'])
#     predictions = np.array([np.argmax(pred) for pred in preds])
#
#     for p in range(len(preds)):
#         if np.argmax(preds[p])==0:
#             false_worst[p]=preds[p][0]
#         elif np.argmax(preds[p])==5:
#             true_best[p]=preds[p][5]
#
#     # print(len(predictions))==len(Y_test_gt)
#     correct = np.sum(predictions == Y_test_gt)
#     print("Correctly Predicted: ", correct,"/",len(Y_test_gt))
#     print("Accuracy: ", correct*100.0/len(Y_test_gt))
#
#     pickle.dump(predictions, open(name+"_predictions.p", "wb"))
#     return false_worst, true_best

In [None]:
# Test-input combinations per experiment, in the order [word ids, POS ids].
# NOTE(review): "fathan_3" and "fathan_4" list identical inputs - confirm
# whether "fathan_4" was meant to use a different combination.
configurations = [
    {"name": "fathan_1", "inputs": [X_test_custom, X_test_pos]},
    {"name": "fathan_2", "inputs": [X_test_spacy, X_test_pos]},
    {"name": "fathan_3", "inputs": [X_test_custom, X_test_pos_DEFAULT]},
    {"name": "fathan_4", "inputs": [X_test_custom, X_test_pos_DEFAULT]},
    # {"name": "fathan_5", "inputs": [X_test_best, X_test_pos_best, X_test_meta]},
    # {"name": "fathan_6", "inputs": [X_test_best, X_test_pos_best, X_test_meta, X_test_dep]},
]



In [None]:
# model_name = "bilstm"
#
# for config in configurations:
#     print(f"Running {config['name']}...")
#     (fw, tb) = evaluate(model_name, *config["inputs"])


In [None]:
# evaluate() merges its positional arguments as dictionaries, so the test
# arrays are passed keyed by the model's input-layer names rather than as
# bare arrays.
(fw, tb) = evaluate('bilstm',
                    {'main_input': X_test_custom,
                     'pos_input': X_test_pos,
                     # 'dep_input': X_test_dep,
                     # 'aux_input': X_test_meta
                     })

In [20]:
import operator

def print_best_false_true_predicted(fw, tb):
    """
    Print the five highest-scoring rows of each prediction dictionary.

    For both dictionaries the entries are sorted by score (descending) and,
    for the top five, the score and the matching `test_data` row are printed.

    :param fw: {dict} row position -> score, printed under "False statements"
    :param tb: {dict} row position -> score, printed under "True Statements"
    """
    def _print_top(scores):
        """Print score and test row for the five largest scores."""
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        for position, score in ranked[:5]:
            print(score)
            # Keys are row positions in the prediction array, so use
            # positional lookup; `.loc` only matched while the index labels
            # happened to equal 0..n-1.
            print(test_data.iloc[position])
            print('=============')

    print('*****************************************************************')
    print('******************** False statements *************************')
    _print_top(fw)
    print('*****************************************************************')
    print('******************** True Statements *************************')
    _print_top(tb)

In [None]:
# BILSTM model
# Stand-alone Sequential baseline: embedding -> bidirectional LSTM -> 6-way softmax.
# NOTE(review): `model_bilstm` is reassigned when the functional model is
# assembled in a later cell, so this baseline looks unused - confirm.
model_bilstm = Sequential()
model_bilstm.add(Embedding(vocab_length+1, hidden_size, input_length=num_steps))
model_bilstm.add(Bidirectional(LSTM(hidden_size)))
model_bilstm.add(Dense(6, activation='softmax'))

# POS tags are embedded with a frozen identity matrix, i.e. one-hot vectors.
# NOTE(review): the matrix has max(pos_dict.values()) rows; if the POS ids run
# from 0 up to that maximum inclusive, the largest id has no row (would need
# max + 1) - confirm against the contents of pos_dicts.pickle.
pos_dict = pos_dict_custom
pos_embeddings = np.identity(max(pos_dict.values()), dtype=int)

# statement embed biLSTM
# Frozen pretrained embedding followed by an LSTM over the statement word ids.
# NOTE(review): despite the "biLSTM" naming, this LSTM is not wrapped in
# Bidirectional - confirm which is intended.
statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
x = Embedding(vocab_length+1,EMBED_DIM,weights=[embedding_matrix],input_length=num_steps,trainable=False)(statement_input)
bilstm_word_input = LSTM(lstm_size, dropout=0.2)(x)

# pos embed biLSTM
# Same structure for the POS id sequence, using the identity embedding above.
pos_input = Input(shape=(num_steps,), dtype='int32', name='pos_input')
x2 = Embedding(max(pos_dict.values()), max(pos_dict.values()), weights=[pos_embeddings], input_length=num_steps, trainable=False)(pos_input)
bilstm_pos_in = LSTM(lstm_size, dropout=0.2)(x2)


# todo: DEP AND META

# # dep embed LSTM X3
# dep_input = Input(shape=(num_steps,), dtype='int32', name='dep_input')
# x3 = Embedding(max(dep_dict.values()), max(dep_dict.values()), weights=[dep_embeddings], input_length=num_steps, trainable=False)(dep_input)
# lstm_in3 = LSTM(lstm_size, dropout=0.2)(x3)

# meta data Dense
# Dense projection of the stacked one-hot metadata features (width taken from X_train_meta).
meta_input = Input(shape=(X_train_meta.shape[1],), name='aux_input')
x_meta = Dense(64, activation='relu')(meta_input)

In [None]:
def concat_lstm_layers(bilstm_input, layers_to_concat):
    """
    Concatenate the word branch with any additional feature branches.

    :param bilstm_input: output tensor of the statement (word) branch
    :param layers_to_concat: {list | tuple} output tensors of the extra
        branches; may be empty
    :return: the concatenated tensor, or `bilstm_input` itself when there is
        nothing to concatenate it with
    """
    layers = [bilstm_input] + list(layers_to_concat)
    if len(layers) == 1:
        # Word-only experiment: skip the concatenate call for a single tensor,
        # which is at best a no-op layer.
        return bilstm_input
    return keras.layers.concatenate(layers)

# Branches concatenated with the word branch; only the POS branch is enabled.
extra_layers = [bilstm_pos_in] #, lstm_in3, x_meta]
# exp1_layers = [bilstm_pos_in, lstm_in3, x_meta]
# exp2_layers = [bilstm_pos_in, x_meta]
# Note: `x` is rebound here to the concatenated feature tensor.
x = concat_lstm_layers(bilstm_word_input, extra_layers)

In [None]:
# Classification head: 6-way softmax over the concatenated branch features.
main_output = Dense(6, activation='softmax', name='main_output')(x)

In [None]:
# model_lstm = Model(inputs=[statement_input, pos_input, dep_input, meta_input], outputs=[main_output])
# model_lstm = Model(inputs=[statement_input, pos_input, meta_input], outputs=[main_output])
# model_lstm = Model(inputs=[statement_input, dep_input, meta_input], outputs=[main_output])
# model_lstm = Model(inputs=[statement_input, meta_input], outputs=[main_output])

# Candidate input sets for the planned experiments. The dependency-parse
# branch (`dep_input`) is still commented out in the branch-building cell, so
# the sets that reference it are disabled here - they raised NameError.
# experiment1input = [statement_input, pos_input, dep_input, meta_input]
experiment2input = [statement_input, pos_input, meta_input]
# experiment3input = [statement_input, dep_input, meta_input]

# The model inputs have to be the inputs that actually feed `main_output`:
# the word branch plus the branches listed in `extra_layers` (currently POS).
current_experiment_input = [statement_input, pos_input]

model_bilstm = Model(inputs=current_experiment_input, outputs=[main_output])

In [None]:
# Train on the custom word ids first, then on the spaCy word ids, with the
# same POS inputs both times.
# NOTE(review): both runs reuse the same `model_bilstm` object and the same
# checkpoint name 'bilstm', so the second run continues from the first run's
# weights and writes to the same best-weights file - confirm this is intended.
for X_train_words, X_val_words in (
    (X_train_custom, X_val_custom),
    (X_train_spacy, X_val_spacy),
):
    train(
        model_bilstm,
        'bilstm',
        {'main_input': X_train_words, 'pos_input': X_train_pos},
        # {'aux_input': X_train_meta},
        validation_data=(
            {'main_input': X_val_words, 'pos_input': X_val_pos},
            # {'aux_input': X_val_meta}
        ),
    )

In [None]:
# def train(model, name, *args, use_pos=False, use_meta=False, use_dep=False, **kwargs):
#     sgd = optimizers.SGD(lr=0.025, clipvalue=0.3, nesterov=True)
#     adam = optimizers.Adam(lr=0.000075, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
#     model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
#     tb = TensorBoard()
#     csv_logger = keras.callbacks.CSVLogger('training.log')
#     filepath = name + "_weights_best.hdf5"
#     checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy',
#                                                   verbose=1, save_best_only=True, mode='max')
#
#     inputs = {}
#     for arg in args:
#         inputs.update(arg)
#
#     inputs.update(kwargs)
#
#     model.fit(
#         inputs,
#         {'main_output': Y_train}, epochs=num_epochs, batch_size=batch_size,
#         validation_data=(
#             inputs,
#             {'main_output': Y_val}
#         ), callbacks=[tb, csv_logger, checkpoint]
#     )

In [None]:
# train(model, name, {'main_input': X_train, 'pos_input': X_train_pos}, {'aux_input': X_train_meta})
# train(model, name, main_input=X_train, pos_input=X_train_pos, aux_input=X_train_meta)

In [None]:
# def train(model, name, use_pos=False, use_meta=False, use_dep=False):
#     sgd = optimizers.SGD(lr=0.025, clipvalue=0.3, nesterov=True)
#     adam = optimizers.Adam(lr=0.000075, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
#     model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['categorical_accuracy'])
#     tb = TensorBoard()
#     csv_logger = ker
#     as.callbacks.CSVLogger('training.log')
#     filepath = name + "_weights_best.hdf5"
#     checkpoint = keras.callbacks.ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')
#
#     if use_pos and use_meta:
#         if use_dep:
#             model.fit(
#                 {'main_input': X_train, 'pos_input': X_train_pos, 'aux_input': X_train_meta, 'dep_input': X_train_dep},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'pos_input': X_val_pos, 'aux_input': X_val_meta, 'dep_input': X_val_dep},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#         else:
#             model.fit(
#                 {'main_input': X_train, 'pos_input': X_train_pos, 'aux_input': X_train_meta},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'pos_input': X_val_pos, 'aux_input': X_val_meta},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#     elif use_meta:
#         if use_dep:
#             model.fit(
#                 {'main_input': X_train, 'aux_input': X_train_meta, 'dep_input': X_train_dep},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'aux_input': X_val_meta, 'dep_input': X_val_dep},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#         else:
#             model.fit(
#                 {'main_input': X_train, 'aux_input': X_train_meta},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'aux_input': X_val_meta},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#     elif use_pos:
#         if use_dep:
#             model.fit(
#                 {'main_input': X_train, 'pos_input': X_train_pos, 'dep_input': X_train_dep},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'pos_input': X_val_pos, 'dep_input': X_val_dep},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#         else:
#             model.fit(
#                 {'main_input': X_train, 'pos_input': X_train_pos},
#                 {'main_output': Y_train},
#                 epochs=num_epochs,
#                 batch_size=batch_size,
#                 validation_data=(
#                     {'main_input': X_val, 'pos_input': X_val_pos},
#                     {'main_output': Y_val}),
#                 callbacks=[tb, csv_logger, checkpoint])
#     else:
#         if use_dep:
#             model.fit(
#             {'main_input': X_train,'dep_input':X_train_dep},
#             {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
#             validation_data = (
#                                 {'main_input': X_val, 'dep_input':X_val_dep},
#                                 {'main_output': Y_val}
#                                 ), callbacks=[tb,csv_logger,checkpoint])
#     else:
#       model.fit(
#         {'main_input': X_train},
#         {'main_output': Y_train}, epochs = num_epochs, batch_size = batch_size,
#         validation_data = (
#             {'main_input': X_val},
#             {'main_output': Y_val}
#         ), callbacks=[tb,csv_logger,checkpoint])
#

In [None]:
# use_pos=False
# use_meta=True
# use_dep=True
#
# # LSTM model
# model_lstm = Sequential()
# model_lstm.add(Embedding(vocab_length+1, hidden_size, input_length=num_steps))
# model_lstm.add(Bidirectional(LSTM(hidden_size)))
# model_lstm.add(Dense(6, activation='softmax'))
#
#
# # statement embed LSTM
# statement_input = Input(shape=(num_steps,), dtype='int32', name='main_input')
# x = Embedding(vocab_length+1,EMBED_DIM,weights=[embedding_matrix],input_length=num_steps,trainable=False)(statement_input)
# lstm_in = LSTM(lstm_size,dropout=0.2)(x)
#
#
#
# # pos embed LSTM
# pos_input = Input(shape=(num_steps,), dtype='int32', name='pos_input')
# x2 = Embedding(max(pos_dict.values()), max(pos_dict.values()), weights=[pos_embeddings], input_length=num_steps, trainable=False)(pos_input)
# lstm_in2 = LSTM(lstm_size, dropout=0.2)(x2)
#
#
# # dep embed LSTM
# dep_input = Input(shape=(num_steps,), dtype='int32', name='dep_input')
# x3 = Embedding(max(dep_dict.values()), max(dep_dict.values()), weights=[dep_embeddings], input_length=num_steps, trainable=False)(dep_input)
# lstm_in3 = LSTM(lstm_size, dropout=0.2)(x3)
#
#
# # meta data Dense
# meta_input = Input(shape=(X_train_meta.shape[1],), name='aux_input')
# x_meta = Dense(64, activation='relu')(meta_input)
#
#
# if use_pos and use_meta:
#   if use_dep:
#     x = keras.layers.concatenate([lstm_in, lstm_in2, lstm_in3, x_meta])
#   else:
#     x = keras.layers.concatenate([lstm_in, lstm_in2, x_meta])
# elif use_meta:
#   if use_dep:
#     x = keras.layers.concatenate([lstm_in, lstm_in3, x_meta])
#   else:
#     x = keras.layers.concatenate([lstm_in, x_meta])
# elif use_pos:
#   if use_dep:
#     x = keras.layers.concatenate([lstm_in, lstm_in3, lstm_in2])
#   else:
#     x = keras.layers.concatenate([lstm_in, lstm_in2])
# else:
#   if use_dep:
#     x = keras.layers.concatenate([lstm_in, lstm_in3])
#   else:
#     x = lstm_in
#
#
#
#
# main_output = Dense(6, activation='softmax', name='main_output')(x)
#
# if use_pos and use_meta:
#   if use_dep:
#     model_lstm = Model(inputs=[statement_input, pos_input, dep_input, meta_input], outputs=[main_output])
#   else:
#     model_lstm = Model(inputs=[statement_input, pos_input, meta_input], outputs=[main_output])
# elif use_meta:
#   if use_dep:
#     model_lstm = Model(inputs=[statement_input, dep_input, meta_input], outputs=[main_output])
#   else:
#     model_lstm = Model(inputs=[statement_input, meta_input], outputs=[main_output])
# elif use_pos:
#   if use_dep:
#     model_lstm = Model(inputs=[statement_input, dep_input, pos_input], outputs=[main_output])
#   else:
#     model_lstm = Model(inputs=[statement_input, pos_input], outputs=[main_output])
# else:
#   if use_dep:
#     model_lstm = Model(inputs=[statement_input, dep_input], outputs=[main_output])
#   else:
#     model_lstm = Model(inputs=[statement_input], outputs=[main_output])

In [None]:
# print model_lstm.summary()