In [744]:
import pandas as pd
import math
import keras
from keras.layers import Dense,Embedding, Flatten, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dropout,GaussianNoise
from keras.preprocessing.text import text_to_word_sequence,Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [745]:
from pymagnitude import MagnitudeUtils, Magnitude

In [746]:
# --- Hyper-parameters -------------------------------------------------------
# Sequence length handed to Magnitude (pad_to_length) and to the model input.
MAX_WORDS = 15
# Standard deviation of the Gaussian noise layer applied to the embeddings.
STD_DEV = 0.01
# Intended LSTM width.
# NOTE(review): the model cell in this notebook hard-codes 300 units and does
# not read this constant - confirm which value is intended.
HIDDEN_UNITS = 100
# Fraction of activations dropped by the Dropout layer.
DROPOUT_RATIO = 0.5
# Number of examples per training step.
BATCH_SIZE = 128
# Number of passes over the training data.
# NOTE(review): reassigned to 16 in the training cell before it is used.
EPOCHS = 200
# Optimizer learning rate.
# NOTE(review): the optimizer is selected by name ('RMSProp') in the model
# cell, so this value is not applied there - confirm.
LEARNING_RATE = 0.01

In [747]:
# Pre-trained word vectors; every queried sequence is padded to MAX_WORDS.
vectors = Magnitude(
    "./wiki-news-300d-1M.Magnitude",
    pad_to_length=MAX_WORDS,
)

In [748]:
# Load the clue dataset, then discard exact duplicate rows.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted source.
df = pd.read_pickle(
    "./cryptic_dataset/combined_fifteen_times_final_filtered.pickle"
)
df = df.drop_duplicates()

In [749]:
# Every wordplay-type indicator column used to partition the dataset.
# Assumes these columns are boolean (the original filters applied `~` and `&`
# to them directly).
_CC_FLAG_COLUMNS = [
    'is_anagram', 'is_homophone', 'is_double', 'is_cryptic', 'is_contain',
    'is_reverse', 'is_alternate', 'is_init', 'is_delete', 'is_charade',
    'is_&lit', 'is_hidden', 'is_spoonerism', 'is_palindrome',
]


def select_single_type(frame, flag):
    """Return the rows of `frame` tagged with `flag` and with no other type.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column listed in ``_CC_FLAG_COLUMNS``.
    flag : str
        The one indicator column that must be True.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so that adding columns to
        it later does not trigger SettingWithCopyWarning.
    """
    other_flags = [column for column in _CC_FLAG_COLUMNS if column != flag]
    # flag AND NOT(any other flag) == flag AND NOT a AND NOT b ... (De Morgan)
    only_this_type = frame[flag] & ~frame[other_flags].any(axis=1)
    return frame[only_this_type].copy()


# One frame per wordplay type, each holding clues of exactly that type.
anagram_df = select_single_type(df, 'is_anagram')
homophone_df = select_single_type(df, 'is_homophone')
double_df = select_single_type(df, 'is_double')
cryptic_df = select_single_type(df, 'is_cryptic')
contain_df = select_single_type(df, 'is_contain')
reverse_df = select_single_type(df, 'is_reverse')
alternate_df = select_single_type(df, 'is_alternate')
init_df = select_single_type(df, 'is_init')
delete_df = select_single_type(df, 'is_delete')
charade_df = select_single_type(df, 'is_charade')
lit_df = select_single_type(df, 'is_&lit')
hidden_df = select_single_type(df, 'is_hidden')
spoonerism_df = select_single_type(df, 'is_spoonerism')
palindrome_df = select_single_type(df, 'is_palindrome')

In [763]:
# Fit a Keras tokenizer on every clue text in the dataset.
# NOTE(review): `tokenizer` is not referenced again in the cells visible
# here - confirm it is still needed.
tokenizer = Tokenizer()
all_clue_texts = df['clue'].tolist()
tokenizer.fit_on_texts(all_clue_texts)

In [764]:
# Single-type frames that feed the train/validation/test pools.
# NOTE(review): charade_df is not part of this list although a charade frame
# is built above - confirm the omission is intentional.
cc_types_dfs = [
    anagram_df,
    homophone_df,
    double_df,
    cryptic_df,
    contain_df,
    reverse_df,
    alternate_df,
    init_df,
    delete_df,
    lit_df,
    hidden_df,
    spoonerism_df,
    palindrome_df,
]

In [765]:
# Names of the wordplay-type indicator columns, in label-index order.
cc_types = [
    'is_anagram',
    'is_homophone',
    'is_double',
    'is_cryptic',
    'is_contain',
    'is_reverse',
    'is_alternate',
    'is_init',
    'is_delete',
    'is_charade',
    'is_&lit',
    'is_hidden',
    'is_spoonerism',
    'is_palindrome',
]

In [766]:
# Tag each single-type frame with the name of the indicator column that is
# actually set in it. Deriving the label from the frame (rather than zipping
# the two lists positionally) keeps the tag correct even when cc_types_dfs and
# cc_types differ in length or order. The loop variable is deliberately not
# called `df`, so the master DataFrame is not overwritten.
for type_frame in cc_types_dfs:
    type_frame['category'] = next(
        (flag for flag in cc_types if type_frame[flag].any()),
        None,  # empty frame: no flag is set anywhere
    )

In [767]:
def get_input_val_test(df, train_frac=0.7, val_frac=0.2):
    """Split `df` positionally into train / validation / test partitions.

    Rows are taken in their existing order (no shuffling). The first
    ``floor(len(df) * train_frac)`` rows form the training part, the next
    ``floor(len(df) * val_frac)`` rows the validation part, and every
    remaining row goes to the test part, so rounding never drops a row.

    Parameters
    ----------
    df : pandas.DataFrame or any sliceable sequence
        Data to split.
    train_frac : float, default 0.7
        Fraction of rows assigned to the training part.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation part.

    Returns
    -------
    tuple
        ``(input_df, val_df, test_df)`` slices of `df`.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )
    length = len(df)
    input_len = math.floor(length * train_frac)
    val_end = input_len + math.floor(length * val_frac)
    return df[:input_len], df[input_len:val_end], df[val_end:]

In [768]:
# Split every single-type frame exactly once (the original recomputed the
# split three times per frame), then pool the matching parts across types and
# shuffle each pool.
type_splits = [get_input_val_test(type_frame) for type_frame in cc_types_dfs]
input_cc_types_df = pd.concat([parts[0] for parts in type_splits]).sample(frac=1)
val_cc_types_df = pd.concat([parts[1] for parts in type_splits]).sample(frac=1)
test_cc_types_df = pd.concat([parts[2] for parts in type_splits]).sample(frac=1)

In [769]:
# Row count of the largest training category; the upsampling target.
# size() counts rows per group directly, instead of taking the non-null count
# of whichever column happens to be first via positional Series indexing.
max_size = input_cc_types_df.groupby('category').size().max()

In [770]:
# Balance the training pool: for each category draw (with replacement) as many
# extra rows as it is short of max_size, and append them to the original pool.
lst = [input_cc_types_df]
lst.extend(
    members.sample(max_size - len(members), replace=True)
    for _, members in input_cc_types_df.groupby('category')
)
upsampled_input_cc_types_df = pd.concat(lst)

In [771]:
# Remove the helper 'category' column from every split. The (upsampled)
# training rows are reshuffled; validation and test rows are de-duplicated.
cc_input_df = (
    upsampled_input_cc_types_df
    .drop(['category'], axis='columns')
    .sample(frac=1)
)
cc_val_df = (
    val_cc_types_df
    .drop(['category'], axis='columns')
    .drop_duplicates()
)
cc_test_df = (
    test_cc_types_df
    .drop(['category'], axis='columns')
    .drop_duplicates()
)

In [772]:
# Collapse the indicator columns (everything from the third column onward)
# into a single integer class label per row.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# NOTE(review): assumes columns[2:] are exactly the indicator columns in
# cc_types order, and that this cell runs only once (a re-run would include
# the new 'category' column in the slice) - confirm.
cc_input_df['category'] = MagnitudeUtils.from_categorical(
    cc_input_df[cc_input_df.columns[2:]].values
)
cc_val_df['category'] = MagnitudeUtils.from_categorical(
    cc_val_df[cc_val_df.columns[2:]].values
)
cc_test_df['category'] = MagnitudeUtils.from_categorical(
    cc_test_df[cc_test_df.columns[2:]].values
)

In [773]:
# Training examples per class label (after upsampling).
cc_input_df.groupby('category')['clue'].count()

category
0     6395
1     6395
2     6395
3     6395
4     6395
5     6395
6     6395
7     6395
8     6395
10    6395
11    6395
12    6395
13    6395
Name: clue, dtype: int64

In [774]:
# Validation examples per class label.
cc_val_df.groupby('category')['clue'].count()

category
0     1827
1      296
2       90
3      447
4     1090
5      521
6       54
7      147
8     1359
10      29
11     370
12      21
13       9
Name: clue, dtype: int64

In [775]:
# Test examples per class label.
cc_test_df.groupby('category')['clue'].count()

category
0     915
1     148
2      46
3     225
4     546
5     262
6      28
7      75
8     681
10     15
11    186
12     12
13      5
Name: clue, dtype: int64

In [776]:
# Replace every clue string with its list of word tokens (Keras
# text_to_word_sequence), in all three splits.
# NOTE(review): this overwrites the 'clue' column in place, so the cell is
# meant to be executed once per kernel session.
cc_input_df['clue'] = cc_input_df['clue'].apply(text_to_word_sequence)
cc_val_df['clue'] = cc_val_df['clue'].apply(text_to_word_sequence)
cc_test_df['clue'] = cc_test_df['clue'].apply(text_to_word_sequence)

In [777]:
import numpy as np

In [778]:
# Model inputs: the tokenised clues of each split as plain Python lists.
cc_input_data = cc_input_df['clue'].tolist()
cc_val_data = cc_val_df['clue'].tolist()
cc_test_data = cc_test_df['clue'].tolist()

In [779]:
# Model targets: the integer class label of each row, per split.
cc_input_data_out = cc_input_df.loc[:, 'category']
cc_val_data_out = cc_val_df.loc[:, 'category']
cc_test_data_out = cc_test_df.loc[:, 'category']

In [780]:
# Number of examples in each split.
num_training = len(cc_input_data_out)
num_val = len(cc_val_data_out)
num_test = len(cc_test_data_out)
# Width of the output layer: the largest class label seen in ANY split, plus
# one. The validation labels are included so that a class occurring only
# there cannot fall outside the one-hot width.
num_outputs = int(max(
    np.max(split_labels)
    for split_labels in (cc_input_data_out, cc_val_data_out, cc_test_data_out)
)) + 1

In [781]:
from math import ceil

In [782]:
# Training data is split into batches of BATCH_SIZE examples.
training_batches = MagnitudeUtils.batchify(cc_input_data, cc_input_data_out, BATCH_SIZE)
# Ceiling division: number of batches needed to cover every training example.
num_batches_per_epoch_train = -(-num_training // BATCH_SIZE)

# Validation and test data are batched one example at a time, so the number
# of batches per epoch equals the number of examples.
val_batches = MagnitudeUtils.batchify(cc_val_data, cc_val_data_out, 1)
num_batches_per_epoch_val = int(num_val)
test_batches = MagnitudeUtils.batchify(cc_test_data, cc_test_data_out, 1)
num_batches_per_epoch_test = int(num_test)

In [783]:
# Lazily yields (inputs, targets) pairs for training:
#   inputs  - vectors.query() applied to the batch of tokenised clues
#   targets - the batch's class labels one-hot encoded to num_outputs columns
train_batch_generator = (
    (
        vectors.query(clue_batch),
        MagnitudeUtils.to_categorical(label_batch, num_outputs),
    )
    for clue_batch, label_batch in training_batches
)

In [784]:
# Lazily yields (inputs, targets) pairs for the validation split:
#   inputs  - vectors.query() applied to the batch of tokenised clues
#   targets - the batch's class labels one-hot encoded to num_outputs columns
val_batch_generator = (
    (
        vectors.query(clue_batch),
        MagnitudeUtils.to_categorical(label_batch, num_outputs),
    )
    for clue_batch, label_batch in val_batches
)

In [785]:
# Lazily yields (inputs, targets) pairs for the test split:
#   inputs  - vectors.query() applied to the batch of tokenised clues
#   targets - the batch's class labels one-hot encoded to num_outputs columns
test_batch_generator = (
    (
        vectors.query(clue_batch),
        MagnitudeUtils.to_categorical(label_batch, num_outputs),
    )
    for clue_batch, label_batch in test_batches
)


In [786]:
# Classifier: noisy embeddings -> one bidirectional LSTM -> dropout -> softmax
# over the clue types.
# NOTE(review): the LSTM width (300) and the optimizer settings are fixed
# here; the HIDDEN_UNITS and LEARNING_RATE constants are not used - confirm.
model = keras.Sequential([
    GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, vectors.dim)),
    Bidirectional(LSTM(300, activation='tanh')),
    Dropout(DROPOUT_RATIO),
    Dense(num_outputs, activation='softmax'),
])

model.compile(
    optimizer='RMSProp',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [None]:
# Save the full model after every epoch; the file name embeds the epoch number
# and the training/validation loss and accuracy.
filepath = "1xBilstm-Pretrained_Fasttext-{epoch:02d}-{loss:.2f}-{categorical_accuracy:.2f}-{val_loss:.2f}-{val_categorical_accuracy:.2f}-singlelabel.hdf5"
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)
callbacks_list = [checkpoint]

In [None]:
from keras.utils import np_utils
# Number of epochs actually trained (overrides the value set in the
# hyper-parameter cell).
EPOCHS = 16
# Train from the batch generator and validate on the whole validation split
# after each epoch. The validation targets are one-hot encoded with an
# explicit width of num_outputs so they always match the model's output layer,
# whatever labels happen to be present in the validation split.
history = model.fit_generator(
    generator=train_batch_generator,
    steps_per_epoch=num_batches_per_epoch_train,
    validation_data=(
        vectors.query(cc_val_data),
        MagnitudeUtils.to_categorical(cc_val_data_out, num_outputs),
    ),
    epochs=EPOCHS,
    callbacks=callbacks_list,
)

Epoch 1/16

Epoch 00001: saving model to 1xBilstm-Pretrained_Fasttext-01-1.62-0.46-1.84-0.33-singlelabel.hdf5
Epoch 2/16

Epoch 00002: saving model to 1xBilstm-Pretrained_Fasttext-02-1.11-0.63-1.91-0.36-singlelabel.hdf5
Epoch 3/16

Epoch 00003: saving model to 1xBilstm-Pretrained_Fasttext-03-0.91-0.69-1.77-0.40-singlelabel.hdf5
Epoch 4/16

Epoch 00004: saving model to 1xBilstm-Pretrained_Fasttext-04-0.78-0.73-1.80-0.41-singlelabel.hdf5
Epoch 5/16

Epoch 00005: saving model to 1xBilstm-Pretrained_Fasttext-05-0.68-0.77-1.80-0.42-singlelabel.hdf5
Epoch 6/16

In [None]:
# Loss and accuracy on the whole validation split. The targets are encoded
# with an explicit width (num_outputs) so they match the model's output layer
# even if the highest class label is absent from the validation data.
model.evaluate(
    vectors.query(cc_val_data),
    MagnitudeUtils.to_categorical(cc_val_data_out, num_outputs),
)

In [None]:
# One-hot validation targets. The explicit width (num_outputs) guarantees that
# every class index exists as a column regardless of which labels occur in the
# validation split.
rows = MagnitudeUtils.to_categorical(cc_val_data_out, num_outputs)

In [None]:
# Print the position of every validation row whose one-hot target has
# column 13 set (index 13 is 'is_palindrome' in cc_types).
for position in (idx for idx, one_hot in enumerate(rows) if one_hot[13] == 1):
    print(position)

In [None]:
# Model outputs for every validation clue.
preds = model.predict(
    vectors.query(cc_val_data)
)

In [None]:
# Display the model output for one validation row.
# NOTE(review): 6826 is a hard-coded position; the per-class validation counts
# printed earlier in this notebook sum to 6260 rows, so this index looks stale
# for the current data - confirm before relying on it.
preds[6826]

In [None]:
# Number of one-hot encoded validation targets.
val_one_hot_targets = MagnitudeUtils.to_categorical(cc_val_data_out)
len(val_one_hot_targets)

In [None]:
# Turn the per-class scores into predicted class indices. The conversion only
# happens while `preds` is still 2-D (one row of scores per example), so
# re-running this cell leaves already-converted predictions untouched instead
# of silently collapsing them all to 0.
pred_scores = np.asarray(preds)
if pred_scores.ndim == 2:
    preds = list(pred_scores.argmax(axis=1))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted validation labels.
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
cnf_matrix = confusion_matrix(cc_val_data_out.values, preds)

In [None]:
# Print the position of every validation example whose true label is 13
# (index 13 is 'is_palindrome' in cc_types).
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
for i, true_label in enumerate(cc_val_data_out.values):
    if true_label == 13:
        print(i)

In [None]:
import itertools
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# No matplotlib rcParams are changed here: the earlier
# `rcParams['interactive'] == True` line was a comparison whose result was
# discarded, so dropping it leaves behaviour unchanged. The example
# boilerplate `print(__doc__)` was dropped as well.

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as a heat map on the current figure.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence of str
        Tick labels, one per row/column of `cm`, in matrix order.
    normalize : bool, default False
        If True, every row is divided by its sum so cells show the fraction
        of each true class. Rows that sum to zero (a class with no true
        samples) are left as zeros rather than producing NaN.
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colour map for the heat map.

    Returns
    -------
    None
        Drawing goes to the current pyplot figure; the matrix is also printed.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Substitute 1 for empty rows: 0 / 1 == 0 avoids a divide-by-zero.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell annotations: two decimals for fractions, plain integers for counts.
    fmt = '.2f' if normalize else 'd'
    # Switch the text colour at half the maximum so it stays readable on both
    # light and dark cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Print matrices with two decimals.
np.set_printoptions(precision=2)

# Tick labels must line up with the matrix rows. When confusion_matrix is
# called without `labels`, it contains one row per label that actually occurs
# in the true or predicted values (sorted), which can be fewer than
# len(cc_types). In that case pick only the names of the labels present.
if cnf_matrix.shape[0] == len(cc_types):
    class_names = cc_types
else:
    labels_present = sorted(set(cc_val_data_out) | set(preds))
    class_names = [cc_types[int(label)] for label in labels_present]

# Plot non-normalized confusion matrix
plt.figure(figsize=(15,15))
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure(figsize=(15,15))
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()