In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the train / validation / test CSVs; the paths are relative to the notebook's working directory.
# NOTE(review): the directory and variable names suggest the training split was upsampled upstream —
# confirm against whatever produced these files.
upsampled_input_cc_types_df = pd.read_csv("../upsampled_train_val_test/train_no_charade.csv")
val_cc_types_df = pd.read_csv("../upsampled_train_val_test/val_no_charade.csv")
test_cc_types_df = pd.read_csv("../upsampled_train_val_test/test_no_charade.csv")

In [3]:
# De-duplicated copy of the training frame; drop_duplicates() returns a new DataFrame, so
# upsampled_input_cc_types_df itself is left unchanged.
# NOTE(review): none of the cells visible here read input_cc_types_df afterwards (features and labels
# are built from the upsampled frame) — confirm it is used elsewhere, or remove it.
input_cc_types_df=upsampled_input_cc_types_df.drop_duplicates()

In [4]:
# Characters the tokenizer strips. '!', '.', ',' and '-' are deliberately absent so they survive.
# Raw string: the backslash before ']' is a literal member of the filter set; in a plain string
# literal "\]" is an invalid escape sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
tokenizer = Tokenizer(filters=r'"#$%&()*+/:;<=>?@[\]^_`{|}~')
# NOTE(review): the vocabulary is fit on train + val + test clues together; fit on the training
# frame only if held-out text must not influence the feature space.
# NOTE(review): this fit sees the raw clues, while the matrices built later are computed after
# addSpaceBeforePunct has rewritten the 'clue' columns — confirm the mismatch is intended.
tokenizer.fit_on_texts(pd.concat([upsampled_input_cc_types_df,val_cc_types_df,test_cc_types_df])['clue'])

In [5]:
# Encode the training clues as a matrix using the fitted tokenizer.
# NOTE(review): this runs before addSpaceBeforePunct is applied to the 'clue' column, and input_bow is
# not read again in the visible cells (inputs_X is built later from the rewritten clues) — likely stale.
input_bow = tokenizer.texts_to_matrix(upsampled_input_cc_types_df['clue'])

In [6]:
def addSpaceBeforePunct(s):
    """Surround punctuation marks with spaces so each becomes its own whitespace-separated token.

    Every occurrence of ``.``, ``,``, ``!``, ``?``, ``(`` or ``)`` is wrapped in single spaces,
    then every run of two or more whitespace characters is collapsed to one space.
    A single leading or trailing space produced by the padding is not stripped.

    Args:
        s: the text to rewrite (a ``str``).

    Returns:
        The rewritten string.
    """
    # Raw strings for both patterns: '\s' in a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on newer Pythons).
    s = re.sub(r'([.,!?()])', r' \1 ', s)
    s = re.sub(r'\s{2,}', ' ', s)
    return s

In [7]:
# Rewrite the 'clue' column of every split in place so punctuation marks are space-separated.
for _split_df in (upsampled_input_cc_types_df, val_cc_types_df, test_cc_types_df):
    _split_df['clue'] = _split_df['clue'].apply(addSpaceBeforePunct)

In [8]:
# Encode each split with the already-fitted tokenizer; targets are listed in the same
# order as the frames (train, validation, test).
inputs_X, val_X, test_X = (
    tokenizer.texts_to_matrix(split_df['clue'])
    for split_df in (upsampled_input_cc_types_df, val_cc_types_df, test_cc_types_df)
)

In [24]:
# Clue-type labels; the position of each name in this list is its integer class id.
# Kept as an explicit list rather than a tab-delimited string: embedded tab characters are
# invisible and are easily converted to spaces by editors, which would silently break a split.
cc_types = [
    'is_anagram',
    'is_homophone',
    'is_double',
    'is_cryptic',
    'is_contain',
    'is_reverse',
    'is_alternate',
    'is_init',
    'is_delete',
    'is_&lit',
    'is_hidden',
    'is_spoonerism',
    'is_palindrome',
]

In [25]:
# Map each clue-type name to its position in cc_types (used as the integer class label).
cc_types_dict = dict(zip(cc_types, range(len(cc_types))))

In [26]:
# Translate every row's 'category' value into its integer class id, one list per split
# (train, validation, test). A category missing from cc_types_dict raises KeyError.
inputs_Y, val_Y, test_Y = (
    [cc_types_dict[category] for category in split_df['category'].values]
    for split_df in (upsampled_input_cc_types_df, val_cc_types_df, test_cc_types_df)
)

In [27]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix


In [30]:
# One-vs-rest classifier wrapping a multinomial naive Bayes estimator with default settings
# (the repr printed after fitting shows alpha=1.0, fit_prior=True, class_prior=None).
ovr = OneVsRestClassifier(MultinomialNB())

In [31]:
# Train on the training-split feature matrix and integer labels; the fitted estimator is echoed as cell output.
ovr.fit(X=inputs_X, y=inputs_Y)

OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
          n_jobs=1)

In [32]:
# Score the fitted classifier on the validation split (scikit-learn classifiers report accuracy here).
ovr.score(X=val_X, y=val_Y)

0.5033112582781457

In [33]:
# Predicted integer class id for every validation clue.
val_pred = ovr.predict(X=val_X)

In [34]:
# Confusion matrix over the validation split: true labels against predicted labels.
cnf_matrix = confusion_matrix(y_true=val_Y, y_pred=val_pred)

In [35]:
# NOTE(review): inside this notebook __doc__ is the interactive module's docstring (the saved output
# shows "Automatically created module for IPython interactive environment"), so this line looks like
# a leftover from a copied script — consider removing it.
print(__doc__)

import itertools
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# NOTE(review): confusion_matrix is already imported in an earlier cell; this re-import is redundant.
from sklearn.metrics import confusion_matrix

# NOTE(review): the return value is discarded (this is not the cell's last expression), so the call
# has no visible effect here — presumably a debugging leftover.
matplotlib.matplotlib_fname()

def plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts. Row ``i`` / column ``j`` is drawn on the 'True label' /
            'Predicted label' axis respectively.
        classes: tick labels for both axes, in matrix order.
        normalize: when True, each row is divided by its row sum before printing and
            plotting. A row whose sum is zero is shown as all zeros instead of NaN.
        title: title placed above the plot.
        cmap: colormap handed to ``plt.imshow``.

    Returns:
        None. The matrix is printed to stdout and drawn via the pyplot state machine;
        the caller is responsible for creating, showing or saving the figure.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Guard the division: an all-zero row would otherwise produce NaNs, which also turn
        # cm.max() (and therefore the text-colour threshold below) into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions are shown with two decimals, raw counts as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the annotation stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Print matrices with two decimals and label both axes with the clue-type names.
np.set_printoptions(precision=2)
class_names = cc_types

# Draw and save the raw-count matrix first, then the row-normalized one, each on its own
# figure created with figsize=(15,15).
for _normalize, _title, _outfile in (
    (False, 'One vs Rest - Confusion matrix, without normalization', "ovr_unnormalized"),
    (True, 'One vs Rest - Normalized confusion matrix', "ovr_normalized"),
):
    plt.figure(figsize=(15,15))
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=_normalize, title=_title)
    plt.savefig(_outfile)

Automatically created module for IPython interactive environment
Confusion matrix, without normalization
[[ 997   31   24  222  190  187   12  164  354   42  173    7    8]
 [   3  216   69   13   45   17    3    1    4    3    3    2    0]
 [  10    9    9   18   13    7    4    3    9    3   11    4    0]
 [  71   37   74  162   91   19   28    5   50   14   11    5    8]
 [  61   11    0   24 1427   38   14   21   30    1   10    0    0]
 [  83    3   19    2   61  414    2    4  113   11    1    0    1]
 [   0    0    0    0    0    0  129    0    0    0    0    0    0]
 [   4   12    3    3   12    9    2  101   28    1    5    0    1]
 [  29   59   16   77  406  101  234   68  724   16  179   13    7]
 [   5    5    1    5    2    5    1    3    2    2    2    0    0]
 [  70   11   24    6  121   65    6   30   23    4  128    0    0]
 [   0    0    0    0    0    0    0    0    0    0    1   21    0]
 [   1    1    0    0    0    4    0    0    1    0    0    0    2]]
Normalized