# Classifying poems as word sequences

Let us first load the dataset again.

In [None]:
import numpy as np
import pandas as pd

EXTRACT = 'selected_poems.json.bz2'
# Characters that survive cleaning: German letters (incl. umlauts and ß),
# space, basic punctuation and the newline.
# The upper-case run previously skipped 'J', which silently removed every
# capital J from the poems (e.g. "Jahr" became "ahr").
ALPHABET = 'abcdefghijklmnopqrstuvwxyzäöüßABCDEFGHIJKLMNOPQRSTUVWXZYÄÖÜ .,;:!?-()"\'\n'
_ALPHABET_SET = frozenset(ALPHABET)  # constant-time membership tests

def clean_text(text):
    """Return *text* with every character that is not in ``ALPHABET`` removed.

    The relative order of the remaining characters is preserved; an empty
    string yields an empty string.
    """
    return ''.join(char for char in text if char in _ALPHABET_SET)

# Read the compressed JSON extract, then store a cleaned copy of each poem's
# text (see clean_text) in a separate column.
poems = pd.read_json(EXTRACT, compression='infer')
poems['cleaned_text'] = poems['text'].map(clean_text)

## Prepare the data

### Encode poems as word sequences

To represent the poems numerically as **sequences of words** instead of characters, we need to

1. split up each poem into a sequence of words, or tokens,
2. determine the vocabulary in order to enumerate the tokens,
3. replace each token in each poem by its index in the vocabulary.

These routine preprocessing tasks can be delegated to libraries like [Keras](https://keras.io) or [scikit-learn](https://scikit-learn.org). Here, we will use the former's text preprocessing class `Tokenizer`, which is not well documented in Keras itself but is covered by the [TensorFlow documentation](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

In [None]:
from keras.preprocessing.text import Tokenizer

NUM_WORDS = 10000

# lower=False keeps the original casing instead of lower-casing the whole text.
tokenizer = Tokenizer(lower=False, num_words=NUM_WORDS)
tokenizer.fit_on_texts(poems['cleaned_text'])
# Replace every poem by the list of vocabulary indices of its words.
poems['word_seq'] = tokenizer.texts_to_sequences(poems['cleaned_text'])
poems.loc[:, ['cleaned_text', 'word_seq']].head()

Let us have a look at the size of the vocabulary:

In [None]:
# Size of the tokenizer's `word_docs` mapping; used further down as the
# vocabulary size when the embedding layer is built.
# NOTE(review): this appears to count every word seen while fitting, not just
# the NUM_WORDS most frequent ones — confirm against the Tokenizer docs.
nr_words = len(tokenizer.word_docs)
nr_words

Let us have a look at the lengths of the poems:

In [None]:
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

ax = poems['word_seq'].apply(len).plot.hist(bins=50)

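Next, we will use the convenience function `pad_sequences` of [Keras](https://keras.io) to pad or truncate all sequences to a common length of 300 words. This step is part of the helper function defined in the next section.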

### Streamline remaining preparation steps

Now, we need to one-hot-encode the authors and shuffle and split the training data and labels again. To streamline this process, we write a short function:

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

def data_from_column(column_name, max_len, train_ratio=0.7, random_state=None):
    """Build shuffled train/test splits of features and one-hot author labels.

    Parameters
    ----------
    column_name : str
        Column of the global ``poems`` frame that holds the encoded poems.
    max_len : int or None
        Common sequence length handed to ``pad_sequences``. With ``None``
        the column values are used unchanged.
    train_ratio : float
        Share of the samples that goes into the training split
        (passed to ``train_test_split`` as ``train_size``).
    random_state : int or None
        Seed for the shuffle; pass an integer for a reproducible split.
        The default ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    tuple
        ``(splits, short_authors)`` where ``splits`` is the
        ``train_test_split`` result ``[X_train, X_test, y_train, y_test]``
        and ``short_authors`` holds, per label column, the part of the
        author name before the first comma.
    """
    if max_len is None:
        X = poems[column_name].values
    else:
        X = pad_sequences(poems[column_name], maxlen=max_len)
    # Pin the dtype: the default of get_dummies differs between pandas
    # versions (uint8 vs. bool); a float array is safe to use as label tensor.
    authors_ohe = pd.get_dummies(poems['author'], dtype='float32')
    y = authors_ohe.values
    short_authors = [author.split(',')[0] for author in authors_ohe.columns]
    splits = train_test_split(
        X, y, train_size=train_ratio, random_state=random_state
    )
    return splits, short_authors

In [None]:
MAX_LEN = 300

# Pass the constant instead of repeating the literal, so the padded data and
# the default input length of build_model cannot drift apart.
(X_train, X_test, y_train, y_test), authors = data_from_column('word_seq', MAX_LEN)
X_train.shape, y_test.shape, authors

## Train and test a convolutional neural network

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, Dense, Embedding, GlobalMaxPooling1D

DIM = 96

def build_model(max_len=MAX_LEN, num_classes=3):
    """Return an uncompiled 1D-convolutional classifier for word sequences.

    Parameters
    ----------
    max_len : int
        Length of the (padded) input sequences.
    num_classes : int
        Number of output classes, i.e. units of the softmax layer.

    Returns
    -------
    Sequential
        Embedding -> Conv1D -> global max pooling -> softmax.
    """
    # The tokenizer was created with num_words=NUM_WORDS, so it only emits
    # indices below NUM_WORDS; larger embeddings would just carry unused rows.
    # For a vocabulary smaller than that, nr_words + 1 rows are enough
    # (index 0 is the padding value).
    vocab_size = min(NUM_WORDS, nr_words + 1)
    return Sequential([
        Embedding(vocab_size, DIM, input_shape=(max_len,)),
        Conv1D(96, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(num_classes, activation='softmax'),
    ])

In [None]:
def train_model(model, epochs=5, batch_size=8):
    """Compile *model*, fit it on the global training split and report the run.

    A fifth of the training data is held out for validation during fitting.
    Returns the fitted model together with a DataFrame built from the
    per-epoch history of the fit.
    """
    model.compile(
        optimizer='Adadelta',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    fit_result = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
    )
    history_frame = pd.DataFrame(fit_result.history)
    return model, history_frame

In [None]:
# Build a network with the default arguments and train it with the default
# settings of train_model; keep both the model and its training history.
model, history = train_model(build_model())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

def plot_history(history):
    """Plot the loss and accuracy curves of a training run side by side.

    Parameters
    ----------
    history : pandas.DataFrame
        Frame with the columns ``loss`` and ``val_loss`` plus the accuracy
        columns, named either ``acc``/``val_acc`` or
        ``accuracy``/``val_accuracy``.
    """
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    history[['loss', 'val_loss']].plot.line(ax=ax1)
    # Depending on the Keras version the accuracy metric is logged as 'acc'
    # or as 'accuracy'; pick whichever is present instead of failing.
    acc_key = 'acc' if 'acc' in history.columns else 'accuracy'
    history[[acc_key, 'val_' + acc_key]].plot.line(ax=ax2)

plot_history(history)

In [None]:
from sklearn import metrics

def validate(model):
    """Print a classification report and a confusion matrix for the test split.

    Predictions are made on the global ``X_test`` and compared with the
    one-hot labels in the global ``y_test``. Rows of the printed matrix are
    the true authors, columns the predicted ones.
    """
    authors = [author.split(',')[0] for author in pd.get_dummies(poems['author']).columns]
    # Fix the label set explicitly: a class that is never predicted (or is
    # missing from the test split) must still get its row and column,
    # otherwise the author names no longer line up with the matrix.
    labels = list(range(len(authors)))
    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_res = np.argmax(y_test, axis=1)
    print(metrics.classification_report(y_res, y_pred, labels=labels, target_names=authors))
    cm = pd.DataFrame(
        metrics.confusion_matrix(y_res, y_pred, labels=labels),
        index=authors,
        columns=authors,
    )
    print(cm)

validate(model)