In [1]:
%config InlineBackend.figure_format = 'retina'

In [2]:
%load_ext autoreload

%autoreload 1

In [3]:
import numpy as np
import pandas as pd

import pickle

from pathlib import Path

# Load data

In [4]:
data_root = Path.home() / "data" / "tmp"
reuters_dir = data_root / "reuters21578"
reuters_corpus_path = reuters_dir / "corpus.pkl"

# Open the pickle inside a context manager so the file handle is closed as soon
# as loading finishes (a bare open() leaves that to garbage collection).
# NOTE: unpickling can execute arbitrary code; only load a corpus file you trust.
with open(reuters_corpus_path, "rb") as corpus_file:
    reuters = pickle.load(corpus_file)
top_ten_ids, top_ten_names = reuters.top_n(n=10)

cache_dir = reuters_dir / "cache"

# Build dataframe

In [5]:
# build_dataframe() returns four values. Note that this rebinds top_ten_ids,
# replacing the value obtained from reuters.top_n(n=10) in the loading cell.
df, top_ten_ids, train_labels, test_labels = reuters.build_dataframe()

In [6]:
df.head()

Unnamed: 0,modapte,category,label,date,title,dateline,body,newid,wd_name
0,train,interest,0,1987-03-11 18:14:49,U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,4005,Wednesday
1,train,earn,3,1987-03-11 18:36:05,BANK OF BRITISH COLUMBIA 1ST QTR JAN 31 NET,"VANCOUVER, British Columbia, March 11 -\n",Oper shr loss two cts vs profit three cts\n ...,4012,Wednesday
2,train,earn,3,1987-03-11 18:38:02,RESTAURANT ASSOCIATES INC <RA> 4TH QTR JAN 3,"NEW YORK, March 11 -\n",Shr 25 cts vs 36 cts\n Net 1.4 mln vs 1.4 m...,4014,Wednesday
3,train,earn,3,1987-03-11 18:41:59,MICHIGAN GENERAL CORP <MGL> 4TH QTR,"SADDLE BROOK, N.J., March 11 -\n",Shr loss 1.02 dlrs vs 1.01 dlr\n Net loss 1...,4015,Wednesday
4,train,crude,4,1987-03-11 18:45:36,"USX <X> PROVED OIL, GAS RESERVES FALL IN 1986","NEW YORK, March 11 -",USX Corp said proved reserves of oil\nand natu...,4016,Wednesday


# Build feature extraction pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC

In [8]:
from ds_tutorial.transformers import TextFromColumns, TextStats, ColumnSelector, TextFromColumns2

In [9]:
# Split rows on the `modapte` column; the `label` column is the target vector.
df_train = df[df["modapte"] == "train"]
df_test = df[df["modapte"] == "test"]
y_train = df_train["label"].values
y_test = df_test["label"].values

In [23]:
def _stats_branch(column):
    """Return the per-column branch: ColumnSelector -> TextStats -> StandardScaler."""
    return Pipeline([
        ("column", ColumnSelector(column)),
        ("stats", TextStats()),
        ("scaled", StandardScaler()),
    ])


# Three feature branches are concatenated by the FeatureUnion: text statistics
# of the title, text statistics of the body, and TF-IDF of the combined text
# reduced to 300 components by truncated SVD.
# NOTE(review): scikit-learn's Pipeline does not cache its final step, and
# "union" is the only step here, so `memory` probably has no caching effect —
# confirm against the Pipeline documentation.
pipeline = Pipeline(
    memory=str(cache_dir),
    steps=[
        ("union", FeatureUnion(transformer_list=[
            ("title_stats", _stats_branch("title")),
            ("body_stats", _stats_branch("body")),
            ("combined_text", Pipeline([
                ("column", TextFromColumns2()),
                # Alternative vectorizer kept for reference:
                # ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5))),
                ("tfidf", TfidfVectorizer()),
                ("best", TruncatedSVD(n_components=300, random_state=2018)),
            ])),
        ])),
    ],
)

In [24]:
%%time
# Fit the feature pipeline on the training rows only, then apply the already
# fitted pipeline to the test rows.
X_train = pipeline.fit_transform(df_train)
X_test = pipeline.transform(df_test)

CPU times: user 18 s, sys: 1.49 s, total: 19.5 s
Wall time: 8.27 s


## Build multi-layer perceptron

In [25]:
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

In [26]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Creates a multi-layer perceptron as a Sequential model.

    # Arguments
        layers: int, number of Dense layers including the output layer
            (`layers - 1` hidden Dense layers are added).
        units: int, output dimension of each hidden Dense layer.
        dropout_rate: float, rate passed to every Dropout layer.
        input_shape: tuple, shape of input to the model.
        num_classes: int, number of units in the softmax output layer.

    # Returns
        An MLP model instance (not compiled).
    """
    network = models.Sequential()
    # Dropout is applied directly to the input, ahead of the first Dense layer.
    network.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    hidden_count = layers - 1
    for _ in range(hidden_count):
        network.add(Dense(units=units, activation='relu'))
        network.add(Dropout(rate=dropout_rate))

    network.add(Dense(units=num_classes, activation="softmax"))
    return network

In [27]:
# Same arguments as before, spelled out by keyword so each number is labelled.
model = mlp_model(
    layers=3,
    units=32,
    dropout_rate=0.2,
    input_shape=X_train.shape[1:],
    num_classes=75,
)

In [28]:
# Targets are integer class ids (y_train comes from the `label` column), hence
# the sparse variant of categorical cross-entropy; accuracy is tracked as 'acc'.
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=['acc'])

In [31]:
# Train for a fixed 70 epochs. No validation data or callbacks are passed, so
# nothing is monitored on held-out data during training.
history = model.fit(X_train, y_train, epochs=70)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [42]:
# Predicted class id = argmax over the softmax outputs. Sequential.predict_classes
# did the same for multi-unit outputs but is not available in newer TensorFlow
# releases, so the argmax is taken explicitly.
y_pred = np.argmax(model.predict(X_test), axis=-1)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

             precision    recall  f1-score   support

       earn      0.971     0.989     0.980      1087
        acq      0.951     0.979     0.965       710
   money-fx      0.681     0.855     0.758       145
      grain      0.366     0.357     0.361        42
      crude      0.725     0.902     0.804       164
      trade      0.744     0.826     0.783       109
   interest      0.804     0.701     0.749       117
       ship      0.625     0.493     0.551        71
      wheat      0.648     0.636     0.642        55
       corn      0.405     0.667     0.504        45

avg / total      0.879     0.915     0.895      2545



In [30]:
# Predicted class id = argmax over the softmax outputs. Sequential.predict_classes
# did the same for multi-unit outputs but is not available in newer TensorFlow
# releases, so the argmax is taken explicitly.
y_pred = np.argmax(model.predict(X_test), axis=-1)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

             precision    recall  f1-score   support

       earn      0.965     0.984     0.974      1087
        acq      0.899     0.975     0.935       710
   money-fx      0.661     0.821     0.732       145
      grain      0.000     0.000     0.000        42
      crude      0.714     0.823     0.765       164
      trade      0.692     0.844     0.760       109
   interest      0.780     0.726     0.752       117
       ship      0.405     0.690     0.510        71
      wheat      0.440     0.673     0.532        55
       corn      0.340     0.711     0.460        45

avg / total      0.839     0.908     0.869      2545



In [155]:
# Predicted class id = argmax over the softmax outputs. Sequential.predict_classes
# did the same for multi-unit outputs but is not available in newer TensorFlow
# releases, so the argmax is taken explicitly.
y_pred = np.argmax(model.predict(X_test), axis=-1)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

             precision    recall  f1-score   support

       earn      0.975     0.986     0.981      1087
        acq      0.936     0.968     0.952       710
   money-fx      0.738     0.834     0.783       145
      grain      0.486     0.405     0.442        42
      crude      0.765     0.872     0.815       164
      trade      0.736     0.817     0.774       109
   interest      0.816     0.795     0.805       117
       ship      0.688     0.620     0.652        71
      wheat      0.760     0.691     0.724        55
       corn      0.706     0.800     0.750        45

avg / total      0.894     0.919     0.906      2545



In [28]:
%%time
# Linear SVM baseline on the same feature matrices. It gets its own name so the
# Keras `model` is not silently replaced by a scikit-learn estimator, and a
# fixed seed (same value as the SVD step) so the fit is repeatable.
svc = LinearSVC(random_state=2018)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

             precision    recall  f1-score   support

       earn      0.975     0.988     0.981      1087
        acq      0.921     0.975     0.947       710
   money-fx      0.753     0.800     0.776       145
      grain      0.515     0.405     0.453        42
      crude      0.761     0.835     0.797       164
      trade      0.738     0.853     0.791       109
   interest      0.767     0.786     0.776       117
       ship      0.629     0.620     0.624        71
      wheat      0.776     0.691     0.731        55
       corn      0.630     0.756     0.687        45

avg / total      0.886     0.918     0.901      2545

CPU times: user 30.1 s, sys: 77.9 ms, total: 30.2 s
Wall time: 30.3 s


# Sequential model

In [None]:
# Shape of the current training matrix (slicing the tuple with [:] adds nothing).
X_train.shape

In [49]:
# Scratch experiment: a Sequential model holding a single Embedding layer.
# Embedding is imported here because, reading top to bottom, the notebook has
# only imported Dense and Dropout from the Keras layers module at this point.
from tensorflow.python.keras.layers import Embedding

model = models.Sequential()
model.add(Embedding(
    # NOTE(review): in Keras, input_dim is the vocabulary size; using the column
    # count of X_train looks unintended — confirm what was meant here.
    input_dim=X_train.shape[1],
    input_shape=X_train.shape[1:],
    output_dim=32,
    # input_length must be the integer sequence length; the original passed the
    # first sample row (X_train[0]) instead.
    input_length=X_train.shape[1])
)

In [37]:
def _get_last_layer_units_and_activation(num_classes):
    """Gets the # units and activation function for the last network layer.

    # Arguments
        num_classes: int, number of classes.

    # Returns
        units, activation values.
    """
    if num_classes == 2:
        activation = 'sigmoid'
        units = 1
    else:
        activation = 'softmax'
        units = num_classes
    return units, activation

In [51]:
from tensorflow.python.keras import models
from tensorflow.python.keras import initializers
from tensorflow.python.keras import regularizers

from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import SeparableConv1D
from tensorflow.python.keras.layers import MaxPooling1D
from tensorflow.python.keras.layers import GlobalAveragePooling1D

def sepcnn_model(blocks,
                 filters,
                 kernel_size,
                 embedding_dim,
                 dropout_rate,
                 pool_size,
                 input_shape,
                 num_classes,
                 num_features,
                 use_pretrained_embedding=False,
                 is_embedding_trainable=False,
                 embedding_matrix=None):
    """Builds a separable-convolution (sepCNN) classifier as a Sequential model.

    Layer order: Embedding, then `blocks - 1` repetitions of
    (Dropout, SeparableConv1D x2, MaxPooling1D), then SeparableConv1D x2 with
    `filters * 2` filters, GlobalAveragePooling1D, Dropout and the output Dense.

    # Arguments
        blocks: int, number of pairs of sepCNN and pooling blocks in the model.
        filters: int, output dimension of the convolution layers.
        kernel_size: int, length of the convolution window.
        embedding_dim: int, dimension of the embedding vectors.
        dropout_rate: float, rate passed to every Dropout layer.
        pool_size: int, pool size passed to the MaxPooling1D layers.
        input_shape: tuple, shape of input to the model; only `input_shape[0]`
            is read (used as the embedding's input length).
        num_classes: int, number of output classes.
        num_features: int, embedding input dimension.
        use_pretrained_embedding: bool, when true the embedding layer receives
            `weights=[embedding_matrix]` and `trainable=is_embedding_trainable`.
        is_embedding_trainable: bool, only used with a pre-trained embedding.
        embedding_matrix: pre-trained embedding weights, only used when
            `use_pretrained_embedding` is true.

    # Returns
        A sepCNN model instance (not compiled).
    """
    op_units, op_activation = _get_last_layer_units_and_activation(num_classes)

    def separable_conv(n_filters):
        # All convolutions share these settings; only the filter count differs
        # between the repeated blocks and the final pair.
        return SeparableConv1D(filters=n_filters,
                               kernel_size=kernel_size,
                               activation='relu',
                               bias_initializer='random_uniform',
                               depthwise_initializer='random_uniform',
                               padding='same')

    embedding_kwargs = dict(input_dim=num_features,
                            output_dim=embedding_dim,
                            input_length=input_shape[0])
    if use_pretrained_embedding:
        # Start from the supplied weights; the caller decides via the flag
        # whether they are updated during training.
        embedding_kwargs.update(weights=[embedding_matrix],
                                trainable=is_embedding_trainable)

    model = models.Sequential()
    model.add(Embedding(**embedding_kwargs))

    for _ in range(blocks - 1):
        model.add(Dropout(rate=dropout_rate))
        model.add(separable_conv(filters))
        model.add(separable_conv(filters))
        model.add(MaxPooling1D(pool_size=pool_size))

    # Final convolution pair uses twice the filters and is followed by global
    # average pooling instead of max pooling.
    doubled_filters = filters * 2
    model.add(separable_conv(doubled_filters))
    model.add(separable_conv(doubled_filters))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(rate=dropout_rate))
    model.add(Dense(op_units, activation=op_activation))
    return model

In [63]:
#TextFromColumns2().transform(df_train)

In [83]:
from tensorflow.python.keras.preprocessing import text
from tensorflow.python.keras.preprocessing import sequence

MAX_SEQUENCE_LENGTH = 500  # upper bound on the padded sequence length
TOP_K = 20000  # passed to the tokenizer as num_words

tokenizer = text.Tokenizer(num_words=TOP_K)

# The vocabulary is fitted on the training text only.
train_text = TextFromColumns2().transform(df_train)
tokenizer.fit_on_texts(train_text)

# NOTE: from here on X_train / X_test hold token-id sequences, replacing the
# feature matrices produced by the feature-extraction pipeline cell.
X_train = tokenizer.texts_to_sequences(train_text)
test_text = TextFromColumns2().transform(df_test)
X_test = tokenizer.texts_to_sequences(test_text)

# Longest training sequence, capped at MAX_SEQUENCE_LENGTH.
max_length = min(max(len(seq) for seq in X_train), MAX_SEQUENCE_LENGTH)

# Bring every sequence to max_length. Shorter sequences are padded at the
# beginning and longer ones are truncated at the beginning.
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)

In [84]:
# Embedding input dimension handed to sepcnn_model: vocabulary size capped at TOP_K.
# presumably the +1 accounts for index 0, which the tokenizer does not assign to a word — confirm.
num_features = min(len(tokenizer.word_index) + 1, TOP_K)

In [85]:
X_train.shape, X_test.shape, num_features

((7770, 500), (3019, 500), 20000)

In [86]:
# blocks=2 yields one (Dropout, 2x SeparableConv1D, MaxPooling1D) block ahead of
# the final convolution pair, because sepcnn_model repeats it `blocks - 1` times.
# NOTE(review): num_classes=75 is hard-coded here and in the MLP cell — presumably
# the number of distinct label ids; confirm, or derive it from the data.
model = sepcnn_model(
    blocks=2,
    filters=64,
    kernel_size=3,
    embedding_dim=200,
    dropout_rate=0.2,
    pool_size=3,
    input_shape=X_train.shape[1:],
    num_classes=75,
    num_features=num_features
)

In [87]:
# Same optimizer and loss settings as the MLP: integer class ids as targets,
# sparse categorical cross-entropy, accuracy tracked as 'acc'.
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=['acc'])

In [88]:
import tensorflow as tf
# Early stopping keyed on validation loss with a patience of 2 epochs.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

In [None]:
%%time
# NOTE(review): validation_data is the test split and the EarlyStopping callback
# monitors val_loss, so the stopping epoch is selected using test data. Consider
# holding out part of the training set for validation so that metrics reported
# on the test split stay unbiased.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
    batch_size=128
)

Train on 7770 samples, validate on 3019 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10