In [None]:
# Ask the inline plotting backend to emit 'retina' (high-DPI) figures.
%config InlineBackend.figure_format = 'retina'

In [None]:
%load_ext autoreload

%autoreload 1

In [None]:
import numpy as np
import pandas as pd

import pickle

from pathlib import Path

# Load data

In [None]:
# Locations of the pickled Reuters-21578 corpus below the user's home directory.
data_root = Path.home() / "data" / "tmp"
reuters_dir = data_root / "reuters21578"
reuters_corpus_path = reuters_dir / "corpus.pkl"

# Read the corpus through a context manager so the file handle is closed as
# soon as unpickling finishes instead of lingering until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load a corpus file that
# was produced locally or comes from a trusted source.
with open(reuters_corpus_path, "rb") as corpus_file:
    reuters = pickle.load(corpus_file)

# top_n(n=10) returns a pair that is unpacked into topic ids and topic names
# (presumably the ten most frequent topics -- confirm against the corpus class).
top_ten_ids, top_ten_names = reuters.top_n(n=10)

# Directory passed to the feature pipeline as its `memory` (cache) location.
cache_dir = reuters_dir / "cache"

# Build dataframe

In [None]:
# Split the corpus with split_modapte() and fetch the labels of each part,
# restricted to the selected top-ten topic ids (train first, then test).
train_docs, test_docs = reuters.split_modapte()
train_labels, test_labels = (
    reuters.get_labels(split_docs, set(top_ten_ids))
    for split_docs in (train_docs, test_docs)
)
# Single list holding the training documents followed by the test documents.
docs = train_docs + test_docs

In [None]:
from ds_tutorial.datasets import build_reuters_dataframe

# Build the document dataframe. The call rebinds top_ten_ids, train_labels and
# test_labels, i.e. it overwrites its own inputs: re-running this cell on its
# own feeds the previously returned values back in, so re-run the cell that
# computes the labels first.
df, top_ten_ids, train_labels, test_labels = build_reuters_dataframe(
    docs,
    reuters.topics,
    train_labels,
    test_labels,
    top_ten_ids,
)

In [None]:
# Peek at the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,modapte,category,label,date,title,dateline,body,newid,wd_name
0,train,interest,0,1987-03-11 18:14:49,U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,4005,Wednesday
1,train,earn,3,1987-03-11 18:36:05,BANK OF BRITISH COLUMBIA 1ST QTR JAN 31 NET,"VANCOUVER, British Columbia, March 11 -\n",Oper shr loss two cts vs profit three cts\n ...,4012,Wednesday
2,train,earn,3,1987-03-11 18:38:02,RESTAURANT ASSOCIATES INC <RA> 4TH QTR JAN 3,"NEW YORK, March 11 -\n",Shr 25 cts vs 36 cts\n Net 1.4 mln vs 1.4 m...,4014,Wednesday
3,train,earn,3,1987-03-11 18:41:59,MICHIGAN GENERAL CORP <MGL> 4TH QTR,"SADDLE BROOK, N.J., March 11 -\n",Shr loss 1.02 dlrs vs 1.01 dlr\n Net loss 1...,4015,Wednesday
4,train,crude,4,1987-03-11 18:45:36,"USX <X> PROVED OIL, GAS RESERVES FALL IN 1986","NEW YORK, March 11 -",USX Corp said proved reserves of oil\nand natu...,4016,Wednesday


# Build feature extraction pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC

In [None]:
from ds_tutorial.transformers import TextFromColumns, TextStats, ColumnSelector, TextFromColumns2

In [None]:
# Partition the rows on the `modapte` column and expose the `label` column of
# each part as a plain array for the estimators.
df_train = df[df["modapte"] == "train"]
df_test = df[df["modapte"] == "test"]
y_train = df_train["label"].values
y_test = df_test["label"].values

In [None]:
def _text_stats_branch(column):
    """Return a sub-pipeline: select `column`, apply TextStats, then StandardScaler."""
    return Pipeline([
        ("column", ColumnSelector(column)),
        ("stats", TextStats()),
        ("scaled", StandardScaler()),
    ])


# Three parallel feature branches that FeatureUnion concatenates side by side:
# scaled text statistics of the title, the same for the body, and TF-IDF over
# the text taken from both columns.
# Alternatives left disabled in the TF-IDF branch:
#   TfidfVectorizer(analyzer="char_wb", ngram_range=(1, 5))
#   a trailing ("best", TruncatedSVD(n_components=300, random_state=2018)) step
feature_branches = [
    ("title_stats", _text_stats_branch("title")),
    ("body_stats", _text_stats_branch("body")),
    ("combined_text", Pipeline([
        ("column", TextFromColumns(columns=["title", "body"])),
        ("tfidf", TfidfVectorizer()),
    ])),
]

# NOTE(review): scikit-learn's Pipeline does not cache its final step, and
# "union" is the only step here, so `memory` may have no effect -- confirm.
pipeline = Pipeline(
    steps=[("union", FeatureUnion(transformer_list=feature_branches))],
    memory=str(cache_dir),
)

In [None]:
# Fit the feature pipeline on the training rows and transform them in one go.
X_train = pipeline.fit_transform(df_train)
# Apply the already-fitted pipeline to the test rows; nothing is refitted here.
X_test = pipeline.transform(df_test)

## Build multilayer perceptron

In [None]:
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout

In [None]:
def mlp_model(layers, units, dropout_rate, input_shape, num_classes):
    """Assemble a feed-forward softmax classifier as a Keras Sequential model.

    The network starts with a Dropout layer on the input, followed by
    ``layers - 1`` hidden blocks (ReLU Dense layer + Dropout), and ends with
    a softmax Dense layer. The returned model is not compiled.

    Args:
        layers: Number of Dense layers including the output layer; the
            network gets ``layers - 1`` hidden Dense layers.
        units: Width of every hidden Dense layer.
        dropout_rate: Rate used by the input Dropout and each hidden Dropout.
        input_shape: Shape of one input sample, passed to the first layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The assembled ``models.Sequential`` instance.
    """
    # Dropout is applied directly to the input features.
    stack = [Dropout(rate=dropout_rate, input_shape=input_shape)]

    # Hidden blocks: a ReLU Dense layer, each followed by its own Dropout.
    for _ in range(layers - 1):
        stack.append(Dense(units=units, activation='relu'))
        stack.append(Dropout(rate=dropout_rate))

    # Output layer with one softmax unit per class.
    stack.append(Dense(units=num_classes, activation="softmax"))
    return models.Sequential(stack)

In [None]:
# Two hidden layers of 32 units plus the output layer, dropout rate 0.2.
# NOTE(review): the output layer has 75 units although the reports below only
# score ten labels -- presumably 75 is the total number of label ids; confirm
# and consider deriving it from the data instead of hard-coding it.
model = mlp_model(
    layers=3,
    units=32,
    dropout_rate=0.2,
    input_shape=X_train.shape[1:],
    num_classes=75,
)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Colocations handled automatically by placer.


In [None]:
# Adam with a learning rate of 0.001. The sparse categorical cross-entropy
# loss takes integer class ids as targets rather than one-hot vectors.
optimizer = Adam(lr=0.001)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=optimizer,
    metrics=['acc'],
)

In [None]:
# Train for 15 epochs on the training features; keep the returned History object.
history = model.fit(x=X_train, y=y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15

In [None]:
# Predict a class id for every test row and print per-class precision, recall
# and F1 for the top-ten labels.
y_pred = model.predict_classes(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

             precision    recall  f1-score   support

       earn      0.971     0.989     0.980      1087
        acq      0.951     0.979     0.965       710
   money-fx      0.681     0.855     0.758       145
      grain      0.366     0.357     0.361        42
      crude      0.725     0.902     0.804       164
      trade      0.744     0.826     0.783       109
   interest      0.804     0.701     0.749       117
       ship      0.625     0.493     0.551        71
      wheat      0.648     0.636     0.642        55
       corn      0.405     0.667     0.504        45

avg / total      0.879     0.915     0.895      2545



In [None]:
# Predict a class id for every test row and print per-class precision, recall
# and F1 for the top-ten labels.
y_pred = model.predict_classes(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

             precision    recall  f1-score   support

       earn      0.980     0.987     0.984      1087
        acq      0.940     0.966     0.953       710
   money-fx      0.729     0.779     0.753       145
      grain      0.408     0.476     0.440        42
      crude      0.770     0.835     0.801       164
      trade      0.721     0.853     0.782       109
   interest      0.748     0.761     0.754       117
       ship      0.662     0.690     0.676        71
      wheat      0.750     0.600     0.667        55
       corn      0.660     0.733     0.695        45

avg / total      0.890     0.914     0.901      2545



In [None]:
# Predict a class id for every test row and print per-class precision, recall
# and F1 for the top-ten labels.
y_pred = model.predict_classes(X_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

             precision    recall  f1-score   support

       earn      0.975     0.986     0.981      1087
        acq      0.936     0.968     0.952       710
   money-fx      0.738     0.834     0.783       145
      grain      0.486     0.405     0.442        42
      crude      0.765     0.872     0.815       164
      trade      0.736     0.817     0.774       109
   interest      0.816     0.795     0.805       117
       ship      0.688     0.620     0.652        71
      wheat      0.760     0.691     0.724        55
       corn      0.706     0.800     0.750        45

avg / total      0.894     0.919     0.906      2545



In [None]:
%%time
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=top_ten_names, labels=top_ten_ids, digits=3))

             precision    recall  f1-score   support

       earn      0.975     0.988     0.981      1087
        acq      0.921     0.975     0.947       710
   money-fx      0.753     0.800     0.776       145
      grain      0.515     0.405     0.453        42
      crude      0.761     0.835     0.797       164
      trade      0.738     0.853     0.791       109
   interest      0.767     0.786     0.776       117
       ship      0.629     0.620     0.624        71
      wheat      0.776     0.691     0.731        55
       corn      0.630     0.756     0.687        45

avg / total      0.886     0.918     0.901      2545

CPU times: user 30.1 s, sys: 77.9 ms, total: 30.2 s
Wall time: 30.3 s
