In [None]:
# Ask the inline plotting backend to emit figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Load the autoreload extension so edited modules can be re-imported.
%load_ext autoreload

# NOTE(review): mode 1 presumably reloads only modules registered with
# %aimport, and no %aimport is visible in this notebook — confirm intent.
%autoreload 1

In [None]:
import numpy as np
import pandas as pd

import pickle

from pathlib import Path

# Load data

In [None]:
# Locations of the pickled reuters21578 corpus on disk.
data_root = Path.home() / "data" / "tmp"
reuters_dir = data_root / "reuters21578"
reuters_corpus_path = reuters_dir / "corpus.pkl"

# NOTE: unpickling can execute arbitrary code; only load a corpus file that
# was produced locally by trusted code.
# The context manager guarantees the file handle is closed once the corpus
# object has been read.
with open(reuters_corpus_path, "rb") as corpus_file:
    reuters = pickle.load(corpus_file)

# Ids and display names returned by the corpus' top_n helper for n=10.
top_ten_ids, top_ten_names = reuters.top_n(n=10)

cache_dir = reuters_dir / "cache"

# Build dataframe

In [None]:
# Split the corpus with its split_modapte() helper, keep the concatenated
# document list, and fetch labels for each split (the top-ten ids are
# passed to get_labels as a set).
train_docs, test_docs = reuters.split_modapte()
docs = train_docs + test_docs
train_labels, test_labels = (
    reuters.get_labels(subset, set(top_ten_ids))
    for subset in (train_docs, test_docs)
)

### Remove gaps in the label ids

In [None]:
# Re-number the labels that actually occur so they become 0, 1, 2, ...
# in ascending order of their original value.
labels = train_labels + test_labels
label_lookup = {
    original: dense for dense, original in enumerate(sorted(set(labels)))
}
num = len(label_lookup)  # number of distinct labels seen

# Apply the same mapping to every label container used downstream.
labels = [label_lookup[original] for original in labels]
train_labels = [label_lookup[original] for original in train_labels]
test_labels = [label_lookup[original] for original in test_labels]
top_ten_ids = [label_lookup[original] for original in top_ten_ids]

In [None]:
# One row per document; train documents first, then test documents, so the
# concatenated label lists line up with `docs`.
df = pd.DataFrame({
    "modapte": [doc["modapte"] for doc in docs],
    "label": train_labels + test_labels,
    "date": [doc["date"] for doc in docs],
    "title": [doc["title"] for doc in docs],
    "dateline": [doc["dateline"] for doc in docs],
    "body": [doc["body"] for doc in docs],
})

# Keep only the part of the raw date string before the first ".", strip
# leading whitespace, then parse it with an explicit format.
df["date"] = pd.to_datetime(
    df["date"].str.split(".").apply(lambda parts: parts[0].lstrip()),
    format="%d-%b-%Y %H:%M:%S",
)

In [None]:
# Day-of-week name (e.g. "Wednesday") of each document's timestamp.
# `Series.dt.day_name()` is used because the `weekday_name` attribute is not
# available in pandas >= 1.0.
df["wd_name"] = df.date.dt.day_name()

In [None]:
# Preview the first rows of the assembled frame.
df.head()

Unnamed: 0,modapte,label,date,title,dateline,body,wd_name
0,train,0,1987-03-11 18:14:49,U.S. ECONOMIC DATA KEY TO DEBT FUTURES OUTLOOK,"CHICAGO, March 11 -",U.S. economic data this week could be\nthe key...,Wednesday
1,train,3,1987-03-11 18:36:05,BANK OF BRITISH COLUMBIA 1ST QTR JAN 31 NET,"VANCOUVER, British Columbia, March 11 -\n",Oper shr loss two cts vs profit three cts\n ...,Wednesday
2,train,3,1987-03-11 18:38:02,RESTAURANT ASSOCIATES INC <RA> 4TH QTR JAN 3,"NEW YORK, March 11 -\n",Shr 25 cts vs 36 cts\n Net 1.4 mln vs 1.4 m...,Wednesday
3,train,3,1987-03-11 18:41:59,MICHIGAN GENERAL CORP <MGL> 4TH QTR,"SADDLE BROOK, N.J., March 11 -\n",Shr loss 1.02 dlrs vs 1.01 dlr\n Net loss 1...,Wednesday
4,train,4,1987-03-11 18:45:36,"USX <X> PROVED OIL, GAS RESERVES FALL IN 1986","NEW YORK, March 11 -",USX Corp said proved reserves of oil\nand natu...,Wednesday


In [None]:
# (rows, columns) of the full frame, train and test documents together.
df.shape

(10789, 7)

# Build feature extraction pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
class EmptyFitMixin:
    """Mixin supplying a `fit` that learns nothing.

    Intended for transformers in this notebook that have no state to
    estimate from the training data.
    """

    def fit(self, x, y=None):
        """Ignore `x` and `y` and return the instance itself."""
        return self

In [None]:
class TextFromColumns(EmptyFitMixin, BaseEstimator, TransformerMixin):
    """Join several text columns of a dataframe into one string per row.

    Parameters
    ----------
    columns : sequence of str, default ("title", "body")
        Names of the columns whose values are concatenated, in this order,
        separated by a single space.

    Notes
    -----
    The constructor argument is stored under its own name (`columns`) so
    that scikit-learn's `get_params` / `set_params` / `clone` can find it.
    `text_cols` is kept as an alias for code that reads or sets it.
    """

    def __init__(self, columns=("title", "body")):
        # Stored untouched: scikit-learn expects __init__ to only assign.
        self.columns = columns

    @property
    def text_cols(self):
        """Alias for `columns`, returned as a list."""
        return list(self.columns)

    @text_cols.setter
    def text_cols(self, value):
        self.columns = value

    def transform(self, df):
        """Return a Series, indexed like `df`, of the joined column texts.

        Missing cells (None / NaN) contribute an empty string, so a missing
        value in one column never affects the text taken from the others.
        """
        # Cast to object first so "" can replace missing values in any column.
        frame = df[list(self.columns)].astype(object)
        frame = frame.where(frame.notna(), "")
        texts = [
            " ".join(str(value) for value in row)
            for row in frame.itertuples(index=False, name=None)
        ]
        return pd.Series(texts, index=df.index, dtype=object)

In [None]:
class ColumnSelector(EmptyFitMixin, BaseEstimator, TransformerMixin):
    """Pick a single column out of a dataframe.

    When `filter_none` is true, cells that are exactly None are replaced by
    an empty string; every other value passes through untouched.
    """

    def __init__(self, column, filter_none=True):
        self.column = column
        self.filter_none = filter_none

    def transform(self, df):
        """Return `df[self.column]`, optionally with None cells blanked."""
        selected = df[self.column]
        if not self.filter_none:
            return selected

        def blank_if_none(value):
            return "" if value is None else value

        return selected.apply(blank_if_none)

In [None]:
class FilterNone(EmptyFitMixin, BaseEstimator, TransformerMixin):
    """Return the raw values of one column of the frame given to `transform`.

    Despite its name, no filtering of None values is performed here.

    Parameters
    ----------
    column : hashable
        Label of the column to extract.
    """

    def __init__(self, column):
        self.column = column

    def transform(self, data):
        """Return `data[self.column].values`."""
        # Index the argument, not a notebook-level frame, so the transformer
        # operates on whichever data (train, test, ...) it is handed.
        return data[self.column].values

In [None]:
class TextStats(BaseEstimator, EmptyFitMixin, TransformerMixin):
    """Extract simple size statistics from each document.

    `transform` expects a pandas Series of strings and returns a float64
    array with one row per document and four columns:
    character count, newline count, period count and whitespace-separated
    word count. Missing documents yield zeros.
    """

    def transform(self, col):
        """Return the (n_documents, 4) float64 feature matrix for `col`."""
        tc = col.str
        features = [
            tc.len(),  # character count
            tc.count("\n"),  # line count
            tc.count(r"\."),  # sentence count (raw string: regex for a literal dot)
            # word count; a missing document does not split into a list
            tc.split().apply(lambda x: len(x) if isinstance(x, list) else 0),
        ]
        # Cast each statistic to float64 before stacking so NaN detection
        # works whatever dtype pandas chose for the individual columns.
        features = np.column_stack(
            [np.asarray(f, dtype=np.float64) for f in features]
        )
        features[np.isnan(features)] = 0.0
        return features

In [None]:
# Partition the frame on its `modapte` flag and pull out the label arrays.
df_train = df[df["modapte"] == "train"]
df_test = df[df["modapte"] == "test"]
y_train, y_test = df_train["label"].values, df_test["label"].values

## The actual pipeline

In [None]:
def _text_stats_branch(column):
    """Build the select -> stats -> scale sub-pipeline for one text column."""
    return Pipeline([
        ("column", ColumnSelector(column)),
        ("stats", TextStats()),
        ("scaled", StandardScaler()),
    ])


# Three feature branches are computed side by side and concatenated:
# scaled size statistics of the title, the same for the body, and TF-IDF
# over the joined title/body/weekday text. A linear SVM classifies the result.
pipeline = Pipeline(steps=[
    ("union", FeatureUnion(
        n_jobs=4,
        transformer_list=[
            ("title_stats", _text_stats_branch("title")),
            ("body_stats", _text_stats_branch("body")),
            ("combined_text", Pipeline([
                ("column", TextFromColumns(columns=["title", "body", "wd_name"])),
                ("tfidf", TfidfVectorizer()),
            ])),
        ],
    )),
    ("clf", LinearSVC()),
])

In [None]:
# Train on the training split, predict the test split, and print a
# per-class report restricted to the top-ten label ids.
y_pred = pipeline.fit(df_train, y_train).predict(df_test)
print(
    classification_report(
        y_test,
        y_pred,
        labels=top_ten_ids,
        target_names=top_ten_names,
        digits=3,
    )
)

             precision    recall  f1-score   support

       earn      0.979     0.989     0.984      1087
        acq      0.935     0.977     0.956       710
   money-fx      0.734     0.800     0.766       145
      grain      0.657     0.548     0.597        42
      crude      0.802     0.890     0.844       164
      trade      0.744     0.853     0.795       109
   interest      0.822     0.752     0.786       117
       ship      0.698     0.620     0.657        71
      wheat      0.745     0.745     0.745        55
       corn      0.646     0.689     0.667        45

avg / total      0.900     0.924     0.911      2545

