In [12]:
import json
import keras.layers as layers
import numpy as np
import pandas as pd
import spacy
from gensim.corpora import Dictionary
from keras.models import Model
from keras.preprocessing import sequence
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from spacy.util import minibatch

import en_core_web_sm

In [13]:
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_sm')
data = pd.read_json('../input/news-category/News_Category_Dataset_v2.json', lines=True)

In [14]:
# WORLDPOST and THE WORLDPOST were given as two separate categories in the dataset. Here I change the category THE WORLDPOST to WORLDPOST 
data.category = data.category.map(lambda x: "WORLDPOST" if x == "THE WORLDPOST" else x)

In [15]:
categories = data.groupby('category').size().sort_values(ascending=False)
categories

category
POLITICS          32739
WELLNESS          17827
ENTERTAINMENT     16058
TRAVEL             9887
STYLE & BEAUTY     9649
PARENTING          8677
HEALTHY LIVING     6694
QUEER VOICES       6314
WORLDPOST          6243
FOOD & DRINK       6226
BUSINESS           5937
COMEDY             5175
SPORTS             4884
BLACK VOICES       4528
HOME & LIVING      4195
PARENTS            3955
WEDDINGS           3651
WOMEN              3490
IMPACT             3459
DIVORCE            3426
CRIME              3405
MEDIA              2815
WEIRD NEWS         2670
GREEN              2622
RELIGION           2556
STYLE              2254
SCIENCE            2178
WORLD NEWS         2177
TASTE              2096
TECH               2082
MONEY              1707
ARTS               1509
FIFTY              1401
GOOD NEWS          1398
ARTS & CULTURE     1339
ENVIRONMENT        1323
COLLEGE            1144
LATINO VOICES      1129
CULTURE & ARTS     1030
EDUCATION          1004
dtype: int64

In [16]:
TOP_N_CATEGORIES = 15
data = data[data.category.apply(lambda x: x in categories.index[:TOP_N_CATEGORIES]) &\
            (data.headline.apply(len) > 0)]
data_train, data_test = train_test_split(data, test_size=.1, random_state=31)

# SpaCy Baseline

In [17]:
# check if `textcat` is already in the pipe, add if not
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe('textcat')

# add labels to the model    
for label in categories.index[:TOP_N_CATEGORIES]:
    textcat.add_label(label)

# preprocess training data
data_train_spacy = list(
    zip(data_train.headline,
        data_train.category.apply(
            lambda cat: {'cats': {c: float(c == cat)
                                  for c in categories.index[:TOP_N_CATEGORIES]}}))
)

# train the model
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(5):
        print('Epoch %d' % i)
        losses = {}
        batches = minibatch(data_train_spacy, size=128)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        with textcat.model.use_params(optimizer.averages):
            docs = [nlp.tokenizer(h) for h in data_test.headline]
            test_pred = np.array(
                [sorted(doc.cats.items(), key=lambda x: -x[1])[0][0]
                 for doc in textcat.pipe(docs)])
            print('Test Acc: %.4f' %
                  (pd.Series(test_pred == data_test.category.values).sum() / data_test.shape[0]))

Epoch 0
Test Acc: 0.5236
Epoch 1
Test Acc: 0.6598
Epoch 2
Test Acc: 0.6979
Epoch 3
Test Acc: 0.7119
Epoch 4
Test Acc: 0.7211


In [18]:
spacy_y_pred = [sorted(doc.cats.items(), key=lambda x: -x[1])[0][0]
                for doc in nlp.pipe(data_test.headline)]
print(classification_report(data_test.category, spacy_y_pred))

                precision    recall  f1-score   support

  BLACK VOICES       0.67      0.39      0.49       439
      BUSINESS       0.61      0.47      0.53       606
        COMEDY       0.76      0.39      0.51       547
 ENTERTAINMENT       0.70      0.80      0.75      1635
  FOOD & DRINK       0.75      0.73      0.74       646
HEALTHY LIVING       0.41      0.20      0.27       664
 HOME & LIVING       0.77      0.70      0.73       422
     PARENTING       0.65      0.70      0.67       822
      POLITICS       0.82      0.86      0.84      3303
  QUEER VOICES       0.84      0.65      0.73       624
        SPORTS       0.68      0.75      0.71       465
STYLE & BEAUTY       0.81      0.79      0.80       944
        TRAVEL       0.78      0.77      0.78      1038
      WELLNESS       0.58      0.81      0.68      1717
     WORLDPOST       0.74      0.73      0.73       631

      accuracy                           0.72     14503
     macro avg       0.70      0.65      0.66 

In [19]:
'''
# SERIALIZE
config = nlp.config
bytes_data = nlp.to_bytes()

# https://spacy.io/usage/saving-loading
# nlp.to_disk("/path")
'''

AttributeError: 'English' object has no attribute 'config'

In [None]:
'''
# DESERIALIZE

lang_cls = spacy.util.get_lang_class(config["nlp"]["lang"])
nlp = lang_cls.from_config(config)
nlp.from_bytes(bytes_data)

# nlp.from_disk("/path")
'''

In [20]:
# Saving trained pipeline
nlp.to_disk("./en_our_pipeline")

In [None]:
# Loading the custom pipeline
nlp = spacy.load("/path/to/pipeline")

In [25]:
doc = nlp.tokenizer('The Paris Saint-Germain midfielder Adrien Rabio will join Barcelona on a free transfer in the summer.\nThe 23-year-old’s contract runs out in the summer and he has agreed a signing on fee of €10m (£9m) as well as a salary of just over £170,000 a week for five years with the La Liga leaders.')
doc

The Paris Saint-Germain midfielder Adrien Rabio will join Barcelona on a free transfer in the summer.
The 23-year-old’s contract runs out in the summer and he has agreed a signing on fee of €10m (£9m) as well as a salary of just over £170,000 a week for five years with the La Liga leaders.

In [32]:
doc = nlp.tokenizer("Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song")
doc

Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song

In [33]:
textcat.pipe(doc)

<generator at 0x7f99ed4eceb0>

In [34]:
doc.cats

{}

In [30]:
data.headline.iloc[0]

"Will Smith Joins Diplo And Nicky Jam For The 2018 World Cup's Official Song"