In [1]:
import numpy as np
%pylab inline
%load_ext autoreload
%autoreload 2
%precision 3
from news import NewsH5

Populating the interactive namespace from numpy and matplotlib


## Loading the dataset
The dataset is stored in `HDF5` format, as explained in [`README.md`](/README.MD). We can load a dataset as follows:

In [4]:
## open the 'beefban' dataset; its fields (title, X, y and the split indices) are used in the cells below
beefban = NewsH5('beefban')

In [5]:
## titles: array holding one title string per article
beefban.title

array(['City joins national campaign against lynch mob attacks',
       "India's Got Beef With Beef: What You Need To Know About The Country's Controversial 'Beef Ban'",
       'With China Beef Deal, US Re-Enters Market It Once Dominated', ...,
       'KID.V M O N E Y',
       "How Hindus killed a Muslim over beef and murdered India's secularism",
       "I made a promise to Javed Khan. But Congress alone can't keep that promise"],
      dtype='<U200')

In [8]:
## shape of the feature matrix X
beefban.X.shape
## we have 1543 articles and each is represented by a 300-dim vector (averaged GloVe)

(1543, 300)

### splits
There are 51 precomputed splits (train-test-val: 70-20-10).

In [10]:
## shapes of the train, test and validation index arrays, in that order
tuple(idxs.shape for idxs in (beefban.train_idxs, beefban.test_idxs, beefban.val_idxs))

((51, 1079), (51, 309), (51, 155))

Let's use split 0 for demonstration, merging its validation indices into the training set.

In [11]:
## Build the train/test partition for one of the precomputed splits.
## The validation indices are folded into the training set (validation first,
## then train); the test indices are used as-is.
split = 0
## np.concatenate joins the two index arrays directly instead of
## round-tripping them through Python lists.
idxs_tr = np.concatenate([beefban.val_idxs[split], beefban.train_idxs[split]])
idxs_te = beefban.test_idxs[split]
## select the matching feature rows and labels
X_tr, y_tr = beefban.X[idxs_tr], beefban.y[idxs_tr]
X_te, y_te = beefban.X[idxs_te], beefban.y[idxs_te]
## sanity check: row counts of features and labels must agree per partition
X_tr.shape, X_te.shape, y_tr.shape, y_te.shape

((1234, 300), (309, 300), (1234,), (309,))

## Using the summariser models

In [9]:
## NOTE(review): wildcard import -- `get_model`, `make_scorer` and `GridSearchCV` are used in
## later cells with no other visible import, so presumably they all come from `models`.
## Consider importing these names explicitly once that is confirmed.
from models import *

In [16]:
## Kmeans summariser
m = 4
kmeans = get_model("kmeans", mk = m, verbose = True)
kmeans.fit(X_tr, y_tr)
beefban.title[idxs_tr[kmeans.idxs]]

fitting KMeansSumm(gamma=0.01, gamma2=0.1, C=10.0, lambda=0.1, seed=0, m=4)
fitted KMeans: seed = 0,  Counter({0: 4, 1: 4})


array(['No need to ban beef imports from Brazil - Samuda',
       'Athawale supports right to eat beef, warns cow vigilantes',
       'Beef ban in Goa predates BJP rule: Amit Shah',
       'Maha: Maharashtra beef ban law to be tested in light of privacy verdict: SC',
       'China lifts ban on UK beef in move worth £250m in first five years -',
       'No ban on cow slaughter, BJP says in poll-bound Meghalaya',
       'Yogi Adityanath: If Karnataka CM is Hindu, let him ban beef',
       'British beef back on menu after China lifts BSE ban'],
      dtype='<U200')

In [19]:
## MMD-diff-grad summariser
m = 4
mmd_diff_grad = get_model("mmd-diff-grad", mk = m, gamma = 0.1, gamma2=0.1, lambdaa=0.1, verbose = True)
mmd_diff_grad.fit(X_tr, y_tr)
beefban.title[idxs_tr[mmd_diff_grad.idxs]]

fitting MMDGradSumm(gamma=0.1, gamma2=0.1, C=10.0, lambda=0.1, seed=0, m=4) data
data 0.1 0.1 10.0
fitted MMD-Grad- data Counter({0: 4, 1: 4}) [898, 494, 944, 659, 1083, 187, 42, 212]


array(['Operation Gautankvad: Terrifying truth about cow smugglers : India First: India Today',
       'Lynchings, beef-ban not to figure in talks with Shah: Goa BJP',
       'Supreme Court observes Right to Privacy may affect beef ban in Maharashtra',
       'China lifts ban on imported Australian beef',
       'China lifts ban on exports of beef from UK',
       'Yogi Adityanath: If Karnataka CM is Hindu, let him ban beef',
       'Where in India can you get beef?',
       'Read what VD Savarkar wrote: Care for cows, do not worship them'],
      dtype='<U200')

In [20]:
## MMD-div-greedy summariser
m = 4
mmd_diff_grad = get_model("mmd-div-greedy", mk = m, gamma = 0.1, gamma2=0.1, lambdaa=0.1, verbose = True)
mmd_diff_grad.fit(X_tr, y_tr)
beefban.title[idxs_tr[mmd_diff_grad.idxs]]

fitting MMDGreedySumm(gamma=0.1, gamma2=0.1, C=10.0, lambda=0.1, seed=0, m=4)
Kxs 0.1 0.1 10.0
fitted MMD-Greedy- Kxs Counter({0: 4, 1: 4})


array(['Cow vigilantism: After beef ban, meat-seller takes to vending vegetables',
       "India's top court suspends ban on trade in cattle for slaughter",
       "The liberals flaunting 'Not In My Name' placards got it wrong",
       "US ban Brazil's beef imports until food safety addressed",
       '£250m boost for British farmers as China lifts decades-old BSE ban',
       'BJP to stay away from imposing beef ban in Northeast, in other parts it might',
       'Three years after beef ban, slaughter of buffaloes at all-time high in Maharashtra - Nagpur Today : Nagpur News',
       'Jack White vs. the Black Keys: A Beef History'], dtype='<U200')

## Tuning hyperparameters

Let's see how we can tune the hyperparameters of `mmd-diff-greedy` with a cross-validated grid search.

In [21]:
from utils import bacc

## search grid: one list of candidate values per tunable parameter
## (5 * 4 * 4 = 80 combinations in total)
Gammas = [0.05, 0.1, 0.2, 0.4, 0.8]   ## gamma of rbf kernel
lambdas = [0.01, 0.03, 0.1, 0.3]      ## lambda term in model
Cs = [0.01, 0.1, 1.0, 10.0]           ## SVM C term
## keys are the keyword names handed to the model during the search
params = dict(C = Cs, gamma = Gammas, lambdaa = lambdas)
## wrap bacc so it can be passed as the `scoring` argument of the grid search
scorer = make_scorer(bacc)

In [29]:
## Grid search over every combination in `params`, using 3-fold cross-validation on the
## training partition, 4 parallel workers, and `scorer` as the selection metric.
## `m` is the value assigned in the summariser cells above.
## NOTE(review): the model name "mmd-diff-greedy" differs from both names used earlier
## ("mmd-diff-grad", "mmd-div-greedy") -- confirm it is the intended model.
## NOTE(review): error_score = 0.0 records a failed fit as a score of 0.0 instead of raising,
## so fit errors stay hidden. In the logged run the single model line printed carries the
## first value of each grid list -- confirm the cross-validation fits actually succeed.
clf = GridSearchCV(get_model("mmd-diff-greedy", mk = m, verbose = True), params, 
                   cv = 3, n_jobs = 4, verbose = 1, scoring = scorer, return_train_score = False, error_score = 0.0)
clf.fit(X_tr, y_tr)
## estimator configured with the best-scoring parameter combination
model = clf.best_estimator_

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


data 0.05 0.01 0.01


[Parallel(n_jobs=4)]: Done 240 out of 240 | elapsed:    1.7s finished


In [30]:
## scoring on test-set
## NOTE(review): this calls the estimator's own `score` method; whether it computes the same
## metric as `scorer` (used for model selection) is not visible here. `clf.score(X_te, y_te)`
## would apply `scorer` explicitly -- confirm which metric is intended.
model.score(X_te, y_te)

0.6044066227889757

In [31]:
## presumably model.idxs are row positions within X_tr (TODO confirm); idxs_tr maps them
## back to dataset indices
protos = idxs_tr[model.idxs]
## titles of the articles selected by the tuned model
beefban.title[protos]

array(['Cow vigilantism: After beef ban, meat-seller takes to vending vegetables',
       "India's top court suspends ban on trade in cattle for slaughter",
       'After 14 years, U.S. beef hits Chinese market',
       'Maha beef ban law to be tested after privacy ruling',
       '£250m boost for British farmers as China lifts decades-old BSE ban',
       'BJP to stay away from imposing beef ban in Northeast, in other parts it might',
       'Three years after beef ban, slaughter of buffaloes at all-time high in Maharashtra - Nagpur Today : Nagpur News',
       'British beef back on menu after China lifts BSE ban'],
      dtype='<U200')