In [1]:
import pandas as pd
import numpy as np

from news_vec.corpus import Corpus

from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-25 00:25:18,384 | INFO : Reading links.
1225511it [00:06, 203771.88it/s]
2018-12-25 00:25:27,741 | INFO : Reading headlines.
1127502it [00:32, 34874.18it/s]


In [68]:
dsf = corpus.sample_a_vs_b('apnews.com', 'buzzfeed.com')

In [69]:
ds = dsf.skim(6000)

In [70]:
ds

HeadlineDataset<4800/600/600>

In [71]:
rows = []

for hl, domain in tqdm(ds.train):
    rows.append((hl['clf_tokens'], domain, 'train'))
    
for hl, domain in tqdm(ds.val):
    rows.append((hl['clf_tokens'], domain, 'val'))
    
for hl, domain in tqdm(ds.test):
    rows.append((hl['clf_tokens'], domain, 'test'))

100%|██████████| 4800/4800 [00:00<00:00, 40880.90it/s]
100%|██████████| 600/600 [00:00<00:00, 41485.32it/s]
100%|██████████| 600/600 [00:00<00:00, 30993.16it/s]


In [72]:
df = pd.DataFrame(rows, columns=('tokens', 'domain', 'split'))

In [73]:
tv = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    ngram_range=(1,3)
)

In [74]:
X_train = tv.fit_transform(df[df.split=='train']['tokens'])
X_val = tv.transform(df[df.split=='val']['tokens'])
X_test = tv.transform(df[df.split=='test']['tokens'])

In [75]:
y_train = df[df.split=='train']['domain']
y_val = df[df.split=='val']['domain']
y_test = df[df.split=='test']['domain']

In [76]:
feature_names = tv.get_feature_names()

In [77]:
for d in df.domain.unique():
    scores, _ = chi2(X_train, df[df.split=='train']['domain'] == d)
    idx = np.argsort(scores)
    names = np.array(feature_names)[idx][-20:]
    print(d, names)

apnews.com ['just' 'it' 'these' 'here' 'and we ll' 'is' 'we ll' 'people' 'and we'
 'which' 'we' 'll' 'your' 'that' 'this' 'and' 'a' 'are' 'the' 'you']
buzzfeed.com ['just' 'it' 'these' 'here' 'and we ll' 'is' 'we ll' 'people' 'and we'
 'which' 'we' 'll' 'your' 'that' 'this' 'and' 'a' 'are' 'the' 'you']


In [78]:
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1, verbose=True)
fit = lr.fit(X_train, df[df.split=='train']['domain'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.3s finished


In [79]:
y_test_pred = fit.predict(X_test)

In [80]:
metrics.accuracy_score(y_test, y_test_pred)

0.8883333333333333