In [27]:
from collections import Counter

import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from tqdm import tqdm

from news_vec.corpus import Corpus

In [2]:
# Load the corpus from the links and headlines JSON directories (paths are
# relative to this notebook); the log below shows links are read first, then headlines.
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-25 22:05:56,848 | INFO : Reading links.
1225511it [00:03, 346304.00it/s]
2018-12-25 22:06:02,253 | INFO : Reading headlines.
1127502it [00:21, 52856.69it/s]


In [3]:
# NOTE(review): presumably builds the full headline dataset covering every
# domain against every other -- confirm against news_vec.corpus.
dsf = corpus.sample_all_vs_all()

In [4]:
# Take a reduced dataset for fast iteration; its repr reports 8000/1000/1000
# (presumably train/val/test sizes -- confirm against HeadlineDataset).
ds = dsf.skim(10000)

In [5]:
# Show the dataset repr to check the split sizes.
ds

HeadlineDataset<8000/1000/1000>

In [6]:
# Flatten the train and test splits into (tokens, domain, split) records.
# One tqdm bar is created per split, in the order train -> test.
rows = [
    (headline['clf_tokens'], label, split_name)
    for split_name in ('train', 'test')
    for headline, label in tqdm(getattr(ds, split_name))
]

100%|██████████| 8000/8000 [00:00<00:00, 171393.71it/s]
100%|██████████| 1000/1000 [00:00<00:00, 123700.24it/s]


In [38]:
# One row per headline: its token list, its source domain, and the split it belongs to.
column_names = ['tokens', 'domain', 'split']
df = pd.DataFrame(data=rows, columns=column_names)

In [39]:
def _passthrough(value):
    """Return the input unchanged."""
    return value


# The documents are fed in as token sequences, so both the preprocessor and the
# tokenizer are identity functions and the regex token pattern is disabled.
# Features are word 1- to 3-grams built from the supplied tokens.
tv = TfidfVectorizer(
    ngram_range=(1, 3),
    analyzer='word',
    token_pattern=None,
    preprocessor=_passthrough,
    tokenizer=_passthrough,
)

In [40]:
# Fit the vectorizer on the training rows only, then apply the fitted
# vocabulary/weights to the test rows.
train_tokens = df.loc[df['split'] == 'train', 'tokens']
test_tokens = df.loc[df['split'] == 'test', 'tokens']

X_train = tv.fit_transform(train_tokens)
X_test = tv.transform(test_tokens)

In [41]:
# Domain labels, selected with the same split masks used for the feature matrices.
y_train = df.loc[df['split'] == 'train', 'domain']
y_test = df.loc[df['split'] == 'test', 'domain']

In [42]:
# Vocabulary terms ordered by feature (column) index of the tf-idf matrix.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the replacement when it exists and fall back on
# older releases. The result is kept as a plain list either way.
feature_names = list(
    tv.get_feature_names_out()
    if hasattr(tv, 'get_feature_names_out')
    else tv.get_feature_names()
)

In [43]:
# For each domain, score every n-gram with a one-vs-rest chi-squared test on the
# training split and print the 20 highest-scoring terms (ascending, best last).
# The feature-name array and the training labels do not change between
# iterations, so they are prepared once instead of being rebuilt per domain.
feature_array = np.array(feature_names)

for d in df.domain.unique():
    scores, _ = chi2(X_train, y_train == d)
    # argsort is ascending, so the last 20 indices are the top-scoring features.
    top_idx = np.argsort(scores)[-20:]
    names = feature_array[top_idx]
    print(d, names)

cnn.com ['olivia newton' 'newton' 'review of' 'weighing' 'vets the claims'
 'check team vets' 'check team' 'reality check' 'reality check team'
 's reality' 'team vets' 'the claims' 's reality check' 'vets the'
 'team vets the' 'cnn s reality' 's most' 'the most dangerous'
 'does trump' 'world s most']
dailykos.com ['live digest' 'elections live' 'elections live digest' 'digest' 'et'
 'daily kos radio' 'is live at' 'radio is live' 'radio is' 'kos radio is'
 'kos radio' 'is live' 'and #' 'undocumented immigrants' 'kos elections'
 'daily kos elections' 'republicans' 'kos' 'daily kos' 'trumpcare']
sputniknews.com ['swedes' 'syria s' 'videos infographics' 'photos videos'
 'photos videos infographics' 'radio photos videos' 'infographics'
 'radio photos' 'mediterranean' 'daesh terrorists' 'de' 'tehran' 'russia'
 'us' '#mln' '$ #mln' 'from syria' 'in syria' 'syria' 'daesh']
thehill.com ['not trump' 'oppose gop' 'do anything' 'replacement' 'blue dogs'
 'new era of' 'new era' 'a new era' 'senat

In [44]:
# Logistic-regression classifier predicting the source domain from the tf-idf
# n-gram features.
lr = LogisticRegression(solver='lbfgs', multi_class='auto', n_jobs=-1, verbose=True)
# Train on the already-prepared y_train (same rows as X_train) rather than
# re-deriving the labels from df, matching how y_test is used for evaluation.
fit = lr.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   13.0s finished


In [45]:
# Predict a domain label for every held-out test headline.
y_test_pred = fit.predict(X_test)

In [46]:
# Fraction of test headlines whose predicted domain matches the true one.
# The original call went through `metrics`, which is never imported in this
# notebook and raises NameError on a fresh kernel.
accuracy_score(y_test, y_test_pred)

0.226