In [36]:
import pandas as pd
import numpy as np

from news_vec.corpus import Corpus

from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

In [2]:
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-26 00:37:31,390 | INFO : Reading links.
1225511it [00:03, 341863.14it/s]
2018-12-26 00:37:36,942 | INFO : Reading headlines.
1127502it [00:21, 52666.14it/s]


In [39]:
ds = corpus.sample_all_vs_all()

In [40]:
rows = []

for split in ('train', 'test'):
    for hl, domain in tqdm(getattr(ds, split)):
        rows.append((hl['clf_tokens'], domain, split))

100%|██████████| 373568/373568 [00:02<00:00, 178305.05it/s]
100%|██████████| 46696/46696 [00:00<00:00, 194836.08it/s]


In [41]:
df = pd.DataFrame(rows, columns=('tokens', 'domain', 'split'))

In [42]:
tv = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    ngram_range=(1,3)
)

In [43]:
X_train = tv.fit_transform(df[df.split=='train'].tokens)
X_test = tv.transform(df[df.split=='test'].tokens)

In [44]:
y_train = df[df.split=='train'].domain
y_test = df[df.split=='test'].domain

In [45]:
clf = LinearSVC(verbose=True)

In [46]:
fit = clf.fit(X_train, y_train)

[LibLinear]

In [47]:
y_test_pred = fit.predict(X_test)

In [48]:
metrics.accuracy_score(y_test, y_test_pred)

0.3946804865513106

In [49]:
X = tv.fit_transform(df.tokens)

In [50]:
scores = cross_val_score(clf, X, df.domain, cv=10, n_jobs=-1, verbose=True)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  6.6min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.2min finished


In [51]:
scores

array([0.40177   , 0.39759237, 0.40058055, 0.40018559, 0.39712566,
       0.40103745, 0.39891491, 0.40431203, 0.40127555, 0.40295098])