In [2]:
import pandas as pd
import numpy as np

from news_vec.corpus import Corpus

from tqdm import tqdm
from boltons.iterutils import windowed
from collections import Counter

from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [3]:
corpus = Corpus('../data/clf-links.json/', '../data/clf-headlines.json/')

2018-12-26 00:19:21,003 | INFO : Reading links.
1225511it [00:03, 344737.84it/s]
2018-12-26 00:19:26,470 | INFO : Reading headlines.
1127502it [00:20, 54252.23it/s]


In [4]:
dsf = corpus.sample_all_vs_all()

In [29]:
ds = dsf

In [30]:
ds

HeadlineDataset<373568/46696/46696>

In [31]:
rows = []

for split in ('train', 'test'):
    for hl, domain in tqdm(getattr(ds, split)):
        rows.append((hl['clf_tokens'], domain, split))

100%|██████████| 373568/373568 [00:02<00:00, 154495.13it/s]
100%|██████████| 46696/46696 [00:00<00:00, 169520.07it/s]


In [32]:
df = pd.DataFrame(rows, columns=('tokens', 'domain', 'split'))

In [33]:
tv = TfidfVectorizer(
    analyzer='word',
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    ngram_range=(1,3)
)

In [34]:
# clf_lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
clf_lr = LogisticRegression()

In [35]:
clf_mnb = MultinomialNB()

In [36]:
clf_svc = LinearSVC()

In [37]:
X = tv.fit_transform(df.tokens)

In [38]:
for clf in (clf_mnb, clf_svc, clf_lr):
    print(clf)
    scores = cross_val_score(clf, X, df.domain, cv=5, n_jobs=-1, verbose=True)
    print(np.mean(scores))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   21.1s remaining:   31.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.8s finished


0.3551172762726898
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  3.9min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.9min finished


0.3923890599681962
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.2min remaining: 10.8min


KeyboardInterrupt: 