In [23]:
import pandas as pd
import numpy as np

from itertools import combinations
from scipy import stats
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

from news_vec.corpus import HeadlineDataset, Corpus
from news_vec.encoder import read_preds

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names, so pick whichever spelling this
# installation actually ships instead of hard-coding one of them.
for _style_name in ('seaborn-muted', 'seaborn-v0_8-muted'):
    if _style_name in mpl.style.available:
        mpl.style.use(_style_name)
        break

# seaborn's own theme is applied last, so its whitegrid settings take precedence.
sns.set(style="whitegrid")

In [18]:
# Load the headline corpus from the dataset directory. The path is relative,
# so the notebook must be run from its own folder.
corpus = Corpus('../data/clf-articles.json/')

2019-01-06 15:49:40,657 | INFO : Reading headlines.
1081790it [00:16, 64064.91it/s]


In [21]:
# Sample headlines for the two outlets being compared (HuffPost vs. WSJ).
# NOTE(review): presumably sample_ab returns a class-balanced frame — confirm in news_vec.corpus.
df = corpus.sample_ab('huffingtonpost.com', 'wsj.com')

In [24]:
# Hold out a test split (scikit-learn's default test fraction). The shuffle is
# seeded so the split — and every metric computed below — is reproducible
# across "Restart & Run All".
train_df, test_df = train_test_split(df, random_state=42)

In [25]:
# Training inputs: the `clf_tokens` column; targets: the source domain.
X_train = train_df.clf_tokens
y_train = train_df.domain

In [26]:
# Held-out inputs and targets, taken from the same two columns as the training set.
X_test = test_df.clf_tokens
y_test = test_df.domain

In [27]:
def _passthrough(value):
    """Return the argument unchanged."""
    return value


# TF-IDF over word 1- to 3-grams. Both the preprocessor and the tokenizer are
# no-ops, so each document is consumed exactly as supplied.
# NOTE(review): this presumes `clf_tokens` already holds token sequences — confirm.
tv = TfidfVectorizer(
    analyzer='word',
    preprocessor=_passthrough,
    tokenizer=_passthrough,
    token_pattern=None,
    ngram_range=(1, 3),
)

In [28]:
# Learn the vocabulary and IDF weights on the training documents only, then
# project the held-out documents with that same fitted vocabulary.
#
# The raw documents are read from train_df / test_df rather than from
# X_train / X_test: those names are rebound to sparse matrices here, so
# reading from them would make this cell fail (or silently misbehave) when
# it is executed a second time.
X_train = tv.fit_transform(train_df.clf_tokens)
X_test = tv.transform(test_df.clf_tokens)

In [29]:
# Train a linear SVM with default hyper-parameters on the TF-IDF features.
# `fit()` returns the estimator itself, so `fit` and `clf` refer to the same
# fitted model.
fit = LinearSVC().fit(X_train, y_train)
clf = fit

In [30]:
# Predict a domain for every held-out document, then score the fraction of
# predictions that match the true labels.
y_test_pred = fit.predict(X_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [31]:
# Display the held-out accuracy of the linear SVM.
acc

0.7991280306252658

In [32]:
# Inspect the predicted domain labels (NumPy abbreviates the repr of long arrays).
y_test_pred

array(['huffingtonpost.com', 'wsj.com', 'wsj.com', ..., 'wsj.com',
       'wsj.com', 'wsj.com'], dtype=object)

In [33]:
# Inspect the true domain labels of the held-out rows for comparison with the predictions.
y_test

domain                     
huffingtonpost.com  493410     huffingtonpost.com
wsj.com             345545                wsj.com
                    340010                wsj.com
huffingtonpost.com  1003667    huffingtonpost.com
wsj.com             1042550               wsj.com
huffingtonpost.com  772738     huffingtonpost.com
wsj.com             556049                wsj.com
huffingtonpost.com  422239     huffingtonpost.com
wsj.com             729890                wsj.com
huffingtonpost.com  466014     huffingtonpost.com
                    190099     huffingtonpost.com
wsj.com             875756                wsj.com
huffingtonpost.com  734935     huffingtonpost.com
wsj.com             975491                wsj.com
                    616014                wsj.com
huffingtonpost.com  316787     huffingtonpost.com
wsj.com             586930                wsj.com
                    45501                 wsj.com
                    703083                wsj.com
huffingtonpost.com  93

In [35]:
# Confusion matrix: entry [i, j] counts test rows whose true class is i and
# whose predicted class is j.
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[4062,  587],
       [1302, 3453]])

In [36]:
# Interactive IPython help lookup (`?` syntax) left over from exploration;
# this scratch cell should be dropped before the notebook is shared.
metrics.confusion_matrix?

Signature: metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)
Docstring:
Compute confusion matrix to evaluate the accuracy of a classification

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` but
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read more in the :ref:`User Guide <confusion_matrix>`.

Parameters
----------
y_true : array, shape = [n_samples]
    Ground truth (correct) target values.

y_pred : array, shape = [n_samples]
    Estimated targets as returned by a classifier.

labels : array, sha