A dead-simple classification scheme for news articles.  Here we compute the tf-idf weighted
bag-of-words representation (unigrams and bi-grams) of the corpus, and run the resulting sparse vectors through a k-nearest-neighbors (KNN) classifier.

In [1]:
import io
from itertools import islice

from numpy import ones_like
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

In [2]:
def load_text(filename='../data/articles.txt', max_lines=None):
  '''Returns a list of article text strings.

  The file is decoded as UTF-8 and every line becomes one list element, with
  its trailing newline kept.

  Args:
    filename: path of the text file to read.
    max_lines: maximum number of lines to return; None (the default) reads
      the whole file and 0 returns an empty list.

  Returns:
    A list of unicode strings in file order.

  Raises:
    ValueError: if max_lines is negative (raised by itertools.islice).
    UnicodeDecodeError: if the file is not valid UTF-8.
  '''
  # io.open decodes while reading on both Python 2 and 3, unlike the
  # Python-2-only unicode() builtin.  newline='\n' keeps '\n' as the only
  # line separator, so a stray '\r' inside an article cannot split it into
  # two elements.
  with io.open(filename, 'r', encoding='utf-8', newline='\n') as f:
    return list(islice(f, max_lines))

def load_labels(filename='../data/labels.txt'):
  '''Reads the label file and returns one whitespace-stripped label per line.'''
  stripped = []
  with open(filename, 'r') as label_file:
    # Iterating the handle yields the same lines readlines() would return.
    for raw_line in label_file:
      stripped.append(raw_line.strip())
  return stripped

In [3]:
# Read the per-article text labels and map each distinct label to an integer
# class id; `le` keeps the fitted mapping.

text_labels = load_labels()

le = LabelEncoder()
labels = le.fit_transform(text_labels)

In [4]:
# Read the articles as elements of a list (one list element per line of the
# articles file).

text = load_text()

# Fail fast if the articles and labels files are out of step; otherwise the
# mismatch only surfaces after the expensive vectorization, at split time.
assert len(text) == len(labels), (
    'articles/labels mismatch: %d articles vs %d labels'
    % (len(text), len(labels)))

In [5]:
# Create and fit the vectorizer on unigrams and bi-grams (ngram_range=(1, 2)),
# with English stop words removed and sublinear term-frequency scaling.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split in the next cell, so the vocabulary and idf weights also
# see the hold-out articles.  Consider splitting the raw text first and
# fitting on the training part only.

vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english',
                             ngram_range=(1, 2))

%time X = vectorizer.fit_transform(text)
text = None  # release the raw strings; re-running this cell requires reloading `text` first

X.shape

CPU times: user 1min 4s, sys: 1.89 s, total: 1min 5s
Wall time: 1min 6s


(27097, 3709236)

In [6]:
# Split training and test data, hold out 30%.  The split is seeded so that the
# hold-out set -- and therefore every score computed from it -- is the same on
# each Restart & Run All.

X_train, X_test, y_train, y_test = train_test_split(
    X, labels, train_size=0.7, random_state=42)
X = None  # drop the reference to the full matrix; the cells shown below use only the splits

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18967, 3709236), (18967,), (8130, 3709236), (8130,))

In [7]:
# Build a 16-neighbour, distance-weighted KNN classifier and fit it on the
# training split.  n_jobs stays at 1, which the original author notes is
# needed for sparse input.

neigh = KNeighborsClassifier(
    n_neighbors=16,
    weights='distance',
    n_jobs=1,
)
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=16, p=2,
           weights='distance')

In [9]:
# Predict labels for the hold-out set.  %time prints the CPU and wall-clock
# cost of this single statement.

%time y_pred = neigh.predict(X_test)

CPU times: user 14.9 s, sys: 2.95 s, total: 17.9 s
Wall time: 18.8 s


In [10]:
# F1 on the hold-out set: true labels versus the KNN predictions.

f1_score(y_true=y_test, y_pred=y_pred)

0.82233597021870641

In [11]:
# Baseline for comparison: F1 when every hold-out article is predicted as
# label 1, the most likely label ("liberal").

f1_score(y_true=y_test, y_pred=ones_like(y_test))

0.73512252042006998