In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Load IPython's autoreload extension and switch it to mode 2, which reloads
# imported modules before each cell runs, so that edits to the local `lib`
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from lib import jsoncorpus, datastuff
import gensim
import traceback
import numpy as np
import matplotlib.pyplot as plt
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.svm
import sklearn.decomposition
from lib.scikitComponents import *
import itertools
import collections

# LDA

## Data preprocessing

In [3]:
# Fetch the dictionary and corpora for the scraped sites in 'docs/sites.jl'.
# NOTE(review): going by its name, load_or_create presumably reuses a cached
# copy when one exists and builds it otherwise -- confirm in lib/jsoncorpus.
corpus_artefacts = jsoncorpus.load_or_create(
    'docs/sites.jl',
    stemmed=True,
    prefix="stemmed_",
)
dictionary, corpus, meta_corpus, dmoz_data = corpus_artefacts

Done.


In [4]:
# Every distinct full category path, stored as a hashable tuple
allcategories = set(tuple(path) for path in dmoz_data['dmoz_categories'])
# Every distinct top-level category (the first element of each path)
topcategories = set(path[0] for path in dmoz_data['dmoz_categories'])
# Pair each URL with its category path
meta = list(zip(dmoz_data['urls'], dmoz_data['dmoz_categories']))


def heirarchal_categories(max_depth):
    """Return the cumulative category labels of every site, up to max_depth.

    For each category path in dmoz_data['dmoz_categories'] this builds a list
    of '; '-joined prefixes of that path: the first label is the top-level
    category alone, the second is the top two levels joined, and so on.  At
    most ``max_depth`` labels are produced per site (fewer if the path itself
    is shorter).

    The (misspelled) name is kept as-is so that any other cell referring to
    it keeps working.

    Args:
        max_depth: maximum number of hierarchy levels to include per site.

    Returns:
        A list with one entry per site, each entry being a list of strings.
    """
    return [
        ['; '.join(path[:depth + 1])
         for depth, _ in enumerate(path) if depth < max_depth]
        for path in dmoz_data['dmoz_categories']
    ]


# Top categories: the depth-1 label of every site, in corpus order
top_categories = [labels[0] for labels in heirarchal_categories(1)]

## Define a classification pipeline for the corpus data

The `Pipeline()` object chains together components from the `lib.scikitComponents` module, so that they can be used as steps in a scikit-learn classification pipeline.

Define a classifier (an ensemble of extremely randomized decision trees), and chain the preprocessing step to the classifier.

In [5]:
# Alternative classifiers, kept for reference; un-comment one of them in place
# of the ExtraTreesClassifier below to try it.
#
# classifier = sklearn.svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
#      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
#      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
#      verbose=0)

# classifier = sklearn.svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
# gamma=0.0, kernel='rbf', max_iter=-1, probability=False, random_state=None,
# shrinking=True, tol=0.001, verbose=False)

# Ensemble of 100 extremely randomized trees.  The seed is fixed so repeated
# fits on the same data agree; bootstrap sampling is switched on because the
# out-of-bag score (oob_score=True) is only defined for bootstrapped forests.
classifier = sklearn.ensemble.ExtraTreesClassifier(
    n_estimators=100,
    bootstrap=True,
    oob_score=True,
    random_state=0,
    n_jobs=4,
)
# Preprocessing step from lib.scikitComponents that feeds the classifier.
builder = MetaMatrixBuilder(stem=True, use_description=True)
pipeline_steps = [
    ('matrix_builder', builder),
    ('classification', classifier),
]
clf = sklearn.pipeline.Pipeline(pipeline_steps)

## Process the dmoz categories
In this case, we will just turn the top-level category of each site into an integer index.

In [6]:
# Fit a label encoder on the top-level category names, then map every site's
# category name to its integer class index.
dmoz_encoder = sklearn.preprocessing.LabelEncoder().fit(top_categories)
classes = dmoz_encoder.transform(top_categories)
# Input to the pipeline: whatever dmoz_data stores under 'meta'
# (presumably one metadata record per site -- confirm in lib/jsoncorpus).
X = dmoz_data['meta']
# Single-argument call form: prints the same on Python 2 and matches the
# print(...) style used by the cross-validation cell.
print(classes)

[7 7 7 ..., 3 3 3]


## Run the classifier

We can use the meta corpus or the full body corpus here, just by replacing a single argument

The data is split into training and test sets, and the model is then fit to the training set. The LDA model is generated ***only*** from the training set, not the test set.

In [7]:
# 5-fold cross-validation of the whole pipeline (preprocessing + classifier).
# NOTE(review): sklearn.cross_validation was superseded by
# sklearn.model_selection in later scikit-learn releases -- revisit when the
# environment is upgraded.
scores = sklearn.cross_validation.cross_val_score(
    clf, X, classes, cv=5)
# Report the mean fold accuracy with a two-standard-deviation spread.
mean_accuracy = scores.mean()
accuracy_spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, accuracy_spread))

Accuracy: 0.35 (+/- 0.05)


In [8]:
# Hold out a random test set (train_test_split's default split size).
# The split is seeded so that the hold-out score and the per-category
# breakdown below are reproducible across kernel restarts; the classifier
# itself is already seeded with random_state=0.
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    np.array(X), classes, random_state=0)

In [9]:
# Fit the full pipeline on the training split and report the accuracy on the
# held-out test split.
model = clf.fit(X_train, y_train)
# Single-argument call form: prints the same on Python 2 and matches the
# print(...) style used by the cross-validation cell.
print(model.score(X_test, y_test))

0.45


In [10]:
# Per true category: how many test samples were predicted correctly
# (category_scores) and how many test samples there are (category_counts).
# Both are float-valued so the ratios computed from them later are true
# divisions.
category_scores = collections.defaultdict(float)
category_counts = collections.defaultdict(float)

# Translate integer class indices back to category names.
actual_names = dmoz_encoder.inverse_transform(y_test)
predicted_names = dmoz_encoder.inverse_transform(model.predict(X_test))

for actual, predicted in sorted(zip(actual_names, predicted_names)):
    category_counts[actual] += 1
    category_scores[actual] += 1 if actual == predicted else 0

### Find the score for each category. 

Note that if the classifier assigned categories uniformly at *random*, the expected score would be $\frac{1}{\textrm{num categories}}$. Instead, the score is well above that baseline for some categories, indicating a moderate amount of success (given how naive this is!)

In [11]:
# Chance-level baseline: derive the number of categories from the fitted
# label encoder instead of hard-coding it, so the figure stays correct if the
# data set changes.
num_categories = len(dmoz_encoder.classes_)
# Each print below takes a single pre-formatted string, which behaves the same
# as the old print statements on Python 2 and also runs on Python 3.
print("1/num_cats =  {} \n".format(1.0 / num_categories))
print("Category       Proportion   Score")
print("---------------------------------")
for cat in category_counts:
    # Share of the test set that truly belongs to this category
    proportion = category_counts[cat] / len(y_test)
    # Fraction of this category's test samples that were predicted correctly
    score = category_scores[cat] / category_counts[cat]
    print("{:<15} {:<7.4}      {:<5.4}".format(cat, proportion, score))

1/num_cats =  0.0666666666667 

Category       Proportion   Score
---------------------------------
arts            0.04833      0.2069
regional        0.003333      0.0  
shopping        0.045        0.2593
reference       0.07667      0.3478
business        0.08667      0.3846
kids and teens  0.02333      0.2143
computers       0.08333      0.42 
recreation      0.065        0.359
sports          0.1833       0.9  
society         0.08167      0.1429
health          0.04167      0.24 
home            0.06167      0.5676
games           0.04         0.4583
news            0.06333      0.3947
science         0.09667      0.4138
