In [311]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [312]:
from lib import jsoncorpus, datastuff
import gensim
import traceback
import numpy as np
import matplotlib.pyplot as plt
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.svm
import sklearn.decomposition
from lib.scikitComponents import *
import itertools
import collections

# LDA

## Data preprocessing

In [313]:
# Location of the scraped-sites file that is handed to the corpus loader.
SITES_PATH = 'docs/sites.jl'

# Load the previously created dictionary and corpora of scraped sites, together
# with the DMOZ data that the remaining cells read from.
dictionary, corpus, meta_corpus, dmoz_data = jsoncorpus.load_or_create(SITES_PATH)

Done.


In [341]:
# Build a binary document-by-term matrix from the keyphrases/keywords found in
# each metadata record: features[i, j] == 1 when record i contains words[j].
X = dmoz_data["meta"]
sanitize = lambda word: word.strip()

# Sanitize terms *before* de-duplicating, so the vocabulary and the
# per-document lookups below use exactly the same strings. (Previously only the
# counts were sanitized, so a term with stray whitespace was counted but could
# never be matched to a column.)
lists = [
    set(sanitize(word)
        for word in itertools.chain(x.get('keyphrases', []), x.get('keywords', [])))
    for x in X
]
for doc in lists:
    # The empty string is never a useful term.
    doc.discard('')

# Each document contributes at most one count per term, so this is the number
# of documents a term occurs in.
counts = collections.Counter(itertools.chain.from_iterable(lists))

# Keep terms found in more than 4 documents; sorting gives a column order that
# is reproducible across runs instead of following dict iteration order.
words = sorted(word for word in counts if counts[word] > 4)

# term -> column index, replacing the repeated linear `in` / list.index() scans.
column_of = dict((word, j) for j, word in enumerate(words))

features = np.zeros((len(X), len(words)), dtype=int)

for i, doc in enumerate(lists):
    # Terms below the frequency threshold have no column and are skipped.
    features[i, [column_of[word] for word in doc if word in column_of]] = 1

In [260]:
# Every distinct category path, stored as a hashable tuple
allcategories = {tuple(path) for path in dmoz_data['dmoz_categories']}
# Every distinct first-level category
topcategories = {path[0] for path in dmoz_data['dmoz_categories']}
# Pair each URL with its category path
meta = list(zip(dmoz_data['urls'], dmoz_data['dmoz_categories']))


def heirarchal_categories(max_depth):
    """Return, for every category path, its cumulative labels up to ``max_depth``.

    The label for depth ``d`` is the first ``d`` components of the path joined
    with ``'; '``, so ``['a', 'b']`` yields ``['a', 'a; b']`` when
    ``max_depth`` is 2 or more. Paths shorter than ``max_depth`` simply yield
    fewer labels.
    """
    return [
        ['; '.join(path[:depth])
         for depth in range(1, min(len(path), max_depth) + 1)]
        for path in dmoz_data['dmoz_categories']
    ]


# First-level label of every category path, in the original order
top_categories = [labels[0] for labels in heirarchal_categories(1)]

## Define a classification pipeline for the corpus data

The `Pipeline()` object chains together components from the `lib.scikitComponents` module, so that they can be used as steps of a scikit-learn classification pipeline.

In [315]:
# Use the metadata records from the loaded DMOZ data as the pipeline input.
X = dmoz_data['meta']
# Build the feature pipeline (lda_feature_pipeline is presumably provided by the
# star import from lib.scikitComponents).
# NOTE(review): the argument 10 looks like the number of LDA topics — confirm
# against the helper's definition.
pipe = lda_feature_pipeline(10)
# Fit on the full metadata set, then transform the same data; the transformed
# matrix is the cell's displayed output.
pipe.fit(X)
pipe.transform(X)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Define a classifier (a linear SVM; an extra-trees ensemble is kept as a commented-out alternative), and chain the preprocessing step to the classifier

In [359]:
# Linear SVM classifier with every hyper-parameter spelled out explicitly.
# random_state is pinned (it was None): with dual=True the solver uses a
# pseudo-random number generator, so an unseeded fit can give slightly
# different models -- and scores -- from one run of the notebook to the next.
classifier = sklearn.svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)

# Alternative classifier (tree ensemble), kept for comparison:
# classifier = sklearn.ensemble.ExtraTreesClassifier(random_state=0, n_estimators=100, oob_score=True, bootstrap=True, n_jobs=4)

# Chain the metadata -> feature-matrix step to the classifier so that both are
# fitted together on whatever data the pipeline is given.
clf = sklearn.pipeline.Pipeline([
    ('matrix_builder', MetaMatrixBuilder()),
    ('classification', classifier)])


In [19]:
#matrix = lda_feature_pipeline(len(top_categories)).fit_transform(np.array(meta_corpus))

In [20]:
#pca = sklearn.decomposition.PCA(n_components=0.9)
#pca.fit_transform(np.array(matrix))
#[[dictionary[w] for w in np.where(np.abs(comp)>0.08)[0]] for comp in pca.components_]

## Process the dmoz categories
In this case, we will just turn the top-level category of each site into an integer index

In [317]:
# Encode each top-level category name as an integer class label. The fitted
# encoder is kept so that labels can be mapped back to names later on.
dmoz_encoder = sklearn.preprocessing.LabelEncoder()
classes = dmoz_encoder.fit_transform(top_categories)
print(classes)

[7 7 7 ..., 3 3 3]


## Run the classifier

We can use the meta corpus or the full body corpus here, just by replacing a single argument

The data is split into training and test sets, and the classifier is then fit on the training set. The LDA model is generated ***only*** from the training set, not the testing set.

In [296]:
# Cross-validate the whole pipeline (feature building + classifier) on the
# metadata records and their encoded labels, using the library's default
# folds and scoring; the resulting scores are the cell's output.
sklearn.cross_validation.cross_val_score(estimator=clf, X=X, y=classes)

AttributeError: 'MetaMatrixBuilder' object has no attribute 'topic_min_members'

In [360]:
# Hold out part of the data for testing. The split is seeded (it was not
# before) so that the same records land in each half on every run, which makes
# the scores reported below reproducible.
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    np.array(X), classes, random_state=0)

In [361]:
# Train the pipeline on the training split, then report its score on the
# held-out split.
model = clf.fit(X_train, y_train)
test_score = model.score(X_test, y_test)
print(test_score)
# Spot checks, left disabled:
# model.predict(X_test)
# X_test[0].get("keyphrases",[])

ValueError: X has 130 features per sample; expecting 781

In [355]:
# Display the pipeline returned by clf.fit(...) so its steps and their
# parameters can be inspected.
model

Pipeline(steps=[('matrix_builder', MetaMatrixBuilder(top_min_members=None)), ('classification', ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
           oob_score=True, random_state=0, verbose=0, warm_start=False))])

In [45]:
# Tally, per true category, how many test samples there are and how many of
# them the model labelled correctly.
true_names = dmoz_encoder.inverse_transform(y_test)
predicted_names = dmoz_encoder.inverse_transform(model.predict(X_test))

category_scores = collections.defaultdict(float)
category_counts = collections.defaultdict(float)
for actual, guess in sorted(zip(true_names, predicted_names)):
    category_counts[actual] += 1
    # Adding 0.0 on a miss still registers the category in category_scores.
    category_scores[actual] += float(actual == guess)

### Find the score for each category. 

Note that if the classifier assigned categories uniformly at *random*, the expected score would be $\frac{1}{\textrm{num categories}}$ (about 0.07 for the 14 categories here). Instead, it is actually quite high in some cases, indicating a moderate amount of success (given how naive this is!)

In [46]:
# Print one row per category: its share of the test set and the fraction of
# its test samples that were classified correctly.
print("Category       Proportion   Score")
print("---------------------------------")
n_test = len(y_test)
for cat in category_counts.keys():
    proportion = category_counts[cat] / n_test
    score = category_scores[cat] / category_counts[cat]
    print("{:<15} {:<7.4}      {:<5.4}".format(cat, proportion, score))

Category       Proportion   Score
---------------------------------
arts            0.06         0.3611
shopping        0.05333      0.5625
reference       0.07167      0.3721
business        0.08333      0.44 
kids and teens  0.02333      0.1429
computers       0.09833      0.5424
recreation      0.06667      0.3  
sports          0.1683       0.7525
society         0.08         0.2083
health          0.05333      0.375
home            0.06167      0.3243
games           0.045        0.2593
news            0.05167      0.2581
science         0.08333      0.24 
