In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
from lib import jsoncorpus, datastuff
import gensim
import traceback
import numpy as np
import matplotlib.pyplot as plt
import sklearn.svm
import sklearn.feature_extraction

# LDA

In [14]:
# Load in the previously created corpus and dictionary of scraped sites
dictionary, corpus, meta_corpus = jsoncorpus.load_or_create('docs/sites.jl')

In [40]:
# Build a list of all topics
alltopics = []
toptopics = []
# And a list linking website URLs to DMOZ categories
meta = []

for site in datastuff.loadSplitJsonLines('docs/sites.jl.'):
    alltopics.append(', '.join(site['topics']))
    toptopics.append(site['topics'][0])
    meta.append((site['url'], site['topics']))
alltopics = set(alltopics)
toptopics = set(toptopics)

In [16]:
# Generate LDA Model of document bodies
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics = len(alltopics), passes = 1, chunksize = 100)

In [41]:
# Generate LDA Model of metadata
meta_model = gensim.models.ldamodel.LdaModel(corpus=meta_corpus, id2word=dictionary, num_topics = len(toptopics), passes = 1, chunksize = 100)

In [43]:
# Show some examples of LDA topics
for t in meta_model.show_topics(10)[:5]:
    print t

0.034*products + 0.031*party + 0.016*trade + 0.015*architecture + 0.015*english + 0.015*company + 0.014*libraries + 0.013*center + 0.012*engineering + 0.012*university
0.038*resources + 0.029*services + 0.026*web + 0.021*maps + 0.020*map + 0.018*travel + 0.018*software + 0.015*city + 0.015*data + 0.013*air
0.074*kids + 0.069*books + 0.044*american + 0.031*school + 0.015*articles + 0.014*family + 0.012*government + 0.012*great + 0.012*disney + 0.009*puppets
0.082*art + 0.016*parents + 0.016*forum + 0.014*card + 0.013*directory + 0.012*words + 0.011*quotes + 0.011*source + 0.010*global + 0.009*beauty
0.066*health + 0.054*news + 0.033*medical + 0.024*time + 0.018*world + 0.017*thesaurus + 0.015*national + 0.013*business + 0.012*adventure + 0.011*latin


In [53]:
# Show some examples of topics for a particular corpus
site = 35
print meta[site]
print [dictionary[word] for word, freq in meta_corpus[site]]

topics = [(p, [x[1] for x in model.show_topic(tid)[:6]]) for tid, p in meta_model[meta_corpus[site]]]
topics = sorted(topics, key = (lambda x: -x[0]))
sums = 0
for x in topics:
    sums += x[0]
    if sums > 0.9:
        break
    print x
print "\nTotal prob: {}\n\n".format(sums)


(u'https://sites.google.com/a/firm-racing.com/home/', [u'sports', u'multi-sports'])
[u'race', u'independent', u'management']
(0.76666666648436588, [u'integrated', u'kb', u'absolute', u'actual', u'tag', u'arena'])
(0.016666666848741532, [u'blog', u'news', u'--', u'times', u'stone', u'radio'])
(0.01666666666669521, [u'regional', u'city', u'high', u'2nd', u'weekend', u'goals'])
(0.016666666666690842, [u'number', u'message', u'boards', u'equal', u'edit', u'active'])
(0.016666666666687348, [u'hats', u'advice', u'supplies', u'build', u'earliest', u'tower'])
(0.016666666666684714, [u'refer', u'target', u'majority', u'point', u'expansion', u'variations'])
(0.016666666666684322, [u'insurance', u'suggested', u'order', u'extend', u'decision', u'happening'])
(0.016666666666684156, [u'solutions', u'combination', u'commonly', u'observer', u'level', u'move'])
(0.016666666666683403, [u'august', u'april', u'media', u'march', u'february', u'june'])

Total prob: 0.916666666667




# Category assignment

In [52]:
model[corpus[1:3]]

<gensim.interfaces.TransformedCorpus at 0x114a81fd0>

In [None]:
se

In [16]:
# Build an array of features linking corpus to topics
features = np.zeros((len(corpus), len(alltopics)))
totals = np.zeros(len(alltopics))
for i, doc in enumerate(model[corpus]):
    for topic, p in doc:
        features[i, topic] = p
        totals[topic] += p
        


In [None]:
plt.imshow(features.T)

In [70]:
import sklearn.preprocessing
import sklearn.tree
import sklearn.svm
import sklearn.multiclass
import sklearn.ensemble
import sklearn.cross_validation


all_ylabels = np.array([x['topics'][:2] for x in datastuff.loadSplitJsonLines('docs/sites.jl.')])
import itertools
import collections
counts = collections.Counter(itertools.chain.from_iterable(all_ylabels))
uncommon = [x for x in counts if counts[x] < 9]
ylabels = np.array([[i for i in l if i not in uncommon] for l in all_ylabels])

#if len(ylabels.shape) == 1:
if False:
    label_processor = sklearn.preprocessing.LabelEncoder()
else:
    label_processor = sklearn.preprocessing.MultiLabelBinarizer()

label_processor.fit(ylabels)
#clf = sklearn.tree.DecisionTreeClassifier(random_state=0)
clf = sklearn.ensemble.ExtraTreesClassifier(random_state=0, n_estimators=100, oob_score=True, bootstrap=True, n_jobs=4)
X = features[:,np.sum(features > 0, axis=0)>0]
#X = np.random.rand(X.shape[0], X.shape[1])
y = label_processor.transform(ylabels)
print sklearn.cross_validation.cross_val_score(clf, X, y)


[ 0.00375     0.00875     0.00375469]


In [71]:
label_processor.transform(ylabels[1:4])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

In [72]:
ylabels[1:40]

array([[u'sports', u'software'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports', u'people'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports'], [u'sports'],
       [u'sports'], [u'sports', u'winter sports'],
       [u'sports', u'winter sports'], [u'sports', u'water sports'],
       [u'sports', u'water sports'], [u'sports', u'water sports'],
       [u'sports', u'water sports'], [u'sports', u'organizations'],
       [u'sports', u'walking'], [u'sports', u'walking'],
       [u'sports', u'walking'], [u'sports', u'walking'],
       [u'sports', u'walk

In [73]:
ylabels[100]

[u'sports', u'resources']

In [74]:
test = sklearn.preprocessing.MultiLabelBinarizer().fit(ylabels)
print test.transform(ylabels[1])
print test.inverse_transform(test.transform([ylabels[0]]))


KeyError: u's'

In [75]:
#zip(clf.transform(X)[0][np.where(clf.transform(X)[0]>0)], ylabels[np.where(clf.transform(X)[0]>0)])
clf.fit(X, y)

ExtraTreesClassifier(bootstrap=True, compute_importances=None,
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=100, n_jobs=4, oob_score=True,
           random_state=0, verbose=0)

In [77]:
meta[0:10]

[(u'http://www.siliconcoach.com/', [u'sports', u'software']),
 (u'http://www.recenroll.com/', [u'sports', u'software']),
 (u'http://www.gametime.net/', [u'sports', u'software']),
 (u'http://www.foxsportspulse.com/', [u'sports', u'software']),
 (u'http://www.crosstrainer.ca/', [u'sports', u'software']),
 (u'http://www.integratedsports.net/', [u'sports', u'software']),
 (u'http://www.sporting-heroes.net/', [u'sports', u'people']),
 (u'http://coedjewishsports.org/', [u'sports', u'organizations']),
 (u'http://www.sportandrecreation.org.uk', [u'sports', u'organizations']),
 (u'http://www.eis2win.co.uk/pages/default.aspx',
  [u'sports', u'organizations'])]

In [76]:
from pprint import pprint
pprint(label_processor.inverse_transform(clf.predict(X[0:10])))


[(u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'people', u'sports'),
 (u'organizations', u'sports'),
 (u'organizations', u'sports'),
 (u'organizations', u'sports')]


In [16]:
X_train, X_test, y_train, y_test, ylabels_train, ylabels_test, meta_train, meta_test, ind_train, ind_test = sklearn.cross_validation.train_test_split(X, y, ylabels, meta, range(len(y)))
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

derp1 = np.array([t[0] for t in meta_test])[:15]
derp2 = dv.inverse_transform(pred)[:15]
derp3 = np.array([t[0] for t in meta_test])[:15]

#print np.vstack(()).T
#print ylabels[testnum]


NameError: name 'dv' is not defined

In [None]:
print meta[897]
print [dictionary[x[0]] for x in corpus[897]]

In [None]:
print dv.inverse_transform(pred)
print ylabels[testnum]

In [None]:
' '.join(dictionary[w] for w, f in corpus[site])

In [None]:
hdpmodel = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)

In [None]:
hdpmodel[corpus[5]]

In [None]:
hdptops = hdpmodel.show_topics()
[x[0] for x in hdpmodel[corpus[5]]]

In [None]:
hdptops[8]


In [None]:
hdpmodel.m_eta + hdpmodel.m_lambda
hdpmodel.id2word