In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from lib import jsoncorpus, datastuff
import gensim
import traceback
import numpy as np
import matplotlib.pyplot as plt
import sklearn.svm
import sklearn.feature_extraction

# LDA

In [3]:
# Load in the previously created corpus and dictionary of scraped sites
corpus, dictionary = jsoncorpus.load_or_create('corpus', 'docs/sites.jl')

Succesfully loaded corpus.mm and corpus.dict


In [4]:
dat = list(datastuff.loadSplitJsonLines('docs/sites.jl.'))

In [5]:
# Generate LDA Model
alltopics = set(', '.join(x['topics']) for x in dat)
model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics = len(alltopics), passes = 1, chunksize = 100)

In [6]:
# Show some examples of categories
list(alltopics)[:5]

[u'science, reference',
 u'sports, racquetball',
 u'science, chemistry',
 u'news, alternative',
 u'science, search engines']

In [7]:
# Build a list of tuples linking URL to category
meta = list((x['url'], x['topics']) for x in dat)
meta[:5]

[(u'http://www.siliconcoach.com/', [u'sports', u'software']),
 (u'http://www.recenroll.com/', [u'sports', u'software']),
 (u'http://www.gametime.net/', [u'sports', u'software']),
 (u'http://www.foxsportspulse.com/', [u'sports', u'software']),
 (u'http://www.crosstrainer.ca/', [u'sports', u'software'])]

In [8]:
# Show some examples of LDA topics
for t in model.show_topics(10)[:5]:
    print t

0.000*tenopir + 0.000*noticing + 0.000*newsflash + 0.000*negotiating + 0.000*naturally + 0.000*mysteriously + 0.000*mud + 0.000*mornings + 0.000*messengers + 0.000*mess
0.370*space + 0.076*level + 0.040*sense + 0.028*requires + 0.026*amount + 0.025*unable + 0.018*demonstration + 0.018*basic + 0.017*time + 0.015*we’re
0.000*chargers + 0.000*community + 0.000*events + 0.000*programs + 0.000*holiday + 0.000*giveaway + 0.000*youth + 0.000*battle + 0.000*squad + 0.000*corner
0.234*prison + 0.145*donald + 0.105*wizard + 0.000*red + 0.000*commissioner + 0.000*reached + 0.000*david + 0.000*plane + 0.000*grant + 0.000*early
0.000*ski + 0.000*resorts + 0.000*top + 0.000*national + 0.000*linear + 0.000*gallery + 0.000*resort + 0.000*city + 0.000*years + 0.000*1px


In [51]:
# Show some examples of topics for a particular corpus
site =4
print meta[site]
print 
topics = [(p, [x[1] for x in model.show_topic(tid)[:6]]) for tid, p in model[corpus[site]]]
topics = sorted(topics, key = (lambda x: -x[0]))
sums = 0
for x in topics:
    sums += x[0]
    if sums > 0.9:
        break
    print x
print "\nTotal prob: {}\n\n".format(sums)


(u'http://www.crosstrainer.ca/', [u'sports', u'software'])

(0.4664673315740408, [u'paper', u'save', u'time', u'add', u'create', u'build'])
(0.12198810428798466, [u'light', u'body', u'time', u'change', u'simple', u'people'])
(0.098034596930550777, [u'health', u'smoking', u'disease', u'cancer', u'heart', u'drugs'])
(0.056793330126687698, [u'le', u'crosstrainer', u'avec', u'nous', u'du', u'de'])
(0.044188561834639324, [u'groups', u'group', u'planning', u'families', u'dvd', u'definitions'])
(0.033387935441654581, [u'li', u'ul', u'recipe', u'ahover', u'8px', u'machine'])
(0.02601495684122275, [u'software', u'windows', u'pc', u'free', u'machine', u'users'])
(0.025319139748980746, [u'training', u'career', u'institute', u'resource', u'canadian', u'program'])
(0.011933557546013218, [u'food', u'moving', u'buying', u'rate', u'matter', u'move'])
(0.010053135352826004, [u'library', u'online', u'find', u'family', u'studio', u'rooms'])

Total prob: 0.894180649685




# Category assignment

In [16]:
# Build an array of features linking corpus to topics
features = np.zeros((len(corpus), len(alltopics)))
totals = np.zeros(len(alltopics))
for i, doc in enumerate(model[corpus]):
    for topic, p in doc:
        features[i, topic] = p
        totals[topic] += p
        


In [None]:
plt.imshow(features.T)

In [70]:
import sklearn.preprocessing
import sklearn.tree
import sklearn.svm
import sklearn.multiclass
import sklearn.ensemble
import sklearn.cross_validation


all_ylabels = np.array([x['topics'][:2] for x in datastuff.loadSplitJsonLines('docs/sites.jl.')])
import itertools
import collections
counts = collections.Counter(itertools.chain.from_iterable(all_ylabels))
uncommon = [x for x in counts if counts[x] < 9]
ylabels = np.array([[i for i in l if i not in uncommon] for l in all_ylabels])

#if len(ylabels.shape) == 1:
if False:
    label_processor = sklearn.preprocessing.LabelEncoder()
else:
    label_processor = sklearn.preprocessing.MultiLabelBinarizer()

label_processor.fit(ylabels)
#clf = sklearn.tree.DecisionTreeClassifier(random_state=0)
clf = sklearn.ensemble.ExtraTreesClassifier(random_state=0, n_estimators=100, oob_score=True, bootstrap=True, n_jobs=4)
X = features[:,np.sum(features > 0, axis=0)>0]
#X = np.random.rand(X.shape[0], X.shape[1])
y = label_processor.transform(ylabels)
print sklearn.cross_validation.cross_val_score(clf, X, y)


[ 0.00375     0.00875     0.00375469]


In [71]:
label_processor.transform(ylabels[1:4])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  

In [72]:
ylabels[1:40]

array([[u'sports', u'software'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports', u'people'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports', u'software'],
       [u'sports', u'software'], [u'sports'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports', u'organizations'],
       [u'sports', u'organizations'], [u'sports'], [u'sports'],
       [u'sports'], [u'sports', u'winter sports'],
       [u'sports', u'winter sports'], [u'sports', u'water sports'],
       [u'sports', u'water sports'], [u'sports', u'water sports'],
       [u'sports', u'water sports'], [u'sports', u'organizations'],
       [u'sports', u'walking'], [u'sports', u'walking'],
       [u'sports', u'walking'], [u'sports', u'walking'],
       [u'sports', u'walk

In [73]:
ylabels[100]

[u'sports', u'resources']

In [74]:
test = sklearn.preprocessing.MultiLabelBinarizer().fit(ylabels)
print test.transform(ylabels[1])
print test.inverse_transform(test.transform([ylabels[0]]))


KeyError: u's'

In [75]:
#zip(clf.transform(X)[0][np.where(clf.transform(X)[0]>0)], ylabels[np.where(clf.transform(X)[0]>0)])
clf.fit(X, y)

ExtraTreesClassifier(bootstrap=True, compute_importances=None,
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=100, n_jobs=4, oob_score=True,
           random_state=0, verbose=0)

In [77]:
meta[0:10]

[(u'http://www.siliconcoach.com/', [u'sports', u'software']),
 (u'http://www.recenroll.com/', [u'sports', u'software']),
 (u'http://www.gametime.net/', [u'sports', u'software']),
 (u'http://www.foxsportspulse.com/', [u'sports', u'software']),
 (u'http://www.crosstrainer.ca/', [u'sports', u'software']),
 (u'http://www.integratedsports.net/', [u'sports', u'software']),
 (u'http://www.sporting-heroes.net/', [u'sports', u'people']),
 (u'http://coedjewishsports.org/', [u'sports', u'organizations']),
 (u'http://www.sportandrecreation.org.uk', [u'sports', u'organizations']),
 (u'http://www.eis2win.co.uk/pages/default.aspx',
  [u'sports', u'organizations'])]

In [76]:
from pprint import pprint
pprint(label_processor.inverse_transform(clf.predict(X[0:10])))


[(u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'software', u'sports'),
 (u'people', u'sports'),
 (u'organizations', u'sports'),
 (u'organizations', u'sports'),
 (u'organizations', u'sports')]


In [16]:
X_train, X_test, y_train, y_test, ylabels_train, ylabels_test, meta_train, meta_test, ind_train, ind_test = sklearn.cross_validation.train_test_split(X, y, ylabels, meta, range(len(y)))
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

derp1 = np.array([t[0] for t in meta_test])[:15]
derp2 = dv.inverse_transform(pred)[:15]
derp3 = np.array([t[0] for t in meta_test])[:15]

#print np.vstack(()).T
#print ylabels[testnum]


NameError: name 'dv' is not defined

In [None]:
print meta[897]
print [dictionary[x[0]] for x in corpus[897]]

In [None]:
print dv.inverse_transform(pred)
print ylabels[testnum]

In [None]:
' '.join(dictionary[w] for w, f in corpus[site])

In [None]:
hdpmodel = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)

In [None]:
hdpmodel[corpus[5]]

In [None]:
hdptops = hdpmodel.show_topics()
[x[0] for x in hdpmodel[corpus[5]]]

In [None]:
hdptops[8]


In [None]:
hdpmodel.m_eta + hdpmodel.m_lambda
hdpmodel.id2word