In [23]:
# Load the data.
# NOTE(review): this wildcard import is the only visible source of the names
# text1, text2 and text7 that the cells below rely on; it also pulls in every
# other public name from nltk.book, so the origin of a name is not obvious.
from nltk.book import *

In [22]:
# Inspect the start of the first book: show its first ten tokens.
# The slice is already a sequence of tokens, so it is passed straight to list()
# rather than through an identity comprehension. The parenthesised
# single-argument print produces the same output on Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print(list(text1[:10]))

[u'[', u'Moby', u'Dick', u'by', u'Herman', u'Melville', u'1851', u']', u'ETYMOLOGY', u'.']


In [24]:
# Tally how often each token occurs in text1, then display the ten most common.
from collections import Counter

# Counter(iterable) builds the tally in a single step. Tokens are counted
# exactly as they appear in text1: no case folding or filtering is applied.
text1_counts = Counter(text1)
text1_counts.most_common(10)

[(u',', 18713),
 (u'the', 13721),
 (u'.', 6862),
 (u'of', 6536),
 (u'and', 6024),
 (u'a', 4569),
 (u'to', 4542),
 (u';', 4072),
 (u'in', 3916),
 (u'that', 2982)]

In [25]:
# Display the size of text1's vocabulary, i.e. the number of distinct keys in
# text1_counts. The keys are the raw tokens, so punctuation marks such as ','
# and '.' are included, and differently cased spellings are separate entries.
len(text1_counts)

19317

In [26]:
# Do the same for Sense and Sensibility (text2): tally its tokens, print the
# number of distinct tokens, then display the ten most frequent ones.
text2_counts = Counter(text2)
# Parenthesised single-argument print: identical output on Python 2 and 3,
# whereas the bare print statement only parses on Python 2.
print(len(text2_counts))
text2_counts.most_common(10)

6833


[(u',', 9397),
 (u'to', 4063),
 (u'.', 3975),
 (u'the', 3861),
 (u'of', 3565),
 (u'and', 3350),
 (u'her', 2436),
 (u'a', 2043),
 (u'I', 2004),
 (u'in', 1904)]

In [27]:
# Determine the shared vocabulary: the tokens that occur in both texts.
# The result is a sorted list, so every vector built by iterating over
# shared_vocab uses the same, reproducible column order.
shared_vocab = sorted(set(text1_counts) & set(text2_counts))
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(len(shared_vocab))

4669


In [28]:
# Extract the counts of the shared vocabulary and put them into a matrix with
# one row per text (row 0: text1, row 1: text2) and one column per shared token.
import numpy as np
from sklearn.preprocessing import normalize

text1_vector = [float(text1_counts[t]) for t in shared_vocab]
text2_vector = [float(text2_counts[t]) for t in shared_vocab]
feature_matrix = np.vstack([text1_vector, text2_vector])

# Rescale each row (axis=1) to unit Euclidean length. norm='l2' is normalize()'s
# default and is spelled out here because the result is NOT a table of relative
# frequencies: the rows do not sum to 1 (that would need norm='l1'). The values
# are only proportional to the frequencies within each row.
feature_matrix = normalize(feature_matrix, norm='l2', axis=1)
# Single-argument parenthesised print: same output on Python 2 and Python 3.
print(feature_matrix.shape)

(2, 4669)


In [29]:
from sklearn.linear_model import Lasso

# Sparse regression model; alpha sets the strength of the sparsity penalty.
model = Lasso(alpha=5e-05)

# One target per row of feature_matrix: 1 for the first row, 0 for the second.
y = [1, 0]

# Fit on the two examples. Caveat from the original author: with only two
# training examples the solver is not expected to converge.
# Kept as the cell's final expression so the fitted estimator is displayed.
model.fit(feature_matrix, y)

Lasso(alpha=5e-05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [30]:
# Display the words that received non-zero weights, as a tab-separated table:
# the token, its raw count in text1 and in text2, and its fitted coefficient.
# Position i in model.coef_ corresponds to shared_vocab[i].
indices = [i for i, weight in enumerate(model.coef_) if weight != 0]
# Single-argument parenthesised prints: same output on Python 2 and Python 3.
print("word\ttext1\ttext2\tweight")
for i in indices:
    word = shared_vocab[i]
    # Index the counters directly (as the feature-building cell does); a missing
    # key would yield 0 instead of a None that '%d' cannot format.
    print("%s\t%d\t%d\t%f" % (word, text1_counts[word], text2_counts[word], model.coef_[i]))

word	text1	text2	weight
her	329	2436	-4.137692
the	13721	3861	1.706455


In [31]:
# Apply the classifier to the WSJ text (text7).
test_counts = Counter(text7)

# Build a single-row matrix with one column per shared token, in shared_vocab
# order, and scale it with normalize()'s default settings.
test_vector = normalize(np.array([[float(test_counts[t]) for t in shared_vocab]]))
result = model.predict(test_vector)

# Display the prediction.
print(result)

# Display the WSJ counts of 'her' and 'the'. The label and value are combined
# with %-formatting because print("her:", x) would print a tuple on Python 2;
# this form emits exactly "her: <count>" on both Python 2 and Python 3.
print("her: %s" % test_counts.get('her'))
print("the: %s" % test_counts.get('the'))

[ 0.98542581]
her: 51
the: 4045
