In [1]:
# In this example, I walk through how to use sklearn to classify Twitter users as male or female
# based on the text of their user description.

# First, we need to get some tweets in JSON format.
# Create a tweets.json file with something like:
# twitter-curl --query "track=obama" > tweets.json
# This will query twitter for tweets containing the word obama.

# Now, we'll parse that file into a list of (name, description) tuples.
import json
import io
# open the json file
fp = io.open('tweets.json', mode='rt', encoding='utf8')
# read the names and description fields from each tweet.
data = []
for line in fp:
    js = json.loads(line)  # parse into a JSON object.
    name = js['user']['name']
    description = js['user']['description']
    if name and description:  # if fields aren't blank
        data.append((name.lower(), description.lower()))
print 'read', len(data), 'users'
print 'example:', data[0]


IOError: [Errno 2] No such file or directory: 'tweets.json'

In [65]:
# Now, we need to label them as male or female. 
# To do that, we get the top 500 male/female names from census
import requests  # This is a very handy library for html requests.
males = requests.get('http://www.census.gov/genealogy/www/data/1990surnames/dist.male.first').text.split('\n')
males = [m.split()[0].lower() for m in males[:500]]  # lower case and take top 500
print 'first male is', males[0]
females = requests.get('http://www.census.gov/genealogy/www/data/1990surnames/dist.female.first').text.split('\n')
females = [f.split()[0].lower() for f in females[:500]]  # lower case and take top 500
print 'first female is', females[0]

# Remove ambiguous names (those that appear on both lists)
# Note that the plus operator is overloaded to mean concatentation for lists.
ambiguous = [f for f in females + males if f in males and f in females]
print 'ambiguous is', ambiguous[0]
males = [m for m in males if m not in ambiguous]
females = [f for f in females if f not in ambiguous]
print 'got', len(males), 'males and', len(females), 'females'

first male is james
first female is mary
ambiguous is jean
got 473 males and 473 females


In [66]:
# sort male, female users
male_users = [d for d in data if len(d[0].split()) > 0 and d[0].split()[0] in males]
print len(male_users), 'males'
print male_users[0]
female_users = [d for d in data if len(d[0].split()) > 0 and d[0].split()[0] in females]
print len(female_users), 'females'
print female_users[0]

1637 males
(u'daniel john sobieski', u"editorial writer, investor's business daily, #reagan #conservative, somewhere to the right of attila the hun #catholic #prolife #tcot #teaparty #nra")
780 females
(u'jessica cranor ', u'lame, nerdy, brilliant, fantastic')


In [67]:
# Make target vector. Female=1, Male=0
import numpy as np
y = np.array([0.] * len(male_users) + [1.] * len(female_users))
data = [d[1] for d in male_users + female_users]
print 'first label=', y[0]
print 'first description=', data[0]

first label= 0.0
first description= editorial writer, investor's business daily, #reagan #conservative, somewhere to the right of attila the hun #catholic #prolife #tcot #teaparty #nra


In [68]:
# Convert descriptions into feature vectors.
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
X = vec.fit_transform(data)
print data[0],'\nis transformed into the sparse vector\n', X[0]
print 'the word THE is mapped to index', vec.vocabulary_['the']
print 'there are', len(vec.vocabulary_), 'unique features'

editorial writer, investor's business daily, #reagan #conservative, somewhere to the right of attila the hun #catholic #prolife #tcot #teaparty #nra 
is transformed into the sparse vector
  (0, 662)	1
  (0, 1142)	1
  (0, 1274)	1
  (0, 1633)	1
  (0, 1846)	1
  (0, 2248)	1
  (0, 3396)	1
  (0, 3625)	1
  (0, 4803)	1
  (0, 4880)	1
  (0, 5480)	1
  (0, 5663)	1
  (0, 5863)	1
  (0, 6330)	1
  (0, 6663)	1
  (0, 6693)	1
  (0, 6757)	2
  (0, 6851)	1
  (0, 7522)	1
the word THE is mapped to index 6757
there are 7662 unique features


In [69]:
# Compute cross validation accuracy
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
print 'avg accuracy=%.3f' % np.average(cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy'))

avg accuracy=0.807


In [70]:
# Try Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
print 'avg accuracy=%.3f' % np.average(cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy'))

avg accuracy=0.752


In [71]:
# Try adding bigrams
vec = CountVectorizer(ngram_range=(1,2))
X = vec.fit_transform(data)
print 'there are', len(vec.vocabulary_), 'unique features'
print 'ten feature examples:', vec.vocabulary_.keys()[:10]
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
print 'avg accuracy with bigrams=%.3f' % np.average(cross_validation.cross_val_score(clf, X, y, cv=5, scoring='accuracy'))

there are 27322 unique features
ten feature examples: [u'16 33', u'my hobbies', u'loving faithful', u'member aft', u'twitter thing', u'whizzy_walexzy', u'rush daily', u'lom', u'se it', u'expect the']
avg accuracy with bigrams=0.815


In [72]:
# Print top feature weights for female
clf = LogisticRegression()
clf.fit(X, y)  # fit on all data
top_indices = clf.coef_[0].argsort()[::-1] # sort in decreasing order
# reverse the alphabet to map from idx->word
vocab_r = dict((idx, word) for word, idx in vec.vocabulary_.iteritems())
print 'female words:\n', '\n'.join(['%s=%.3f' % (vocab_r[idx], clf.coef_[0][idx]) for idx in top_indices[:20]])
top_indices = clf.coef_[0].argsort() # sort in increasing order
print '\n\nmale words:\n', '\n'.join(['%s=%.3f' % (vocab_r[idx], clf.coef_[0][idx]) for idx in top_indices[:20]])

female words:
mom=2.050
mother=1.767
christian=1.034
wife=0.976
grandmother=0.962
girl=0.959
lover=0.863
fighter=0.848
kind=0.736
loving=0.722
guns and=0.710
one=0.697
children=0.695
lover of=0.693
woman=0.671
loves=0.659
just don=0.643
communications=0.643
indian=0.642
ayeeee=0.642


male words:
father=-1.423
sports=-1.182
husband=-1.146
of the=-0.873
veteran=-0.869
love jesus=-0.806
radio=-0.726
dad=-0.704
man=-0.703
boy=-0.691
jesus wife=-0.687
consurvative love=-0.687
consurvative=-0.687
army=-0.644
editor=-0.641
guy=-0.639
liberty=-0.638
my wife=-0.614
your=-0.606
entrepreneur=-0.585


In [73]:
# Use PCA to reduce the dimensionality of X to only 2 dimensions,
# then compute cross-validation accuracy of resulting data X2.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X2 = pca.fit_transform(X.toarray())
print 'first document with reduced representation'
print X2[0]
dim1 = pca.components_[0]
print 'first PCA dimension (eigenvector):', dim1
top_indices = dim1.argsort()[::-1]
print 'top words of first dimension:\n', '\n'.join(['%s=%.3f' % (vocab_r[idx], dim1[idx]) for idx in top_indices[:20]])
dim2 = pca.components_[1]
print 'second PCA dimension (eigenvector):', dim2
top_indices = dim2.argsort()[::-1]
print 'top words of second dimension:\n', '\n'.join(['%s=%.3f' % (vocab_r[idx], dim2[idx]) for idx in top_indices[:20]])
print 'avg accuracy using only 2 dimensions=%.3f' % np.average(cross_validation.cross_val_score(clf, X2, y, cv=5, scoring='accuracy'))

first document with reduced representation
[ 3.31207804 -3.67943496]
first PCA dimension (eigenvector): [ 0.00066414  0.00066414 -0.00033876 ...,  0.00021401  0.00021401
  0.00021401]
top words of first dimension:
the=0.683
of=0.443
and=0.244
to=0.217
in=0.139
of the=0.109
is=0.082
to the=0.075
on=0.063
nra=0.062
right=0.061
conservative=0.060
the right=0.060
editorial=0.058
teaparty=0.057
daily=0.057
tcot teaparty=0.057
prolife tcot=0.056
right of=0.056
prolife=0.056
second PCA dimension (eigenvector): [-0.00038728 -0.00038728 -0.00027458 ...,  0.00030366  0.00030366
  0.00030366]
top words of second dimension:
and=0.729
in=0.166
my=0.147
love=0.090
is=0.074
country=0.065
for=0.063
it=0.056
am=0.055
wife=0.050
mother=0.043
at=0.041
that=0.039
politics=0.039
god=0.039
as=0.036
all=0.033
with=0.031
good=0.031
we=0.030
avg accuracy using only 2 dimensions=0.680
