In [2]:
# Download Twitter data labeled by sentiment.

from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen

# The archive is 78M, so this will take a while.
url = urlopen('http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip')
try:
    # Read the whole archive into memory so ZipFile gets a seekable buffer.
    zipfile = ZipFile(StringIO(url.read()))
finally:
    # The bytes are already buffered; release the HTTP connection even if
    # the read or the zip parsing fails.
    url.close()
# We'll focus on the smaller file that was manually labeled.
# The larger file has 1.6M tweets "pseudo-labeled" using emoticons
tweet_file = zipfile.open('testdata.manual.2009.06.14.csv')

In [3]:
import csv
file_reader = csv.reader(tweet_file, delimiter=',', quotechar='"')
tweets = []
for row in file_reader:
    tweets.append({'label': int(row[0]),
                   'text': row[5]})
print 'read %d tweets' % len(tweets)

read 498 tweets


In [5]:
# Create label vector (y) and print its stats.
import numpy as np
from collections import Counter
y = np.array([t['label'] for t in tweets])
print 'label counts=', Counter(y)

label counts= Counter({4: 182, 0: 177, 2: 139})


In [6]:
# Create feature vectors (X)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(t['text'] for t in tweets)
print 'vectorized %d tweets. found %d terms.' % (X.shape[0], X.shape[1])

vectorized 498 tweets. found 2264 terms.


In [7]:
# Peek at ten (term, column index) pairs from the fitted vocabulary.
# The pairs come out in the mapping's own iteration order, not sorted by
# term or by index.
vocab_entries = vectorizer.vocabulary_.items()
vocab_entries[:10]

[(u'msgs', 1325),
 (u'whoopi', 2176),
 (u'sleep', 1804),
 (u'6pm', 67),
 (u'hate', 920),
 (u'whose', 2177),
 (u'boortz', 317),
 (u'davehitt', 557),
 (u'bike', 276),
 (u'under', 2072)]

In [15]:
#What are the most frequent terms?
# Sum columns:
col_sums = X.sum(axis=0).tolist()[0]
#print col_sums
top_indices = np.argsort(col_sums)[::-1]
print top_indices
vocab = np.array(vectorizer.get_feature_names())
top_terms = vocab[top_indices]
print vocab
print top_terms
print 'top_terms:\n', '\n'.join('%s %d' % (term, count) for term, count in zip(top_terms, top_indices)[:10])

[1961 1998  988 ..., 1363 1364    0]
[u'00' u'000' u'04fo' ..., u'zomg' u'zoom' u'zydrunas']
[u'the' u'to' u'http' ..., u'nerd' u'nerdy' u'00']
top_terms:
the 1961
to 1998
http 988
is 1060
and 152
at 209
it 1062
for 790
my 1337
of 1416


In [None]:
# What are the most frequent terms?

In [16]:
# Fit a LogisticRegression model
from sklearn.linear_model import LogisticRegression
# Build the classifier with its default settings and train it on the full
# term-count matrix. fit() hands back the fitted estimator, which is kept
# under the name `model` for the following cells.
classifier = LogisticRegression()
model = classifier.fit(X, y)
# Leave the fitted estimator as the cell's value so its settings are shown.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [17]:
# Compute accuracy
def accuracy(truth, predicted):
    """Return the fraction of positions where `predicted` equals `truth`.

    Parameters:
        truth: sequence of true labels; must support len().
        predicted: iterable of predicted labels, one per entry of `truth`.

    Returns:
        A float in [0, 1]: matching positions divided by len(truth).

    Raises:
        ValueError: if the two inputs have different lengths. Without this
            check zip() would silently drop the unmatched tail while the
            denominator stayed len(truth), yielding a wrong score.
        ZeroDivisionError: if `truth` is empty (accuracy is undefined).
    """
    # Materialize so the length can be checked even for one-shot iterables.
    predicted = list(predicted)
    n_total = len(truth)
    if len(predicted) != n_total:
        raise ValueError('truth and predicted must have the same length (%d != %d)'
                         % (n_total, len(predicted)))
    n_correct = sum(1 for tr, pr in zip(truth, predicted) if tr == pr)
    # float() keeps this a true division under Python 2 as well.
    return float(n_correct) / n_total

predicted = model.predict(X)
print 'accuracy on training data=%.3f' % accuracy(y, predicted)

accuracy on training data=0.996


In [20]:
# What are the top weighted features?

# Get the learned coefficients for the Positive class.
coef = model.coef_[0]
print coef
top_coef_ind = np.argsort(coef)[::-1]
# Get the names of those features.
top_coef_terms = vocab[top_coef_ind]
# Get the weights of those features
top_coef = coef[top_coef_ind]
# Get the names of those features.
top_coef_terms = vocab[top_coef_ind]
# Get the weights of those features
top_coef = coef[top_coef_ind]
# Print the top 10.
print 'top weighted terms for positive class:\n', \
    '\n'.join('%s %.2f' % (term, weight) for term, weight in zip(top_coef_terms, top_coef)[:10])

[-0.10560692 -0.04011087 -0.10644479 ..., -0.11934086 -0.01161168
 -0.11164278]
