In [1]:
# Locations of the tweet dump and of the file holding the class labels.
import os

twitter_dir = os.path.join(os.path.expanduser("~"), "Data", "twitter")
input_filename = os.path.join(twitter_dir, "python_tweets.json")
classes_filename = os.path.join(twitter_dir, "python_classes.json")

In [2]:
import json

# Read the tweet dump: one JSON document per line, keeping only the 'text'
# field of each tweet.
tweets = []
# JSON text is UTF-8 by specification; naming the encoding keeps the read
# independent of the platform's default locale encoding.
with open(input_filename, encoding="utf-8") as inf:
    for line in inf:
        if not line.strip():
            # Blank lines carry no tweet.
            continue
        tweets.append(json.loads(line)['text'])
print("Loaded {} tweets".format(len(tweets)))

Loaded 100 tweets


In [3]:
# Load the class labels; the whole file is a single JSON document.
# The encoding is named explicitly so the read does not depend on the
# platform's default locale encoding.
with open(classes_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

In [4]:
# Number of samples usable for training: the shorter of the two sequences.
n_samples = min(map(len, (tweets, labels)))

In [5]:
# Cut both sequences to the common length so they line up index by index;
# the tweet text is lower-cased on the way.
labels = labels[:n_samples]
sample_tweets = [text.lower() for text in tweets[:n_samples]]

In [6]:
import numpy as np
# Class labels as a numpy array, used as the target vector below.
y_true = np.asarray(labels)

In [7]:
# Share of samples whose label equals 1, expressed as a percentage.
pct_class_1 = np.mean(y_true == 1) * 100
print("{:.1f}% have class 1".format(pct_class_1))

55.0% have class 1


In [8]:
from sklearn.base import TransformerMixin
from nltk import word_tokenize

class NLTKBOW(TransformerMixin):
    """Stateless transformer that maps each document to a token-presence dict.

    Each document is split with NLTK's ``word_tokenize``; every token that
    occurs becomes a key mapped to ``True``.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; ``X`` and ``y`` are ignored."""
        return self

    def transform(self, X):
        """Return a list with one ``{token: True}`` dict per document in ``X``."""
        bags = []
        for document in X:
            bags.append(dict.fromkeys(word_tokenize(document), True))
        return bags

In [9]:
from sklearn.feature_extraction import DictVectorizer

In [10]:
from sklearn.naive_bayes import BernoulliNB

In [11]:
try:
    # scikit-learn >= 0.18; the old cross_validation module was removed in 0.20.
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Chain the three stages: token dicts -> vectorised features -> classifier.
pipeline = Pipeline(steps=[
    ('bag-of-words', NLTKBOW()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
])
# F1 score for each of the 10 cross-validation folds; the mean is reported.
scores = cross_val_score(pipeline, sample_tweets, y_true, scoring='f1', cv=10)
print("Score: {:.3f}".format(scores.mean()))

Score: 0.576


In [15]:
# Show the individual per-fold F1 scores returned by cross_val_score.
scores

array([ 0.70588235,  0.76923077,  0.46153846,  0.66666667,  0.5       ,
        0.76923077,  0.54545455,  0.4       ,  0.4       ,  0.54545455])