In [1]:
import nltk
#nltk.download()
from nltk.classify.naivebayes import NaiveBayesClassifier


happy_file = "/Users/zhourb21/zhazha/project/dataset/happy"
sad_file = "/Users/zhourb21/zhazha/project/dataset/sad"

test_set = "/Users/zhourb21/zhazha/project/dataset/sentimental_test_data"


def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words


def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


def read_tweets(fname, t_type):
    tweets = []
    f = open(fname, 'r')
    line = f.readline()
    while line != '':
        tweets.append([line, t_type])
        line = f.readline()
    f.close()
    return tweets


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features


def classify_tweet(tweet):
    #return classifier.classify(extract_features(tweet))
    return classifier.classify(extract_features(nltk.word_tokenize(tweet)))


# read in postive and negative training tweets
pos_tweets = read_tweets(happy_file, 'positive')
neg_tweets = read_tweets(sad_file, 'negative')

# filter away words that are less than 3 letters to form the training data
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))


# extract the word features out from the training data
word_features = get_word_features(get_words_in_tweets(tweets))


print "prepare train_set..."
# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, tweets)
print "build classifier..."
classifier = NaiveBayesClassifier.train(training_set)


with open(test_set, "rb") as f:
    counter = 0
    positive = 0
    negative = 0
    fail = 0
    for line in f.readlines():
        counter += 1
        try:
            rtn = classify_tweet(line)
            if rtn == 'positive':
                positive += 1
            else:
                negative += 1
        except:
            fail += 1
    print "positive: %d, negative: %d, total: %d" %(positive, negative, counter)
prepare train_set...
build classifier...
positive: 552973, negative: 1502300, total: 2067030
In [ ]: