import nltk
#nltk.download()
from nltk.classify.naivebayes import NaiveBayesClassifier
# Input data locations.
# NOTE(review): these are absolute, machine-specific paths; adjust them for
# the environment the script runs in.

# Training tweets that are read with the label 'positive', one tweet per line.
happy_file = "/Users/zhourb21/zhazha/project/dataset/happy"
# Training tweets that are read with the label 'negative', one tweet per line.
sad_file = "/Users/zhourb21/zhazha/project/dataset/sad"
# Tweets classified after training, one tweet per line.
test_set = "/Users/zhourb21/zhazha/project/dataset/sentimental_test_data"
def get_words_in_tweets(tweets):
    """Flatten the word lists of all tweets into one list.

    tweets is an iterable of (words, sentiment) pairs. The sentiment label
    is ignored; the words of every pair are concatenated in encounter order,
    duplicates included.
    """
    return [token for (tokens, _label) in tweets for token in tokens]
def get_word_features(wordlist):
    """Return the keys of an nltk.FreqDist built from wordlist.

    The frequency distribution is used only to collect the words that occur
    in wordlist; the counts themselves are discarded.
    """
    return nltk.FreqDist(wordlist).keys()
def read_tweets(fname, t_type):
    """Read a file of tweets, one per line, and label each line with t_type.

    Args:
        fname: path of the text file to read.
        t_type: label attached to every line (e.g. 'positive').

    Returns:
        A list of [line, t_type] pairs in file order. Lines are stored
        exactly as read, including their trailing newline.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
    """
    # The context manager closes the file even when reading raises, which a
    # separate open()/close() pair does not guarantee.
    with open(fname, 'r') as f:
        return [[line, t_type] for line in f]
def extract_features(document):
    """Build the feature dictionary for one tokenized document.

    For every word in the module-level word_features, the result maps the
    key 'contains(<word>)' to True when the word occurs in document and to
    False otherwise.
    """
    # A set makes each membership test independent of the document length.
    present = set(document)
    return {'contains(%s)' % term: (term in present) for term in word_features}
def classify_tweet(tweet):
    """Classify one raw tweet string with the trained module-level classifier.

    The tweet is tokenized with nltk.word_tokenize and every token is
    lower-cased before feature extraction. The training vocabulary is built
    from lower-cased words, so a token that still contained an upper-case
    letter could never match any feature.

    Args:
        tweet: the tweet text as a single string.

    Returns:
        The label produced by classifier.classify for the tweet's features.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(tweet)]
    return classifier.classify(extract_features(tokens))
# read in positive and negative training tweets
pos_tweets = read_tweets(happy_file, 'positive')
neg_tweets = read_tweets(sad_file, 'negative')

# Form the training data: split each tweet on whitespace, lower-case the
# tokens and drop tokens that are shorter than 3 characters.
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

# extract the word features out from the training data
# (word_features is read as a global by extract_features)
word_features = get_word_features(get_words_in_tweets(tweets))

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("prepare train_set...")
# get the training set and train the Naive Bayes Classifier
# (classifier is read as a global by classify_tweet)
training_set = nltk.classify.util.apply_features(extract_features, tweets)
print("build classifier...")
classifier = NaiveBayesClassifier.train(training_set)

# Classify every line of the test file and tally the predicted labels.
counter = 0
positive = 0
negative = 0
fail = 0
# Text mode, matching how the training files are read in read_tweets.
with open(test_set, "r") as f:
    for line in f:
        counter += 1
        try:
            rtn = classify_tweet(line)
        except Exception:
            # Best effort: a tweet that cannot be classified is counted and
            # skipped. KeyboardInterrupt/SystemExit are no longer swallowed.
            fail += 1
            continue
        if rtn == 'positive':
            positive += 1
        else:
            negative += 1

print("positive: %d, negative: %d, total: %d" % (positive, negative, counter))
# Surface failures instead of silently dropping them; total counts them too.
if fail:
    print("failed to classify: %d" % fail)