# Tweets Sentiment Analysis with NLTK

In [1]:
import nltk

In [2]:
# positive tweets: every text below is paired with the 'positive' label
pos_tweets = [
    (text, 'positive')
    for text in (
        'I love this car',
        'This view is amazing',
        'I feel great this morning',
        'I am so excited about the concert',
        'He is my best friend',
    )
]

In [3]:
# negative tweets: every text below is paired with the 'negative' label
neg_tweets = [
    (text, 'negative')
    for text in (
        'I do not like this car',
        'This view is horrible',
        'I feel tired this morning',
        'I am not looking forward to the concert',
        'He is my enemy',
    )
]

In [4]:
# Tokenise every labelled tweet: split on whitespace, keep only words of at
# least 3 characters, and lower-case the survivors. The label is carried along.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for text, label in pos_tweets + neg_tweets
]

In [5]:
# Display the tokenised (word list, sentiment) pairs built in the previous cell.
tweets

[(['love', 'this', 'car'], 'positive'),
 (['this', 'view', 'amazing'], 'positive'),
 (['feel', 'great', 'this', 'morning'], 'positive'),
 (['excited', 'about', 'the', 'concert'], 'positive'),
 (['best', 'friend'], 'positive'),
 (['not', 'like', 'this', 'car'], 'negative'),
 (['this', 'view', 'horrible'], 'negative'),
 (['feel', 'tired', 'this', 'morning'], 'negative'),
 (['not', 'looking', 'forward', 'the', 'concert'], 'negative'),
 (['enemy'], 'negative')]

In [6]:
# Test tweets, already tokenised: each entry is (word list, expected label).
test_tweets = [
    (text.split(), label)
    for text, label in (
        ('feel happy this morning', 'positive'),
        ('larry friend', 'positive'),
        ('not like that man', 'negative'),
        ('house not great', 'negative'),
        ('your song annoying', 'negative'),
    )
]

## Classifier

To create a classifier, we need to decide which features are relevant. To do that, we first need a feature extractor. The one we are going to use returns a dictionary indicating which words are contained in the input it is given. Here, the input is a tweet. We use the word features list (`word_features`, built in the cells below) along with the input to create the dictionary.

<img src='https://www.laurentluce.com/images/blog/nltk/overview.png'>

In [7]:
def get_word_features(wordlist):
    """Return the distinct words found in ``wordlist``.

    The words are counted with ``nltk.FreqDist`` and the keys of that
    distribution (one entry per distinct word) are returned.
    """
    return nltk.FreqDist(wordlist).keys()

def get_words_in_tweets(tweets):
    """Flatten labelled tweets into a single list of words.

    ``tweets`` is an iterable of ``(words, sentiment)`` pairs; the words of
    every pair are concatenated in order and the sentiment is ignored.
    """
    return [word for tweet_words, _sentiment in tweets for word in tweet_words]

In [8]:
# Vocabulary for the classifier: the distinct words across all training tweets.
# extract_features() reads this notebook-level name, so it must be defined
# before any features are extracted.
word_features = get_word_features(get_words_in_tweets(tweets))
print(word_features)

dict_keys(['love', 'this', 'car', 'view', 'amazing', 'feel', 'great', 'morning', 'excited', 'about', 'the', 'concert', 'best', 'friend', 'not', 'like', 'horrible', 'tired', 'looking', 'forward', 'enemy'])


In [9]:
def extract_features(document, vocabulary=None):
    """Encode a tokenised tweet as boolean word-presence features.

    Args:
        document: iterable of word tokens for one tweet.
        vocabulary: iterable of words to test for. When omitted, the
            notebook-level ``word_features`` is used, which is the original
            single-argument behaviour.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True``/``False`` for every
        word in the vocabulary.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built from the training tweets.
        vocabulary = word_features
    document_words = set(document)  # set membership is cheap to test repeatedly
    return {
        'contains(%s)' % word: (word in document_words)
        for word in vocabulary
    }

In [10]:
# Apply extract_features to the word list of every labelled tweet, keeping
# each tweet's sentiment label alongside its feature dict.
training_set = nltk.classify.apply_features(extract_features, tweets)
print(training_set)

[({'contains(love)': True, 'contains(this)': True, 'contains(car)': True, 'contains(view)': False, 'contains(amazing)': False, 'contains(feel)': False, 'contains(great)': False, 'contains(morning)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(not)': False, 'contains(like)': False, 'contains(horrible)': False, 'contains(tired)': False, 'contains(looking)': False, 'contains(forward)': False, 'contains(enemy)': False}, 'positive'), ({'contains(love)': False, 'contains(this)': True, 'contains(car)': False, 'contains(view)': True, 'contains(amazing)': True, 'contains(feel)': False, 'contains(great)': False, 'contains(morning)': False, 'contains(excited)': False, 'contains(about)': False, 'contains(the)': False, 'contains(concert)': False, 'contains(best)': False, 'contains(friend)': False, 'contains(not)': False, 'contains(like)': False, 'contains(horrible)': Fa

In [11]:
# NaiveBayesClassifier: train on the (feature dict, label) pairs in training_set.
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [12]:
# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None".
classifier.show_most_informative_features(32)

Most Informative Features
           contains(not) = False          positi : negati =      1.6 : 1.0
       contains(forward) = False          positi : negati =      1.2 : 1.0
         contains(enemy) = False          positi : negati =      1.2 : 1.0
          contains(like) = False          positi : negati =      1.2 : 1.0
         contains(great) = False          negati : positi =      1.2 : 1.0
         contains(tired) = False          positi : negati =      1.2 : 1.0
       contains(looking) = False          positi : negati =      1.2 : 1.0
        contains(friend) = False          negati : positi =      1.2 : 1.0
          contains(love) = False          negati : positi =      1.2 : 1.0
         contains(about) = False          negati : positi =      1.2 : 1.0
          contains(best) = False          negati : positi =      1.2 : 1.0
      contains(horrible) = False          positi : negati =      1.2 : 1.0
       contains(amazing) = False          negati : positi =      1.2 : 1.0

## Predict

In [13]:
tweet1 = 'Larry is my friend'
# Tokenise exactly like the training tweets (lower-case, drop words shorter
# than 3 characters) so tokens can match the 'contains(...)' features.
tweet1_tokens = [word.lower() for word in tweet1.split() if len(word) >= 3]
print(classifier.classify(extract_features(tweet1_tokens)))

positive


In [14]:
tweet2 = 'This house is not great'
# Tokenise exactly like the training tweets (lower-case, drop words shorter
# than 3 characters); otherwise 'This' would never match 'contains(this)'.
tweet2_tokens = [word.lower() for word in tweet2.split() if len(word) >= 3]
print(classifier.classify(extract_features(tweet2_tokens)))

negative


In [15]:
tweet3 = 'I am feeling awful'
# Tokenise exactly like the training tweets (lower-case, drop words shorter
# than 3 characters) so tokens can match the 'contains(...)' features.
tweet3_tokens = [word.lower() for word in tweet3.split() if len(word) >= 3]
print(classifier.classify(extract_features(tweet3_tokens)))

positive


Take the test tweet ‘I am feeling awful’: the classifier labels it positive. The reason is that the training data contains no information about the word ‘awful’, so there is no `contains(awful)` feature for the classifier to use. We need a larger set of training tweets in order to improve the classifier.

## Reference:

- [Twitter sentiment analysis using Python and NLTK](https://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/)