> Notes: [How To Work with Language Data in Python 3 using the Natural Language Toolkit (NLTK)](https://www.digitalocean.com/community/tutorials/how-to-work-with-language-data-in-python-3-using-the-natural-language-toolkit-nltk)

In [9]:
import nltk

In [10]:
# Report the version string of the imported nltk package.
print('nltk version: {}'.format(nltk.__version__))

nltk version: 3.2.2


In [11]:
# nltk twitter corpus contains 20K tweets from twitter streaming api.
# Tweets are stored in JSON format.
from nltk.corpus import twitter_samples

In [12]:
# List the JSON files that make up the corpus.
twitter_samples.fileids()

# If this fails with "Resource 'corpora/twitter_samples.zip/twitter_samples/' not found",
# download the corpus from the command line: $ python -m nltk.downloader twitter_samples

['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json']

In [13]:
# Objective: count the nouns (which help determine the topics being discussed)
# and the adjectives (which help determine the type of language being used,
# e.g. opinions tend to include more adjectives than facts) that appear in the
# positive subset of the Twitter corpus.
# Load the positive tweets as strings and preview the first ten.
pos_tweets = twitter_samples.strings('positive_tweets.json')
print(pos_tweets[:10])

['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)', '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!', '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!', '@97sides CONGRATS :)', 'yeaaaah yippppy!!!  my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days', '@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM', "We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI", '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. Sheep must be buying.', 'Jgh , but we have to go to Bayan :D bye', 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing app Katam

In [14]:
# Tokenization: breaking up a sequence of strings into pieces such as words,
# keywords, phrases, symbols and other elements, which are called tokens.
# Each tweet becomes its own list of tokens; preview the first ten tweets.
pos_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
print(pos_tweets_tokens[:10])

[['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)'], ['@Lamb2ja', 'Hey', 'James', '!', 'How', 'odd', ':/', 'Please', 'call', 'our', 'Contact', 'Centre', 'on', '02392441234', 'and', 'we', 'will', 'be', 'able', 'to', 'assist', 'you', ':)', 'Many', 'thanks', '!'], ['@DespiteOfficial', 'we', 'had', 'a', 'listen', 'last', 'night', ':)', 'As', 'You', 'Bleed', 'is', 'an', 'amazing', 'track', '.', 'When', 'are', 'you', 'in', 'Scotland', '?', '!'], ['@97sides', 'CONGRATS', ':)'], ['yeaaaah', 'yippppy', '!', '!', '!', 'my', 'accnt', 'verified', 'rqst', 'has', 'succeed', 'got', 'a', 'blue', 'tick', 'mark', 'on', 'my', 'fb', 'profile', ':)', 'in', '15', 'days'], ['@BhaktisBanter', '@PallaviRuhail', 'This', 'one', 'is', 'irresistible', ':)', '#FlipkartFashionFriday', 'http://t.co/EbZ0L2VENM'], ['We', "don't", 'like', 'to', 'keep', 'our', 'lovely', 'customers', 'waiting', 'for', 'long', '!', 'W

In [15]:
# POS (part-of-speech) tagging is the process of labelling a word in a text as 
# corresponding to a particular POS tag: nouns, verbs, adjectives, adverbs, etc.
# NLTK's averaged_perceptron_tagger uses the perceptron algorithm to predict which 
# POS tag is most likely given the word
from nltk.tag import pos_tag_sents

In [16]:
# Tag every tokenized tweet; each tweet becomes a list of (token, tag) pairs.
# Preview the first ten tagged tweets.
pos_tweets_tokens_tagged = pos_tag_sents(pos_tweets_tokens)
print(pos_tweets_tokens_tagged[:10])

[[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')], [('@Lamb2ja', 'NN'), ('Hey', 'NNP'), ('James', 'NNP'), ('!', '.'), ('How', 'NNP'), ('odd', 'JJ'), (':/', 'NNP'), ('Please', 'NNP'), ('call', 'VB'), ('our', 'PRP$'), ('Contact', 'NNP'), ('Centre', 'NNP'), ('on', 'IN'), ('02392441234', 'CD'), ('and', 'CC'), ('we', 'PRP'), ('will', 'MD'), ('be', 'VB'), ('able', 'JJ'), ('to', 'TO'), ('assist', 'VB'), ('you', 'PRP'), (':)', 'VBP'), ('Many', 'JJ'), ('thanks', 'NNS'), ('!', '.')], [('@DespiteOfficial', 'JJ'), ('we', 'PRP'), ('had', 'VBD'), ('a', 'DT'), ('listen', 'VBN'), ('last', 'JJ'), ('night', 'NN'), (':)', 'NN'), ('As', 'IN'), ('You', 'PRP'), ('Bleed', 'VBP'), ('is', 'VBZ'), ('an', 'DT'), ('amazing', 'JJ'), ('track', 'NN'), ('.', '.'), ('When', 'WRB'), ('a

In [17]:
# Count how often selected POS tags occur across all tagged positive tweets.
# Tags reported: 'JJ' = adjectives, 'NN' = singular nouns, 'NNS' = plural nouns.
# Matching is exact, so any other tag (for example 'NNP', which also appears
# in the tagged output) is not included in these three totals.
from collections import Counter

# Tally every tag in a single pass. A Counter returns 0 for a tag that never
# occurs, so the three totals are always defined, even for an empty corpus.
tag_counts = Counter(
    pair[1] for tweet in pos_tweets_tokens_tagged for pair in tweet
)
count_JJ = tag_counts['JJ']
count_NN = tag_counts['NN']
count_NNS = tag_counts['NNS']

print('adjectives: {}'.format(count_JJ))
print('singular nouns: {}'.format(count_NN))
print('plural nouns: {}'.format(count_NNS))

adjectives: 6094
singular nouns: 13180
plural nouns: 2429


In [18]:
# todo: extend this script to count positive adjectives (great, awesome, happy, etc.) 
# versus negative adjectives (boring, lame, sad, etc.), which could be used to analyze 
# the sentiment of tweets or reviews about a product or movie, for example. This script 
# provides data that can in turn inform decisions related to that product or movie.