In [26]:
import re
import nltk
import ssl

# NOTE(security): this swaps the default HTTPS context for an unverified one,
# i.e. TLS certificates are NOT checked for any later HTTPS request made in
# this kernel. It is the usual workaround for certificate errors raised by
# nltk.download(); prefer fixing the local certificate store where possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl module has no such hook, so leave the default context alone.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# The notebook also relies on the sentence tokenizer (nltk.sent_tokenize),
# the English stop-word list and WordNet (WordNetLemmatizer), so fetch them
# here too; otherwise a fresh environment fails with LookupError further down.
# NOTE(review): recent NLTK releases may expect differently named packages
# (e.g. 'punkt_tab') -- confirm against the installed version.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

nltk.download('averaged_perceptron_tagger') # download corpora

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chinmay/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [17]:
# Sample news text (a Reuters piece about Daniel Radcliffe) that the cells
# below tokenize, normalise and tag.
article = """LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. "I'll definitely have some sort of party," he said in an interview. "Hopefully none of you will be reading about it." Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say 'kid star goes off the rails,'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. 
The Londoner has filmed a TV movie called "My Boy Jack," about author Rudyard Kipling and his son, due for release later this year. He will also appear in "December Boys," an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's "Equus." Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: "I just think I'm going to be more sort of fair game," he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed."""
# Echo the raw text as the cell's output.
article

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [14]:
def getSentences(article):
    """Split `article` into sentences with NLTK's sentence tokenizer.

    Returns whatever nltk.sent_tokenize() produces for the given text.
    """
    return nltk.sent_tokenize(article)
# Sentence-split the sample article; the per-sentence processing cell below
# iterates over this result.
sentences = getSentences(article)

In [22]:
def convertShortForms(article):
    """Expand common English contractions into their full forms.

    Fixed spellings ("won't", "can't", "i'm", "ain't") are swapped literally.
    The generic suffixes 'll, 've, 're and 'd are expanded with regular
    expressions so that they work after any word.

    Matching is case-sensitive and the fixed spellings are lower-case, so the
    text should already be lower-cased (process() does this before calling).

    Args:
        article: text to expand.

    Returns:
        The text with every recognised contraction expanded.
    """
    literal_forms = (
        ("won't", "will not"),
        ("can't", "cannot"),
        ("i'm", "i am"),
        ("ain't", "is not"),
    )
    for short_form, full_form in literal_forms:
        article = article.replace(short_form, full_form)

    # str.replace() treats both arguments as plain text, so these rules must
    # go through re.sub() for the capture group and \g<1> to mean anything.
    # The trailing \b keeps the rule from firing inside a longer word
    # (e.g. the "'d" in "o'donnell").
    suffix_forms = (
        (r"(\w+)'ll\b", r"\g<1> will"),
        (r"(\w+)'ve\b", r"\g<1> have"),
        (r"(\w+)'re\b", r"\g<1> are"),
        (r"(\w+)'d\b", r"\g<1> would"),
    )
    for pattern, expansion in suffix_forms:
        article = re.sub(pattern, expansion, article)
    return article

def convertAbbreviations(article):
    """Spell out a handful of dotted abbreviations (U.S., U.K., etc., e.g., i.e.).

    Matching ignores case, because process() lower-cases the text before this
    function runs and a case-sensitive "U.S." rule would never fire there.
    When the matched abbreviation is entirely lower-case the expansion is
    lower-cased as well, so already-normalised text stays normalised.

    An abbreviation is only replaced when it stands on its own: it must not
    be glued to a preceding word character or dot, nor be followed by a word
    character. This keeps longer tokens such as "U.S.A." intact.

    Args:
        article: text to expand.

    Returns:
        The text with the known abbreviations written out.
    """
    expansions = (
        ("U.S.", "United States"),
        ("U.K.", "United Kingdom"),
        ("etc.", "and so on"),
        ("e.g.", "for example"),
        ("i.e.", "more precisely"),
    )
    for abbreviation, expansion in expansions:
        pattern = r'(?<![\w.])' + re.escape(abbreviation) + r'(?!\w)'
        article = re.sub(
            pattern,
            # Bind the current expansion as a default to avoid late binding.
            lambda match, full=expansion: full.lower() if match.group(0).islower() else full,
            article,
            flags=re.IGNORECASE,
        )
    return article

def convertNumbers(article):
    """Write "<number> million/billion/trillion" out as plain digits.

    Whole numbers behave as before ("20 million" -> "20000000"). Decimal
    quantities are now scaled correctly: "41.1 million" becomes "41100000",
    where the old pattern only saw the digits after the dot and produced
    "41.1000000". The scale word must end at a word boundary, so text such as
    "20 millionaire" is left untouched.

    Args:
        article: text to rewrite (scale words are matched case-sensitively,
            in lower case, separated from the number by a single space).

    Returns:
        The text with the recognised quantities expanded.
    """
    def shift(whole, fraction, zeros):
        # Move the decimal point `zeros` places to the right using string
        # arithmetic only, so no float rounding can creep in.
        if fraction is None:
            return whole + '0' * zeros
        if len(fraction) <= zeros:
            digits = whole + fraction.ljust(zeros, '0')
            return digits.lstrip('0') or '0'
        # More decimals than the scale absorbs: keep the remainder after a dot.
        digits = (whole + fraction[:zeros]).lstrip('0') or '0'
        return digits + '.' + fraction[zeros:]

    # Same order of passes as before: million, then billion, then trillion.
    for scale, zeros in (('million', 6), ('billion', 9), ('trillion', 12)):
        article = re.sub(
            r'(\d+)(?:\.(\d+))? ' + scale + r'\b',
            lambda match, z=zeros: shift(match.group(1), match.group(2), z),
            article,
        )
    return article

def convertDates(article):
    """Rewrite M/D/Y and M-D-Y dates as Y-M-D.

    Both separators are handled in a single pass. Previously the slash form
    was converted first and its dash-separated output was then matched again
    by the dash rule, turning "7/23/2007" into "23-2007-7".

    Args:
        article: text to rewrite.

    Returns:
        The text with each "a/b/c" or "a-b-c" digit triple rewritten as "c-a-b".
    """
    # \2 forces the second separator to equal the first, so mixed forms such
    # as "7/23-2007" are left alone, exactly as before.
    return re.sub(r'(\d+)([/-])(\d+)\2(\d+)', r'\g<4>-\g<1>-\g<3>', article)

def convertMoney(article):
    """Turn "$<amount>" into "<amount> dollars".

    The amount may carry thousands separators and a decimal part, so
    "$1,000" and "$41.10" are moved as a whole. The old pattern stopped at
    the first non-digit and produced "1 dollars,000" / "41 dollars.10".

    Args:
        article: text to rewrite.

    Returns:
        The text with each dollar amount rewritten.
    """
    # A trailing "." or "," is only consumed when digits follow it, so
    # sentence punctuation right after an amount is preserved.
    return re.sub(r'\$(\d+(?:,\d{3})*(?:\.\d+)?)', r'\g<1> dollars', article)

def removeStopWords(article):
    """Drop English stop words from whitespace-separated text.

    Tokens come from str.split() and are compared verbatim (case-sensitively)
    against NLTK's English stop-word list; survivors are re-joined with
    single spaces.
    """
    tokens = article.split()
    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    kept = filter(lambda token: token not in english_stop_words, tokens)
    return ' '.join(kept)

def stemWords(article):
    """Apply NLTK's Porter stemmer to every whitespace-separated token.

    The stemmed tokens are returned joined by single spaces.
    """
    porter = nltk.stem.PorterStemmer()
    return ' '.join(porter.stem(token) for token in article.split())

def lemmatizeWords(article):
    """Lemmatize every whitespace-separated token with NLTK's WordNet lemmatizer.

    Each token is passed to lemmatize() with its default arguments; the
    results are returned joined by single spaces.
    """
    wordnet = nltk.stem.WordNetLemmatizer()
    return ' '.join(wordnet.lemmatize(token) for token in article.split())

def removePunctuation(article):
    """Delete every character that is neither a word character nor whitespace.

    Letters, digits and underscores survive (they match \\w), as does all
    whitespace; everything else is removed without a replacement.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', article)

def process(article):
    """Normalise raw text for tagging and counting.

    The text is lower-cased and then passed through each cleaning step in
    turn: contractions, abbreviations, numbers, dates, money, punctuation
    removal, stop-word removal and finally lemmatization.

    Returns the fully processed text as a single string.
    """
    steps = (
        convertShortForms,
        convertAbbreviations,
        convertNumbers,
        convertDates,
        convertMoney,
        removePunctuation,
        removeStopWords,
        lemmatizeWords,
    )
    text = article.lower()
    for step in steps:
        text = step(text)
    return text


In [None]:
# Pair every original sentence with its normalised form.
processedSentences = [(sentence, process(sentence)) for sentence in sentences]
    

In [24]:
# Normalise the whole article in one go; the tagging cell below splits this
# string into tokens.
processedArticle = process(article)

In [33]:
# Part-of-speech tag the processed tokens: a sequence of (token, tag) pairs.
taggedWords = nltk.pos_tag(processedArticle.split())

# Group the tokens by tag, keeping duplicates and first-seen order.
wordCategories = {}
for token, tag in taggedWords:
    wordCategories.setdefault(tag, []).append(token)

- CC coordinating conjunction
- CD cardinal number
- DT determiner
- EX existential there (like: “there is” … think of it like “there exists”)
- FW foreign word
- IN preposition/subordinating conjunction
- JJ adjective ‘big’
- JJR adjective, comparative ‘bigger’
- JJS adjective, superlative ‘biggest’
- LS list marker 1)
- MD modal could, will
- NN noun, singular ‘desk’
- NNS noun plural ‘desks’
- NNP proper noun, singular ‘Harrison’
- NNPS proper noun, plural ‘Americans’
- PDT predeterminer ‘all the kids’
- POS possessive ending parent’s
- PRP personal pronoun I, he, she
- PRP$ possessive pronoun my, his, hers
- RB adverb very, silently
- RBR adverb, comparative better
- RBS adverb, superlative best
- RP particle give up
- TO to, as in go ‘to’ the store
- UH interjection, errrrrrrrm
- VB verb, base form take
- VBD verb, past tense took
- VBG verb, gerund/present participle taking
- VBN verb, past participle taken
- VBP verb, sing. present, non-3rd person take
- VBZ verb, 3rd person sing. present takes
- WDT wh-determiner which
- WP wh-pronoun who, what
- WP$ possessive wh-pronoun whose
- WRB wh-adverb where, when

In [42]:
def getCount(words):
    """Count how often each item occurs in `words`.

    Returns a dict mapping each distinct item to its number of occurrences,
    with keys in order of first appearance.
    """
    tally = {}
    for item in words:
        tally[item] = tally.get(item, 0) + 1
    return tally

# Rank the singular nouns ('NN') by frequency, as (count, word) pairs in
# descending order; ties fall back to reverse alphabetical order of the word.
nounCounts = sorted(
    ((occurrences, noun) for noun, occurrences in getCount(wordCategories['NN']).items()),
    reverse=True,
)
nounCounts[:8]

[(6, 'potter'),
 (4, 'film'),
 (4, 'boy'),
 (3, 'radcliffe'),
 (3, 'plan'),
 (2, 'year'),
 (2, 'thing'),
 (2, 'star')]