## Task 1: Classifying Documents

#### Using Tokenization (and basic bag-of-words features)

Here is the code we went over at the start, to get started classifying documents by sentiment.

In [None]:
import re
import random
import nltk
from nltk.corpus import movie_reviews

# Read in a list of document (wordlist, category) tuples, and shuffle.
# Only the first 100 files of each category are loaded.
docs_tuples = [(movie_reviews.words(fileid), category)
               for category in movie_reviews.categories()
               for fileid in movie_reviews.fileids(category)[:100]]
random.shuffle(docs_tuples)

# Build the feature vocabulary: the 1000 most frequent lowercased tokens in the corpus
movie_words = [word.lower() for (wordlist, cat) in docs_tuples for word in wordlist]
all_wordfreqs = nltk.FreqDist(movie_words)
# Ask for the top 1000 directly instead of ranking every word and slicing afterwards
top_wordfreqs = all_wordfreqs.most_common(1000)
feature_words = [word for (word, count) in top_wordfreqs]

# Define a function to extract features of the form contains(word) for each document
def document_features(doc_toks):
    """Return bag-of-words presence features for one document.

    doc_toks is an iterable of token strings. The result maps
    'contains(<word>)' to 1 or 0 for every word in the module-level
    feature_words list, in that list's order.
    """
    # feature_words holds lowercased tokens, so normalise the document the same way;
    # the set gives constant-time membership tests for the loop below
    document_words = {tok.lower() for tok in doc_toks}
    return {
        'contains({})'.format(word): 1 if word in document_words else 0
        for word in feature_words
    }

# Create feature sets of document (features, category) tuples
featuresets = [(document_features(wordlist), cat) for (wordlist, cat) in docs_tuples]

# Separate train and test sets, train the classifier, print accuracy and best features
train_set, test_set = featuresets[:100], featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its own table and returns None,
# so wrapping it in print() would add a stray "None" line to the output
classifier.show_most_informative_features(10)

#### Using POS Tagging

We left the first part of the code the same as above, but created a new list of most common adjectives as our feature words:

In [None]:
# Create a list of the most frequent adjectives in the entire corpus
from nltk import FreqDist

# Tag the whole (lowercased) token stream, then keep tokens whose tag starts with 'JJ'
movie_tokstags = nltk.pos_tag(movie_words)
movie_adjs = [tok for (tok, tag) in movie_tokstags if tag.startswith('JJ')]
all_adjfreqs = FreqDist(movie_adjs)
# Ask for the top 1000 directly instead of ranking every adjective and slicing afterwards
top_adjfreqs = all_adjfreqs.most_common(1000)
# Rebinds feature_words, so document_features() now looks for adjectives only
feature_words = [adj for (adj, count) in top_adjfreqs]

Then we left the document_features() function and remaining code the same:

In [None]:
# Define a function to extract features of the form contains(word) for each document
def document_features(doc_toks):
    """Return presence features for one document over the current feature_words.

    doc_toks is an iterable of token strings. The result maps
    'contains(<word>)' to 1 or 0 for every word in the module-level
    feature_words list, in that list's order.
    """
    # feature_words was built from lowercased tokens, so normalise the document
    # the same way; the set gives constant-time membership tests below
    document_words = {tok.lower() for tok in doc_toks}
    return {
        'contains({})'.format(word): 1 if word in document_words else 0
        for word in feature_words
    }

# Create feature sets of document (features, category) tuples
featuresets = [(document_features(wordlist), cat) for (wordlist, cat) in docs_tuples]

# Separate train and test sets, train the classifier, print accuracy and best features
train_set, test_set = featuresets[:100], featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its own table and returns None,
# so wrapping it in print() would add a stray "None" line to the output
classifier.show_most_informative_features(10)

#### Using Phrase Chunking

Now we created a new list of most common noun phrases, and also modified the document_features() function to chunk each sentence and check for the common noun phrases:

In [None]:
# Create a list of the most frequent noun phrases in the entire corpus
from nltk import RegexpParser

# Chunk pattern: one JJ-tagged token immediately followed by one token whose tag starts with NN
grammar = "NP: {<JJ><NN.*>}"
cp = RegexpParser(grammar)

def extract_nps(wordlist):
    """Return the noun phrases found in wordlist as space-joined strings.

    wordlist is a sequence of tokens. It is POS-tagged, chunked with the
    module-level parser cp, and every top-level chunk labelled 'NP' is
    returned in order of appearance (duplicates are kept).
    """
    wordlist_chunked = cp.parse(nltk.pos_tag(wordlist))
    # Top-level children are either plain (token, tag) tuples or chunk subtrees;
    # only the subtrees labelled NP are wanted
    return [
        ' '.join(tok for (tok, tag) in node.leaves())
        for node in wordlist_chunked
        if isinstance(node, nltk.tree.Tree) and node.label() == 'NP'
    ]

# Replace each document's word list with the list of noun phrases extracted from it
docs_tuples_nps = [(extract_nps(wordlist), cat) for (wordlist, cat) in docs_tuples]

# Build the feature vocabulary: the 1000 most frequent noun phrases over all documents
movie_nps = [phrase for (nplist, cat) in docs_tuples_nps for phrase in nplist]
all_npfreqs = FreqDist(movie_nps)
# Ask for the top 1000 directly instead of ranking every phrase and slicing afterwards
top_npfreqs = all_npfreqs.most_common(1000)
feature_nps = [phrase for (phrase, count) in top_npfreqs]

In [None]:
# Define a function to extract features of the form contains(np) for each document
def document_features(doc_nps):
    """Return noun-phrase presence features for one document.

    doc_nps is an iterable of noun-phrase strings. The result maps
    'contains(<phrase>)' to 1 or 0 for every phrase in the module-level
    feature_nps list, in that list's order.
    """
    # Build the set once so each of the feature lookups is a constant-time test
    # rather than a scan of the document's phrase list
    doc_np_set = set(doc_nps)
    return {
        'contains({})'.format(phrase): 1 if phrase in doc_np_set else 0
        for phrase in feature_nps
    }

# Create feature sets of document (features, category) tuples
featuresets = [(document_features(nplist), cat) for (nplist, cat) in docs_tuples_nps]

We left the last part of the code the same:

In [None]:
# Separate train and test sets, train the classifier, print accuracy and best features
train_set, test_set = featuresets[:100], featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its own table and returns None,
# so wrapping it in print() would add a stray "None" line to the output
classifier.show_most_informative_features(10)

This actually doesn't do that well. One reason is that we're using more complex sequences of words as our noun phrase features, each of which is going to appear far less frequently across documents. We might need to increase the number of noun phrases we use, or limit the pattern we're looking for to a single adjective followed by a single noun (leaving out articles, etc). But it might also be the case that adjectives are really the best features to use for sentiment classification in the domain of movie reviews, and the version of this task we did using POS tags was the right way to go.

## Task 2: Information Extraction

#### Using Tokenization (and basic keyword search)

Here is the code we went over at the start, to initially extract election-related sentences.

In [None]:
from nltk.corpus import brown

# Read in all news docs as a list of sentences, each sentence a list of tokens
news_docs = [brown.sents(fileid) for fileid in brown.fileids(categories='news')]

# Create regular expression to search for election-related words
elect_regexp = 'elect|vote'

# Keep each sentence in which at least one token begins with an election-related
# word (re.match anchors at the start of the token); any() stops at the first hit,
# so a sentence is added only once
elect_sents = [
    sent
    for doc in news_docs
    for sent in doc
    if any(re.match(elect_regexp, tok) for tok in sent)
]

len(elect_sents)

#### Using POS Tagging

We used the election-related sentences we identified in the first step (so we don't waste time tagging irrelevant text). Then we looped through each sentence, ran the POS tagger, and extracted all the nouns.

In [None]:
# Collect every token tagged with an N-initial tag from the election-related sentences;
# each sentence is POS-tagged on its own
elect_nouns = [
    tok
    for sent in elect_sents
    for (tok, tag) in nltk.pos_tag(sent)
    if re.match('N', tag)
]

print(len(elect_nouns))
print(elect_nouns[:50])

We can add a check to see if the sentence has a token matching the election regexp that's tagged as a verb, once we've POS-tagged the sentence, and only add the sentence's nouns if the sentence passes this more specific test.

In [None]:
# Extract nouns if the sentence contains an election-related verb
elect_nouns = []
for sent in elect_sents:
    sent_tagged = nltk.pos_tag(sent)
    # First pass: does any token both carry a V-initial tag and match the election regexp?
    contains_elect_verb = any(
        re.match('V', tag) and re.match(elect_regexp, tok)
        for (tok, tag) in sent_tagged
    )
    if not contains_elect_verb:
        continue
    # Second pass: the sentence qualifies, so keep all of its N-tagged tokens
    elect_nouns += [tok for (tok, tag) in sent_tagged if re.match('N', tag)]

print(len(elect_nouns))
print(elect_nouns[:50])

#### Using Phrase Chunking and NER Tagging

Next we used the NLTK NER tagger (which chunks a sentence into named entity noun phrases, labeled by entity category), to extract named entities for either people or organizations mentioned in election-related sentences.

In [None]:
# Named entities found in election-related sentences, grouped by entity label.
# Only the labels present as keys here are collected.
elect_entities = {'ORGANIZATION':[], 'PERSON':[]}
for sent in elect_sents:
    sent_nes = nltk.ne_chunk(nltk.pos_tag(sent))
    for node in sent_nes:
        # Top-level children are either plain (token, tag) tuples or labelled subtrees;
        # join the tokens only for subtrees whose label we are collecting
        if isinstance(node, nltk.tree.Tree) and node.label() in elect_entities:
            phrase = ' '.join(tok for (tok, tag) in node.leaves())
            elect_entities[node.label()].append(phrase)

for key, value in elect_entities.items():
    print(key, value, '\n')

We also extracted noun phrases if they appeared right before or after an election-related word.

In [None]:
# Chunk pattern: optional determiner, any number of adjectives, one or more nouns.
# NOTE: this rebinds the module-level cp, which extract_nps() also reads.
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
cp = RegexpParser(grammar)

nps_before_elect = []
nps_after_elect = []

for sent in elect_sents:
    sent_tagged = nltk.pos_tag(sent)
    sent_chunked = cp.parse(sent_tagged)
    last = len(sent_chunked) - 1
    for n, node in enumerate(sent_chunked):
        # A top-level child is either an NP subtree or a single (token, tag) tuple
        if isinstance(node, nltk.tree.Tree):
            node_toks = [tok for (tok, tag) in node.leaves()]
        else:
            node_toks = [node[0]]
        # Test every token of the node: matching the joined phrase would only look at
        # its first word, so a chunk such as "the election" would be skipped
        if not any(re.match(elect_regexp, tok) for tok in node_toks):
            continue
        # Record the neighbouring nodes only when they are NP chunks
        if n > 0 and isinstance(sent_chunked[n-1], nltk.tree.Tree):
            prev_toks = [tok for (tok, tag) in sent_chunked[n-1].leaves()]
            nps_before_elect.append(' '.join(prev_toks))
        if n < last and isinstance(sent_chunked[n+1], nltk.tree.Tree):
            next_toks = [tok for (tok, tag) in sent_chunked[n+1].leaves()]
            nps_after_elect.append(' '.join(next_toks))

print('NPs before elect word:\n', nps_before_elect)
print('\nNPs after elect word:\n', nps_after_elect)

#### Using Dependency Parsing

In [None]:
import os
from nltk.parse.stanford import StanfordDependencyParser

# Both environment variables are pointed at the same local Stanford parser directory.
# NOTE(review): this is a machine-specific path; adjust it to your own installation.
stanford_dir = '/Users/natalieahn/Documents/SourceCode/stanford-parser-full-2015-12-09'
for env_var in ('STANFORD_PARSER', 'STANFORD_MODELS'):
    os.environ[env_var] = stanford_dir

dependency_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
# Parse all election-related sentences, then flatten the per-sentence results into one list
sents_parsed = dependency_parser.parse_sents(elect_sents)
sents_parseobjs = [parse for parses in sents_parsed for parse in parses]

In [None]:
elect_winners = []

# Two families of governing words, both matched as prefixes of the head word:
#   win_verbs   - the winner is the verb's subject
#   elect_verbs - the winner is the verb's direct object
win_verbs = 'win|won|defeat|gain|secure|achieve|got'
elect_verbs = 'elect|vote|choose|pick'

for sent_parseobj in sents_parseobjs:
    for triple in sent_parseobj.triples():
        # Each triple is indexed as (head, relation, dependent); head and dependent
        # carry the word in position 0
        head, rel, dep = triple[0][0], triple[1], triple[2][0]
        if re.match(win_verbs, head):
            # Compare the relation exactly: a prefix match on 'nsubj' would also accept
            # 'nsubjpass', recording the subject of "was defeated" as a winner
            if rel == 'nsubj':
                elect_winners.append(dep)
        elif re.match(elect_verbs, head):
            if rel == 'dobj':
                elect_winners.append(dep)

print(elect_winners)