In [217]:
from collections import defaultdict
import random
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

import nltk
from nltk.corpus import state_union
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize
from nltk.corpus import stopwords

# Setup


**pos_list** is the list of tags used by the nltk pos tagger.


**X_tags** are lists of the different tags used by the stanford parser. See https://gist.github.com/nlothian/9240750.


**path** should point to the folder where the stanford-parser/model files were extracted.

In [218]:
pos_list = '$ \'\' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS MD NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB `` LS'.split()

word_tags = 'CC CD DT EX FW IN JJ JJR JJS LS MD NN NNS NNP NNPS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB # -LRB- -RRB- -None-'.split()
phrase_tags = 'ADJP ADVP CONJP FRAG INTJ LST NAC NP NX PP PRN PRT QP RRC UCP VP WHADJP WHAVP WHNP WHPP X WHADVP'.split()
clause_tags = 'S SBAR SBARQ SINV SQ'.split()
punc_tags = '$ \'\' ( ) , -- . : ``'.split()

all_tags = word_tags + phrase_tags + clause_tags + punc_tags
list_of_tag_cats = [word_tags, phrase_tags, clause_tags, punc_tags]

path = 'C:\\Users\\Eoin\\Documents\\5 MAI\\Text Analytics\\stanford-parser-full-2018-02-27'
sp = StanfordParser(path_to_jar=path + '\\stanford-parser.jar',
                    path_to_models_jar=path + '\\stanford-parser-3.9.1-models.jar')

# Helper functions

In [219]:
def create_common_word_list(text, size=100):
    stop = set(stopwords.words('english'))
    stop.update(',', '\'s', '(', ')', 'applause', '.', '--', ':', '[', ']')
    word_dict = defaultdict(int)
    for block, _ in text:
        for sent in block:
            for word in nltk.word_tokenize(sent):
                word_dict[word.lower()] += 1
    s = [(k, word_dict[k]) for k in sorted(word_dict, key=word_dict.get, reverse=True) if k not in stop]
    return [p for p, _ in s[:size]]

def get_max_width(node):
    stack = [node]
    max_width = 0
    while stack:
        node = stack.pop()
        if isinstance(node, str):
            continue
        max_width = max(max_width, len(node))
        for child in node:
            stack.append(child)
    return max_width

def create_blocks(texts, chunk_size=10):
    blocks = []
    for text in texts:
        sents = [sent for sent in sent_tokenize(state_union.raw(text))]
        blocks += [(sents[i:i+chunk_size], text) for i in range(0, len(sents), chunk_size)]
    return blocks

def normalize_counts(array):
    if array.sum():
        array = array / array.sum()
    return array

# Analysis functions

Functions for creating features sets for blocks of texts

**full_analysis:** records all phrasal and word tags produced by Stanford parser as well as tree height and max width. Normalises each category of tag values and averages tree heights/widths for the given block of text. **slow**

**simple analysis:** records pos for given block of text using nltk pos tagger. Normalises values.

**word_analysis:** records how frequently popular words are used, also gets average word and sentence length.

In [237]:
def full_analysis(text):
    heights = []
    widths = []
    words_dict = {pos: 0 for pos in all_tags}
    for parses in sp.raw_parse_sents(text):
        for parse in parses:
            for prod in parse.productions()[1:]:
                words_dict[prod.lhs().__str__()] += 1
            widths.append(get_max_width(parse))
            heights.append(parse.height())

    features = np.array([sum(heights) / len(heights), sum(widths) / len(widths)], dtype='float64')
    for tag_cat in list_of_tag_cats:
        tag_counts = [words_dict[tag] for tag in tag_cat]
        features = np.append(features, normalize_counts(np.array(tag_counts, dtype='float64')))
    return features

def simple_analysis(text):
    pos_dict = {pos: 0 for pos in pos_list}
    for sent in text:
        words = nltk.word_tokenize(sent)
        tagged = nltk.pos_tag(words)
        for tag in tagged:
            pos_dict[tag[1]] += 1
    pos_count = list(pos_dict.values())
    features = normalize_counts(np.array(pos_count, dtype='float64'))   
    return features

def word_analysis(text):
    sent_lengths = []
    word_lengths = []
    word_dict = {word: 0 for word in word_list}
    for sent in text:
        sent_lengths.append(len(sent))
        for word in nltk.word_tokenize(sent):
            word_lengths.append(len(word))
            if word in word_list:
                word_dict[word.lower()] += 1
    features = np.array([sum(sent_lengths) / len(sent_lengths), sum(word_lengths) / len(word_lengths)], dtype='float64')
    word_counts = [word_dict[word] for word in word_list]
    features = np.append(features, normalize_counts(np.array(word_counts, dtype='float64')))
    return features

In [239]:
import time
def create_dataset(blocks, analysis_func):
    features = []
    labels = []
    for text, label in blocks:
        features.append(analysis_func(text))
        label = label[5:-4]
        if label[-2] == '-':
            label = label[:-2]
        labels.append(label)

    return features, labels  

# Build datasets

I used state of the union addresses since they come built in with nltk and are a good proxy while we don't have our dataset yet. All files take the format year-president.txt as far back as 1945-Truman.txt. Sampling the blocks of texts sppeds up the process since the Stanford parser is pretty slow.

In [245]:
training_examples = ['2005-GWBush.txt', '2004-GWBush.txt', '2002-GWBush.txt', 
                     '1995-Clinton.txt', '1997-Clinton.txt','1993-Clinton.txt',
                     '1996-Clinton.txt', '1998-Clinton.txt', '2003-GWBush.txt', '2004-GWBush.txt']
test_examples = ['2006-GWBush.txt', '1994-Clinton.txt', '1999-Clinton.txt', '2001-GWBush-1.txt']

training_set = create_blocks(training_examples)
test_set = random.sample(create_blocks(test_examples), 50)

word_list = create_common_word_list(training_set)

train_data, train_labels = create_dataset(training_set, word_analysis)
test_data, test_labels = create_dataset(test_set, word_analysis)

# Classifiers

In [246]:
svc = LinearSVC()
mnb = MultinomialNB()
gnb = GaussianNB()
lr = LogisticRegression()
sgd = SGDClassifier()
rfc = RandomForestClassifier() 
abc = AdaBoostClassifier()

clfs = [svc, mnb, gnb, lr, sgd, rfc, abc]
for clf in clfs:
    clf.fit(train_data, train_labels)
    print('{0} \ntraining score: {1} \ntest     score: {2}\n\n'.format(type(clf).__name__, 
                                                               clf.score(train_data, train_labels),
                                                               clf.score(test_data, test_labels)))

LinearSVC 
training score: 0.47368421052631576 
test     score: 0.52


MultinomialNB 
training score: 0.7397660818713451 
test     score: 0.74


GaussianNB 
training score: 0.8947368421052632 
test     score: 0.8


LogisticRegression 
training score: 0.7485380116959064 
test     score: 0.8


SGDClassifier 
training score: 0.4619883040935672 
test     score: 0.52


RandomForestClassifier 
training score: 0.9970760233918129 
test     score: 0.74


AdaBoostClassifier 
training score: 0.9970760233918129 
test     score: 0.94




# Full parse tree features

**LinearSVC**


training score: 0.8450292397660819 


test     score: 0.88


**MultinomialNB** 


training score: 0.6783625730994152 


test     score: 0.58


**GaussianNB**


training score: 0.827485380116959 


test     score: 0.92


**LogisticRegression**


training score: 0.7894736842105263 


test     score: 0.82


**SGDClassifier** 


training score: 0.6403508771929824 


test     score: 0.5


**RandomForestClassifier**


training score: 0.9912280701754386 


test     score: 0.98


**AdaBoostClassifier** 


training score: 1.0


test     score: 0.92

---

# POS features

**LinearSVC**

training score: 0.8362573099415205 


test     score: 0.78


**MultinomialNB** 


training score: 0.5409356725146199 


test     score: 0.56


**GaussianNB**


training score: 0.9093567251461988 


test     score: 0.96


**LogisticRegression** 


training score: 0.6345029239766082 


test     score: 0.62


**SGDClassifier**


training score: 0.47076023391812866 


test     score: 0.44


**RandomForestClassifier** 


training score: 0.9883040935672515 


test     score: 0.94


**AdaBoostClassifier**


training score: 1.0 


test     score: 0.94

---

# Word features

**LinearSVC**


training score: 0.47368421052631576 


test     score: 0.52


**MultinomialNB** 


training score: 0.7397660818713451 


test     score: 0.74


**GaussianNB**


training score: 0.8947368421052632 


test     score: 0.8


**LogisticRegression** 


training score: 0.7485380116959064 


test     score: 0.8


**SGDClassifier**


training score: 0.4619883040935672 


test     score: 0.52


**RandomForestClassifier**


training score: 0.9970760233918129 


test     score: 0.74


**AdaBoostClassifier**


training score: 0.9970760233918129 


test     score: 0.94