In [1]:
from collections import defaultdict, Counter
from itertools import groupby
import random
import os
import csv
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

import nltk
from nltk.corpus import stopwords
from nltk.parse.stanford import StanfordParser
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize
from nltk.corpus import stopwords

# Setup


**pos_list** is the list of tags used by the NLTK POS tagger.


**X_tags** (`word_tags`, `phrase_tags`, `clause_tags`, `punc_tags`) are lists of the different tags used by the Stanford parser, grouped by category. See https://gist.github.com/nlothian/9240750.


**path** should point to the folder where the stanford-parser/model files were extracted.

In [4]:
# Tags used by the NLTK POS tagger.
pos_list = '$ \'\' ( ) , -- . : CC CD DT EX FW IN JJ JJR JJS MD NN NNP NNPS NNS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB `` LS'.split()

# Tags used by the Stanford parser, grouped by category. full_analysis
# normalises the counts within each category separately.
word_tags = 'CC CD DT EX FW IN JJ JJR JJS LS MD NN NNS NNP NNPS PDT POS PRP PRP$ RB RBR RBS RP SYM TO UH VB VBD VBG VBN VBP VBZ WDT WP WP$ WRB # -LRB- -RRB- -None-'.split()
# NOTE(review): 'WHAVP' looks like a typo of 'WHADVP' (which is also listed at
# the end). It is kept because removing it would change the feature-vector
# length produced by full_analysis.
phrase_tags = 'ADJP ADVP CONJP FRAG INTJ LST NAC NP NX PP PRN PRT QP RRC UCP VP WHADJP WHAVP WHNP WHPP X WHADVP'.split()
clause_tags = 'S SBAR SBARQ SINV SQ'.split()
punc_tags = '$ \'\' ( ) , -- . : ``'.split()

all_tags = word_tags + phrase_tags + clause_tags + punc_tags
list_of_tag_cats = [word_tags, phrase_tags, clause_tags, punc_tags]

# Folder where the Stanford parser and model jars were extracted.
# An empty string resolves the jars relative to the current working directory.
path = ''
# os.path.join instead of a hard-coded '\\' so the paths are valid on every OS.
sp = StanfordParser(path_to_jar=os.path.join(path, 'stanford-parser.jar'),
                    path_to_models_jar=os.path.join(path, 'stanford-parser-3.9.1-models.jar'))

In [None]:
def _load_articles(folder, label, replacements):
    """Read every file in ``folder`` and split each one into sentences.

    Args:
        folder: directory containing one article per file.
        label: source label stored alongside each article.
        replacements: ordered ``(old, new)`` pairs applied to the raw text
            with ``str.replace`` before sentence tokenisation.

    Returns:
        A list of ``(sentences, label)`` tuples, one per file.
    """
    articles = []
    # sorted() makes the article order independent of the directory listing order.
    for file_name in sorted(os.listdir(folder)):
        # os.path.join instead of a hard-coded '\\' so this also runs outside Windows.
        file_path = os.path.join(folder, file_name)
        if not os.path.isfile(file_path):
            # Skip sub-directories (e.g. notebook checkpoint folders).
            continue
        with open(file_path, 'r', encoding='UTF-8') as f:
            txt = f.read()
        for old, new in replacements:
            txt = txt.replace(old, new)
        articles.append((list(sent_tokenize(txt)), label))
    return articles


# The newline clean-up rules differ per source; they are carried over
# unchanged from the original per-source loops.
bbc_articles = _load_articles('Articles/Articles/bbc', 'bbc',
                              [('\n\n', '. ')])
guard_articles = _load_articles('Articles/Articles/Guardian', 'guard',
                                [('\n\n', ''), ('\n', '. ')])
mirror_articles = _load_articles('Articles/Articles/Mirror', 'mirror',
                                 [('\n\n', ' '), ('\n', '')])
telegraph_articles = _load_articles('Articles/Articles/Telegraph', 'telegraph',
                                    [('\n\n', ' ')])

In [None]:
# (file, newline replacement) per party, in load order.
# NOTE(review): the Conservative sentences replace newlines with a space while
# the other parties drop them entirely (which can fuse words across line
# breaks) -- confirm this difference is intentional.
_manifesto_sources = {
    'lab': ('Manifestos/Manifestos/LabourMani.csv', ''),    # labour
    'con': ('Manifestos/Manifestos/ConservMani.csv', ' '),  # Cons
    'lib': ('Manifestos/Manifestos/LibDemMani.csv', ''),    # LibDem
    'snp': ('Manifestos/Manifestos/SNPMani.csv', ''),       # SNP
}

# Read each manifesto, split it into sentences, then clean the newlines
# that remain inside each sentence.
_manifesto_sents = {}
for _party, (_file, _newline_sub) in _manifesto_sources.items():
    with open(_file, 'r', encoding='UTF-8') as f:
        txt = f.read()
    _manifesto_sents[_party] = [sent.replace('\n', _newline_sub) for sent in sent_tokenize(txt)]

lab_sents = _manifesto_sents['lab']
con_sents = _manifesto_sents['con']
lib_sents = _manifesto_sents['lib']
snp_sents = _manifesto_sents['snp']

In [None]:
# (file, newline replacement) per party for the 2010 manifestos, in load order.
# NOTE(review): the Conservative sentences replace newlines with a space while
# the other parties drop them entirely (which can fuse words across line
# breaks) -- confirm this difference is intentional.
_manifesto_sources_2010 = {
    'lab': ('Manifestos/2010/LabourMani.txt', ''),    # labour
    'con': ('Manifestos/2010/ConservMani.txt', ' '),  # Cons
    'lib': ('Manifestos/2010/LibDemMani.txt', ''),    # LibDem
    'snp': ('Manifestos/2010/SNPMani.txt', ''),       # SNP
}

# Read each manifesto, split it into sentences, then clean the newlines
# that remain inside each sentence.
_manifesto_sents_2010 = {}
for _party, (_file, _newline_sub) in _manifesto_sources_2010.items():
    with open(_file, 'r', encoding='UTF-8') as f:
        txt = f.read()
    _manifesto_sents_2010[_party] = [sent.replace('\n', _newline_sub) for sent in sent_tokenize(txt)]

lab_2010_sents = _manifesto_sents_2010['lab']
con_2010_sents = _manifesto_sents_2010['con']
lib_2010_sents = _manifesto_sents_2010['lib']
snp_2010_sents = _manifesto_sents_2010['snp']

# Helper functions

In [219]:
def create_common_word_list(text, size=100):
    """Return the ``size`` most frequent non-stopword tokens across all blocks.

    ``text`` is an iterable of ``(sentences, label)`` pairs; the labels are
    ignored. Tokens are lower-cased, then English stopwords and the
    punctuation marks listed below are dropped.

    Returns the ``(token, count)`` pairs produced by ``Counter.most_common``.
    """
    ignored = set(stopwords.words('english'))
    ignored.update([',', '‘', '(', ')', '.', '--', ':', '[', ']', '\'\'', '``', '-', ',', '’', '–', ';'])

    tally = Counter()
    for block, _ in text:
        for sent in block:
            for word in nltk.word_tokenize(sent):
                lowered = word.lower()
                if lowered not in ignored:
                    tally[lowered] += 1
    return tally.most_common(size)

def get_max_width(node):
    """Return the largest number of children held by any non-string node.

    The tree is walked iteratively from ``node``. Strings are treated as
    leaves and contribute nothing, so a bare string yields 0.
    """
    widest = 0
    pending = [node]
    while pending:
        current = pending.pop()
        if not isinstance(current, str):
            widest = max(widest, len(current))
            # Queue every child for inspection.
            pending.extend(current)
    return widest

def create_blocks(texts, chunk_size=10):
    """Split each labelled sentence list into consecutive chunks.

    ``texts`` is an iterable of ``(sentences, label)`` pairs. Every chunk holds
    up to ``chunk_size`` sentences (the last chunk of a text may be shorter)
    and keeps the label of the text it came from.

    Returns a flat list of ``(chunk, label)`` tuples.
    """
    return [
        (sents[start:start + chunk_size], label)
        for sents, label in texts
        for start in range(0, len(sents), chunk_size)
    ]

def calc_vocab_score(word_counts):
    """Score vocabulary richness from a word -> frequency mapping.

    With ``m1`` the number of distinct words and ``m2`` the sum of squared
    frequencies, the score is ``m1**2 / (m2 - m1)``. It is 0 when ``m2 == m1``
    (every word occurs exactly once, or the mapping is empty).
    """
    distinct = len(word_counts)
    # Summing freq**2 per word equals summing (group size * freq**2) over
    # groups of words that share a frequency.
    squared_total = sum(freq ** 2 for freq in word_counts.values())
    denominator = squared_total - distinct
    if denominator == 0:
        return 0
    return (distinct * distinct) / denominator

def normalize_counts(array):
    """Scale ``array`` so its entries sum to 1.

    An array whose sum is zero is returned unchanged, which avoids dividing
    by zero.
    """
    total = array.sum()
    return array / total if total else array

# Analysis functions

Functions for creating feature sets for blocks of text.

**full_analysis:** records all phrasal and word tags produced by Stanford parser as well as tree height and max width. Normalises each category of tag values and averages tree heights/widths for the given block of text. **slow**

**simple analysis:** records pos for given block of text using nltk pos tagger. Normalises values.

**word_analysis:** records how frequently popular words are used, also gets average word and sentence length.

In [237]:
def full_analysis(text):
    """Build a syntactic feature vector for a block of sentences.

    Args:
        text: the sentences of one block, passed to ``sp.raw_parse_sents``.

    Returns:
        A float64 array holding the average parse height, the average
        maximum parse width, then the tag counts for each category in
        ``list_of_tag_cats`` (each category normalised separately).
        An empty array is returned when the parser fails or yields no
        parses; ``create_dataset`` skips such blocks.
    """
    try:
        parses = [parse for sent_parses in sp.raw_parse_sents(text) for parse in sent_parses]
    except Exception as err:
        # Best effort: one unparsable block should not abort the whole run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop the slow parser.
        print('Skipping block, parser failed: {0!r}'.format(err))
        return np.array([])
    if not parses:
        # Nothing was parsed; avoid dividing by zero below.
        return np.array([])

    n_parses = len(parses)
    avg_parse_height = sum(parse.height() for parse in parses) / n_parses
    avg_parse_width = sum(map(get_max_width, parses)) / n_parses
    # The first production of each parse is skipped -- presumably the ROOT
    # production; verify against the parser output.
    pos_count = Counter(str(prod.lhs()) for parse in parses for prod in parse.productions()[1:])

    feature_parts = [np.array([avg_parse_height, avg_parse_width], dtype='float64')]
    for tag_cat in list_of_tag_cats:
        tag_counts = np.array([pos_count[tag] for tag in tag_cat], dtype='float64')
        feature_parts.append(normalize_counts(tag_counts))
    return np.concatenate(feature_parts)

def word_analysis(text):
    """Build a lexical feature vector for a block of sentences.

    Args:
        text: the sentences (strings) of one block.

    Returns:
        A float64 array holding the average token length, the average
        sentence length in characters, the vocabulary score, then the
        normalised frequency of every word in the module-level
        ``word_list``. An empty array is returned when the block has no
        tokens; ``create_dataset`` skips such blocks.
    """
    words = [word.lower() for sent in text for word in nltk.word_tokenize(sent)]
    if not words:
        # Empty block: avoid dividing by zero below.
        return np.array([])

    avg_word_length = sum(map(len, words)) / len(words)
    avg_sent_length = sum(map(len, text)) / len(text)
    word_counter = Counter(words)

    # create_common_word_list returns (word, count) pairs. Looking those
    # tuples up in word_counter always gave 0, so unpack the word first.
    # Plain strings are accepted too.
    vocabulary = [entry[0] if isinstance(entry, tuple) else entry for entry in word_list]
    word_counts = np.array([word_counter[word] for word in vocabulary], dtype='float64')
    vocab_score = calc_vocab_score(word_counter)

    summary = np.array([avg_word_length, avg_sent_length, vocab_score], dtype='float64')
    return np.concatenate((summary, normalize_counts(word_counts)))

In [239]:
def create_dataset(blocks, analysis_func):
    """Run ``analysis_func`` over every block and collect features and labels.

    Args:
        blocks: sequence of ``(text, label)`` pairs.
        analysis_func: callable mapping a block's text to a feature array.
            Blocks for which it returns an empty array are skipped.

    Returns:
        ``(features, labels)``: two parallel lists covering the kept blocks.
    """
    # Report progress roughly every 10% of the blocks. max() keeps the step
    # at 1 or more, so fewer than 10 blocks no longer divides by zero.
    report_every = max(1, len(blocks) // 10)

    features = []
    labels = []
    for i, (text, label) in enumerate(blocks):
        if i % report_every == 0:
            print('Blocks completed: {0} / {1}'.format(i, len(blocks)))
        feats = analysis_func(text)
        if feats.size == 0:
            continue
        features.append(feats)
        labels.append(label)

    return features, labels

# Build datasets

I used State of the Union addresses since they come built in with NLTK and are a good proxy while we don't have our dataset yet. All files take the format year-president.txt as far back as 1945-Truman.txt. Sampling the blocks of text speeds up the process since the Stanford parser is pretty slow. **Note:** the cell below builds the training and test sets from the party manifestos loaded above, not from the State of the Union corpus.

In [245]:
# Fixed seed so the train/test split and the 2010 block samples are
# reproducible across kernel restarts.
SEED = 42
# Number of blocks sampled from each 2010 manifesto.
N_2010_BLOCKS = 10
_rng = random.Random(SEED)


def _sample_2010_data(sents, label):
    """Featurise a random sample of up to N_2010_BLOCKS blocks of ``sents``."""
    blocks = create_blocks([(sents, label)])
    # min() stops random.sample raising ValueError on a short manifesto.
    sampled = _rng.sample(blocks, min(N_2010_BLOCKS, len(blocks)))
    return create_dataset(sampled, analysis)[0]


dataset = create_blocks([(lab_sents, 'lab'), (con_sents, 'con'), (lib_sents, 'lib'), (snp_sents, 'snp')])
training_set, test_set = train_test_split(dataset, test_size=0.2, random_state=SEED)
analysis = word_analysis

# Built from the training split only. word_analysis reads this module-level
# name when computing word-frequency features.
word_list = create_common_word_list(training_set)

train_data, train_labels = create_dataset(training_set, analysis)
test_data, test_labels = create_dataset(test_set, analysis)

# Each article is analysed as one block.
bbc_data, _ = create_dataset(bbc_articles, analysis)
guard_data, _ = create_dataset(guard_articles, analysis)
mirror_data, _ = create_dataset(mirror_articles, analysis)
telegraph_data, _ = create_dataset(telegraph_articles, analysis)

lab_2010_data = _sample_2010_data(lab_2010_sents, 'lab')
con_2010_data = _sample_2010_data(con_2010_sents, 'con')
lib_2010_data = _sample_2010_data(lib_2010_sents, 'lib')
snp_2010_data = _sample_2010_data(snp_2010_sents, 'snp')

# Classifiers

In [5]:
# Fixed seed for the estimators that use randomness, so scores are
# reproducible across runs.
RANDOM_STATE = 42

svc = LinearSVC(random_state=RANDOM_STATE)
mnb = MultinomialNB()
gnb = GaussianNB()
lr = LogisticRegression()
sgd = SGDClassifier(random_state=RANDOM_STATE)
rfc = RandomForestClassifier(random_state=RANDOM_STATE)
abc = AdaBoostClassifier(random_state=RANDOM_STATE)
mlp = MLPClassifier(random_state=RANDOM_STATE)

clfs = [svc, mnb, gnb, lr, sgd, rfc, abc, mlp]
# One [classifier name, training score, test score] row per classifier.
clf_scores = []
for clf in clfs:
    clf.fit(train_data, train_labels)
    clf_name = type(clf).__name__
    # Score once and reuse the values for both the printout and the table.
    train_score = clf.score(train_data, train_labels)
    test_score = clf.score(test_data, test_labels)
    print('{0} \ntraining score: {1} \ntest     score: {2}\n\n'.format(clf_name, train_score, test_score))
    clf_scores.append([clf_name, train_score, test_score])