In [1]:
import matplotlib.pyplot as pt
import numpy as np
import re
import warnings
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from os import walk
from os import path
from pprint import pprint
from scipy.stats import sem # standard error of mean
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, train_test_split, StratifiedShuffleSplit
from sklearn.feature_selection import SelectPercentile, SelectKBest, chi2, f_classif, f_regression
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier, _predict_binary
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from random import randint
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.svm import SVC, LinearSVC
from syllables_en import count
from sys import maxint
from time import time
from scipy.cluster.vq import whiten

In [2]:
import pdb
# Sentence splitter loaded once from the NLTK resource 'tokenizers/punkt/english.pickle'
# (presumably requires the NLTK 'punkt' data to be installed locally -- confirm).
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Word tokenizer built from the pattern \w+, so tokens are runs of word characters
# and punctuation does not appear in its output.
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

In [126]:
# NOTE(review): this silences every warning for the rest of the session, which may
# hide numeric problems (e.g. divisions by zero) in the feature extraction cells.
warnings.filterwarnings('ignore')
# Notebook-wide settings
NUMFOLDS = 10 # not referenced in the visible cells; presumably the number of cross-validation folds -- confirm
RANGE = 25 # number of buckets per distribution feature; set to 25 based on Diederich et al. 2000 as cited on page 9 of http://www.cnts.ua.ac.be/stylometry/Papers/MAThesis_KimLuyckx.pdf
SRCDIR = '' # base directory for data files; '' makes every path relative to the working directory (alternative: path.dirname(path.realpath(__file__)))
FEATURESFILE = path.join(SRCDIR,'bookfeatures.txt') # features file written by save_book_features_to_file
PICKLEFILE = path.join(SRCDIR,'estimator.pickle') # not referenced in the visible cells
CORPUSPATH = path.join(SRCDIR,'../corpus') # corpus root handed to get_file_dir_list

In [4]:
class MyFreqDist(FreqDist):
    '''
    FreqDist subclass that adds a lookup for dis legomena.
    '''

    def dises(self):
        '''
        @return: A list of all samples whose count is exactly two (dis legomena)
        @rtype: C{list}
        '''

        twice_seen = []
        for sample in self:
            if self[sample] == 2:
                twice_seen.append(sample)
        return twice_seen

In [5]:
def build_pron_set():
    '''
    Build set of nominative pronouns.

    Reads 'nompronouns.txt' from SRCDIR and treats every line as one entry.

    Returns
    -------
    set of the file's lines, without their line terminators.
    '''

    # The context manager closes the handle even if read() raises.
    with open(path.join(SRCDIR, 'nompronouns.txt'), 'r') as pron_file:
        return set(pron_file.read().splitlines())

def build_conj_set():
    '''
    Build set of coordinating and subordinating conjunctions.

    Reads 'coordconj.txt' and 'subordconj.txt' from SRCDIR (one entry per line)
    and returns the union of both lists.

    Returns
    -------
    set of the lines found in either file, without their line terminators.
    '''

    conjunctions = set()
    for name in ('coordconj.txt', 'subordconj.txt'):
        # Each handle is closed as soon as its file has been read.
        with open(path.join(SRCDIR, name), 'r') as conj_file:
            conjunctions.update(conj_file.read().splitlines())
    return conjunctions

def build_stop_words_set():
    '''
    Build set of stop words to ignore.

    Reads 'smartstop.txt' from SRCDIR and treats every line as one stop word.

    Returns
    -------
    set of the file's lines, without their line terminators.
    '''

    # source: http://jmlr.org/papers/volume5/lewis04a/a11-smart-stop-list/english.stop
    # The context manager closes the handle even if read() raises.
    with open(path.join(SRCDIR, 'smartstop.txt'), 'r') as stop_file:
        return set(stop_file.read().splitlines())

In [6]:
def get_file_dir_list(dir):
    '''
    Get a list of directories and files. Used to get the corpora.

    Walks `dir` recursively; every directory that directly contains at least
    one file contributes one entry to each returned list, at the same index.

    Parameters
    ----------
    dir: root directory to walk.

    Returns
    -------
    dir_list: list of directory names (last path component) to serve as class labels.
    file_list: list of lists; file_list[i] holds the paths of the files found
               directly inside the directory named dir_list[i].
    '''

    file_list = []
    dir_list = []
    for (dirpath, _subdirs, files) in walk(dir):
        if files:
            dir_list.append(path.split(dirpath)[1])
            # A list comprehension always yields a real list, whereas map() only
            # does so on Python 2 and returns a one-shot iterator otherwise.
            file_list.append([path.join(dirpath, name) for name in files])
    return dir_list, file_list

In [7]:
def extract_book_contents(text):
    '''
    Extract the contents of the book after excising the Project Gutenberg headers
    and footers.

    The header ends at the first line containing 'START OF'; the footer begins
    at the first later line that contains '**' followed by 'END OF THIS' or
    'END OF THE'. Case is left untouched.

    Parameters
    ----------
    text: full text of the book file.

    Returns
    -------
    The text between header and footer. If no header marker is present the text
    is taken from its beginning; if no footer marker is present it runs to the end.
    '''

    # \r?\n accepts both CRLF and LF line endings.
    start = re.compile(r'START OF.*\r?\n')
    # (?:THIS|THE) matches the whole words; the former [THIS]|[THE] character
    # classes matched any single one of those letters.
    end = re.compile(r'\*\*.*END OF (?:THIS|THE)')

    # Split once only, so a later 'START OF' inside the book cannot truncate it.
    header_split = start.split(text, 1)
    body = header_split[1] if len(header_split) > 1 else text
    return end.split(body, 1)[0]

In [140]:
def extract_book_chapters2(filename):
    '''
    Extract the chapters of the book after excising the Project Gutenberg headers
    and footers.

    The file is read, decoded as UTF-8, stripped of header/footer with
    extract_book_contents and lower-cased. The text is then split with five
    different delimiter patterns; the split that produces the most pieces is used.

    Parameters
    ----------
    filename: path of the book file.

    Returns
    -------
    list of chapter texts (lower-cased). If no delimiter matched, a one-element
    list with the whole text; otherwise the pieces after the first one, which is
    dropped as table of contents / introduction.
    '''
    # Close the handle deterministically instead of leaving it to the garbage collector.
    with open(filename, 'r') as book_file:
        raw = book_file.read()
    text = extract_book_contents(raw.decode('utf-8')).lower()

    # Order matters: on a tie the earliest pattern wins.
    delimiters = (
        re.compile(r'\n[0-9]{1,3}[\.\)]+'),            # "12." or "12)" at line start
        re.compile(r'\nchapter[ \t]*[0-9]{1,3}'),      # "chapter 12"
        re.compile(r'\n[clxvi]+\.'),                   # roman numeral followed by a dot
        re.compile(r'\nchapter[ \t]*[mdclxvi]{1,3}'),  # "chapter xii"
        re.compile(r'scene[ \t]*[mdclxvi]{1,3}'),      # plays: "scene ii"
    )
    candidates = [pattern.split(text) for pattern in delimiters]
    # max() returns the first maximal element, i.e. the earliest pattern on a tie.
    chapters = max(candidates, key=len)

    # If the book couldn't be split into chapters, return it all;
    # else, remove the first piece, which is the table of contents or introduction.
    if len(chapters) == 1:
        return chapters
    return chapters[1:]


# This is the main function that generates the necessary features

In [119]:
def _range_histogram(values):
    '''
    Turn a list of integer measurements into RANGE relative-frequency buckets.

    The first RANGE-1 buckets hold the relative frequency of the values
    1 .. RANGE-1; the last bucket holds the remainder (1 minus the sum of the
    others), i.e. every value outside that interval, including 0.
    '''
    freq_dist = FreqDist(values)
    histogram = [freq_dist.freq(value) for value in range(1, RANGE)]
    histogram.append(1 - sum(histogram))
    return histogram


def load_book_features_by_chapters(filename, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Load features for each chapter of one book. There are 4 + RANGE*4 + 7
    token-level features per chapter, followed by bag-of-words and
    part-of-speech features. These features are:
       ---------------------------------------------------------------------------------------------------------
       No. Feature Name                                                                         No. of features.
       ---------------------------------------------------------------------------------------------------------
       1.  number of hapax legomena divided by number of unique words                           1
       2.  number of dis legomena divided by number of unique words                             1
       3.  number of unique words divided by number of total words                              1
       4.  flesch readability score divided by 100                                              1

       5.  no. of sentences of length in the range [1, 25] divided by the                       25
           number of total sentences
       6.  no. of words of length in the range [1, 25] divided by the                           25
           number of total words
       7.  no. of nominative pronouns per sentence in the range [1, 25] divided by the          25
           number of total sentences
       8.  no. of (coordinating + subordinating) conjunctions per sentence in the range         25
           [1, 25] divided by the number of total sentences
       9.  Average number of words per sentence                                                  1
       10. Sentence length variation                                                             1
       11. Lexical diversity                                                                     1
       12. Number of Commas per sentence                                                         1
       13. Semicolons per sentence                                                               1
       14. Number of Colons per sentence                                                         1
       15. Number of Exclamation marks per sentence                                              1
       16. Bag of words features (most common 30 words in the whole text )                       30
       17. Count for Part of Speech (POS) representation of ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS'] 6
        Total feature count = 147 (with RANGE = 25)

    For features 5-8 the last of the 25 buckets is a remainder bucket: it holds
    everything not counted in the buckets for the values 1 .. RANGE-1.

    Parameters
    ----------
    filename: path of the book file.
    smartStopWords: collection of stop words excluded from the vocabulary statistics.
    pronSet: collection of nominative pronouns.
    conjSet: collection of conjunctions.
    (The three collections are only used for membership tests and never modified.)

    Returns
    -------
    allfeatures: array with one row per chapter.
    numChapters: number of chapters, i.e. number of rows in allfeatures.

    Chapters of 100 characters or fewer keep an all-zero token-level row; their
    bag-of-words and POS columns are still computed.
    '''
    chapters = extract_book_chapters2(filename)
    all_text = ' '.join(chapters)
    # 4 scalar ratios + 4 histograms of RANGE buckets + 7 lexical/punctuation features
    num_token_features = 4 + 4 * RANGE + 7
    features_token_level = np.zeros((len(chapters), num_token_features), np.float64)
    for e, ch_text in enumerate(chapters):
        if len(ch_text) <= 100:
            # Too short to give meaningful statistics: the row stays all zeros.
            # (The former np.delete() call here discarded its result, so it never
            # removed anything; actually removing the row would also break the
            # row alignment with the bag-of-words and POS features below.)
            continue

        # replace possessive 's, \r\n, dashes, double quotes and underscores with spaces
        contents = re.sub('\'s|(\r\n)|-+|["_]', ' ', ch_text)
        sentenceList = sent_tokenize(contents.strip())
        cleanWords = []
        sentenceLenDist = []
        pronDist = []
        conjDist = []
        totalWords = 0
        wordLenDist = []
        totalSyllables = 0
        for sentence in sentenceList:
            if sentence == ".":
                continue
            pronCount = 0
            conjCount = 0
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords) # record all words in sentence
            sentenceLenDist.append(len(sentenceWords)) # record length of sentence in words
            for word in sentenceWords:
                totalSyllables += count(word)
                wordLenDist.append(len(word)) # record length of word in chars
                if word in pronSet:
                    pronCount += 1 # record no. of pronouns in sentence
                if word in conjSet:
                    conjCount += 1 # record no. of conjunctions in sentence
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

        # calculate readability
        avgSentenceLength = np.mean(sentenceLenDist)
        avgSyllablesPerWord = float(totalSyllables)/totalWords
        readability = float(206.835 - (1.015 * avgSentenceLength) - (84.6 * avgSyllablesPerWord))/100

        # vocabulary statistics are computed on the non-stop-words only
        wordsFreqDist = MyFreqDist(FreqDist(cleanWords))
        numUniqueWords = len(wordsFreqDist.keys())
        numTotalWords = len(cleanWords)

        hapax = float(len(wordsFreqDist.hapaxes()))/numUniqueWords # no. words occurring once / total num. UNIQUE words
        dis = float(len(wordsFreqDist.dises()))/numUniqueWords # no. words occurring twice / total num. UNIQUE words
        richness = float(numUniqueWords)/numTotalWords # no. unique words / total num. words

        result = [hapax, dis, richness, readability]
        result.extend(_range_histogram(sentenceLenDist))
        result.extend(_range_histogram(wordLenDist))
        result.extend(_range_histogram(pronDist))
        result.extend(_range_histogram(conjDist))

        # some more lexical features, computed on the unmodified chapter text
        # note: the nltk.word_tokenize includes punctuation
        tokens = word_tokenize(ch_text.lower())
        words = word_tokenizer.tokenize(ch_text.lower())
        sentences = sentence_tokenizer.tokenize(ch_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentences])

        # average number of words per sentence
        result.append(words_per_sentence.mean())
        # sentence length variation
        result.append(words_per_sentence.std())
        # Lexical diversity
        result.append(len(vocab) / float(len(words)))

        # Commas per sentence
        result.append(tokens.count(',') / float(len(sentences)))
        # Semicolons per sentence
        result.append(tokens.count(';') / float(len(sentences)))
        # Colons per sentence
        result.append(tokens.count(':') / float(len(sentences)))
        # Exclamation marks per sentence
        result.append(tokens.count('!') / float(len(sentences)))

        # NOTE: no whitening is applied here. The former whiten(fvs_lexical) /
        # whiten(fvs_punct) lines referred to variables that do not exist in this
        # function and were preceded by an uncommented prose line.
        features_token_level[e] = result

    # Bag of words features (most common words in the whole text)
    bow = BagOfWords(all_text, chapters)
    allfeatures1 = np.concatenate((features_token_level, bow), axis=1)
    sf = SyntacticFeatures(chapters)
    allfeatures = np.concatenate((allfeatures1, sf), axis=1)
    return allfeatures, len(chapters)

In [121]:
def BagOfWords(all_text, chapters):
    """
    Compute the bag of words feature vectors, based on the most common words
     in the whole book

    Parameters
    ----------
    all_text: the whole book as one string; used to pick the vocabulary.
    chapters: list of chapter strings; one feature row is produced per chapter.

    Returns
    -------
    float64 array of shape (len(chapters), NUM_TOP_WORDS). Each row holds the
    counts of the vocabulary tokens in that chapter, divided by the row's
    Euclidean norm. Rows without any vocabulary token stay all zero.
    """
    # get most common words in the whole book
    NUM_TOP_WORDS = 30
    all_tokens = nltk.word_tokenize(all_text)
    fdist = nltk.FreqDist(all_tokens)
    # Rank explicitly by count instead of relying on keys() being frequency-ordered,
    # which only some NLTK versions guarantee. sorted() is stable, so an already
    # frequency-ordered FreqDist yields exactly the same vocabulary as before.
    vocab = sorted(fdist, key=lambda token: fdist[token], reverse=True)[:NUM_TOP_WORDS]

    # use sklearn to create the bag for words feature vector for each chapter
    vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize)
    fvs_bow = vectorizer.fit_transform(chapters).toarray().astype(np.float64)

    # normalise by dividing each row by its Euclidean norm
    norms = np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]
    # an all-zero row has norm 0; divide it by 1 instead so it stays 0 rather than NaN
    norms[norms == 0] = 1.0
    fvs_bow /= norms

    return fvs_bow


In [122]:
def SyntacticFeatures(chapters):
    """
    Extract feature vector for part of speech frequencies

    Parameters
    ----------
    chapters: list of chapter strings.

    Returns
    -------
    float64 array of shape (len(chapters), 6): for every chapter, the number of
    tokens tagged 'NN', 'NNP', 'DT', 'IN', 'JJ' and 'NNS' divided by the
    chapter's total token count. A chapter without tokens gets an all-zero row.
    """
    def token_to_pos(ch):
        # keep only the tag of every (token, tag) pair
        tokens = nltk.word_tokenize(ch)
        return [p[1] for p in nltk.pos_tag(tokens)]

    chapters_pos = [token_to_pos(ch) for ch in chapters]
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[ch.count(pos) for pos in pos_list]
                           for ch in chapters_pos]).astype(np.float64)

    # normalise by dividing each row by number of tokens in the chapter
    token_counts = np.c_[np.array([len(ch) for ch in chapters_pos], dtype=np.float64)]
    # an empty chapter would give 0/0 = NaN; its counts are all 0, so dividing by 1 keeps the row at 0
    token_counts[token_counts == 0] = 1.0
    fvs_syntax /= token_counts

    return fvs_syntax



In [113]:
def save_book_features_to_file(x, y, le):
    '''
    Save book features to a features file.

    Writes one line per sample to FEATURESFILE, in the form
    "<label name>\\t<label id>\\t<comma separated feature values>".

    Parameters
    ----------
    x: sequence of feature vectors.
    y: sequence of integer label ids, aligned with x.
    le: fitted label encoder used to map label ids back to label names.
    '''

    # The context manager closes (and flushes) the file even if a write fails.
    with open(FEATURESFILE, 'wb') as f:
        for index, item in enumerate(x):
            label_id = y[index]
            # inverse_transform is given a one-element list so it also works with
            # encoder versions that reject scalar input.
            label_name = le.inverse_transform([label_id])[0]
            f.write("%s\t%d\t%s\n" % (label_name, label_id, ', '.join(map(str, item))))

    print('Features saved to file %s' % FEATURESFILE)


In [101]:
def load_book_features_from_corpus(dir_list, file_list, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Parse each book and load its features.

    Parameters
    ----------
    dir_list: list of class labels; dir_list[i] labels every book in file_list[i].
    file_list: list of lists of book file paths, aligned with dir_list.
    smartStopWords, pronSet, conjSet: word collections forwarded unchanged to
        load_book_features_by_chapters.

    Returns
    -------
    x: array with one feature row per chapter of every book.
    y: array of encoded integer labels, aligned with x.
    le: the LabelEncoder fitted on the label names.
    '''

    x = []
    y = []
    for index, files in enumerate(file_list):
        for f in files:
            # progress output: label index and file being processed
            print(index)
            print(f)

            features, numChapters = load_book_features_by_chapters(f, smartStopWords, pronSet, conjSet)

            x.extend(features)
            # every chapter of the book carries the book's label
            y.extend([dir_list[index]] * numChapters)
    le = LabelEncoder().fit(y)
    return np.array(x), np.array(le.transform(y)), le

In [147]:
# Feature extraction driver. It runs at cell level; the run_feature_extraction()
# wrapper is left disabled, as in the original cell.
#def run_feature_extraction():

x = []
y = []
if not path.exists(FEATURESFILE):
    print('Feature file not found. Creating...')
    pronSet = build_pron_set()
    conjSet = build_conj_set()
    smartStopWords = build_stop_words_set()

    dir_list, file_list = get_file_dir_list(CORPUSPATH)

    x, y, le = load_book_features_from_corpus(dir_list, file_list, smartStopWords, pronSet, conjSet)

    save_book_features_to_file(x, y, le)
    print('... done.')
    print('')
else:
    # TODO: read the features back, e.g. x, y = load_book_features_from_file();
    # that loader is not defined in the visible cells, so x and y stay empty here.
    print('Feature file found, but loading it is not implemented; x and y are empty.')

# len() works for both the ndarray built above and the empty list left in place
# when the features file already exists (x.shape would raise on a list).
no_samples = len(x)
no_classes = len(set(y))
