In [90]:
import nltk
import os
import sys

In [13]:
def clean_book(document):
    lines = document.split("\n")
    start= 0
    end = len(lines)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("*** START OF THIS PROJECT GUTENBERG"):
            start = i + 1
        elif line.startswith("*** END OF THIS PROJECT GUTENBERG"):
            end = i - 1
    return "\n".join(lines[start:end])

In [14]:
import numpy as np
data_folder = r'C:\Users\gdple\AppData\Roaming\nltk_data\corpora\gutenberg'
def load_books_data(folder=data_folder):
    documents = []
    authors = []
    subfolders = [subfolder for subfolder in os.listdir(folder)
                  if os.path.isdir(os.path.join(folder, subfolder))]
    for author_number, subfolder in enumerate(subfolders):
        full_subfolder_path = os.path.join(folder, subfolder)
        for document_name in os.listdir(full_subfolder_path):
            with open(os.path.join(full_subfolder_path, document_name), encoding ='ISO-8859-1') as inf:
                documents.append(clean_book(inf.read()))
                authors.append(author_number)
    return documents, np.array(authors, dtype='int')

In [15]:

documents, classes = load_books_data(data_folder)

In [42]:

import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import glob
import os
import ntpath
import re
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
%matplotlib inline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import random
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gdple\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gdple\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [43]:
papers = r"C:\Users\gdple\AppData\Roaming\nltk_data\corpora\gutenberg"

In [44]:
Percy = sorted(glob.glob(os.path.join(papers, "Percy/*")))
Mary = sorted(glob.glob(os.path.join(papers, "Mary/*")))
Frankenstein = sorted(glob.glob(os.path.join(papers, "Disputed Paper/*")))

In [45]:
Percy_papers = []
for fn in Percy:
    with open(fn,encoding ='ISO-8859-1') as f:
        Percy_papers.append(f.read().replace('\n', ' ').replace('\r',''))
Percy_papers_all = ' '.join(Percy_papers)

Mary_papers = []
for fn in Mary:
    with open(fn,encoding ='ISO-8859-1') as f:
        Mary_papers.append(f.read().replace('\n', ' ').replace('\r',''))
Mary_papers_all = ' '.join(Mary_papers)

disputed_papers = []
disputed_papers_file_names = []
for fn in Frankenstein:
    with open(fn,encoding ='ISO-8859-1') as f:
        disputed_papers.append(f.read().replace('\n', ' ').replace('\r',''))
        disputed_papers_file_names.append(ntpath.basename(fn))
disputed_papers_all = ' '.join(disputed_papers)

In [46]:
known_papers_all = Percy_papers_all + " " + Mary_papers_all
known_papers = Percy_papers + Mary_papers

In [47]:
def LexicalFeatures(papers, all_papers):
    """
    Compute feature vectors for word and punctuation features
    """
    num_papers = len(papers)
    fvs_lexical = np.zeros((len(papers), 2), np.float64)
    fvs_punct = np.zeros((len(papers), 3), np.float64)
    for e, single_paper_text in enumerate(papers):
        # note: the nltk.word_tokenize includes punctuation
        tokens = nltk.word_tokenize(single_paper_text.lower())
        words = word_tokenizer.tokenize(single_paper_text.lower())
        sentences = sentence_tokenizer.tokenize(single_paper_text)
        vocab = set(words)
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s)) for s in sentences])

        # average number of words per sentence
        fvs_lexical[e, 0] = words_per_sentence.mean()
        # Lexical diversity
        fvs_lexical[e, 1] = len(vocab) / float(len(words))

        # Commas per sentence
        fvs_punct[e, 0] = tokens.count(';') / float(len(sentences))
        fvs_punct[e, 1] = tokens.count('"') / float(len(sentences))
        fvs_punct[e, 2] = tokens.count(',') / float(len(sentences))
        

    # apply whitening to decorrelate the features
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)

    return fvs_lexical, fvs_punct

In [48]:
def SyntacticFeatures(papers, all_papers):
    """
    Extract feature vector for part of speech frequencies
    """
    def token_to_pos(paper):
        tokens = nltk.word_tokenize(paper)
        return [p[1] for p in nltk.pos_tag(tokens)]

    paper_pos = [token_to_pos(paper) for paper in papers]
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    fvs_syntax = np.array([[paper.count(pos) for pos in pos_list] for paper in paper_pos]).astype(np.float64)

    # normalise by dividing each row by number of tokens in the paper
    fvs_syntax /= np.c_[np.array([len(paper) for paper in paper_pos])]

    return fvs_syntax

In [49]:
def PredictAuthors(fvs):
    km = KMeans(n_clusters=2, init='k-means++', n_init=100, max_iter=300, verbose=0)
    km.fit(fvs)
    return km

In [50]:
known_set = list(LexicalFeatures(known_papers, known_papers_all))
known_set.append(SyntacticFeatures(known_papers, known_papers_all))

classifications = [PredictAuthors(fvs) for fvs in known_set]



In [51]:
disputed_set = list(LexicalFeatures(disputed_papers, disputed_papers_all))
disputed_set.append(SyntacticFeatures(disputed_papers, disputed_papers_all))



In [52]:
results = list()
results.append([classifications[0].predict(disputed_set[0]),"Lexical Features"]) # Predict results of Lexical Features
results.append([classifications[1].predict(disputed_set[1]),"Lexical Features - Punctuation"]) # Predict results of Lexical Features, Punctuation
results.append([classifications[2].predict(disputed_set[2]),"Syntactic Features"]) # Predict results of their syntactic feature

In [54]:
all_results = []
for i in range(len(classifications)):
    Mary = classifications[i].labels_[0] # Extract first
    individual_classifier_results = []
    for j in range(len(results[i][0])):
        if results[i][0][j] == Mary:
            individual_classifier_results.append("Mary")
        else:
            individual_classifier_results.append("Percy")
    print(individual_classifier_results)
    all_results.append(individual_classifier_results)

['Percy']
['Mary']
['Percy']
