In [1]:
# import relevant libraries

from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from io import open

In [2]:
# Locate the folder that holds the input texts.
# NOTE: "__file__" is a quoted string literal here, so dirname() returns ''
# and the path below is simply 'texts', relative to the working directory.
text_files = os.path.join(
    os.path.dirname("__file__"),
    'texts',
)
# Names of the entries found in that folder
files = os.listdir(text_files)

In [3]:
# Accumulators filled while the texts are read, then handed to the PCA step:
#   X       - data gathered for each input text
#   y       - author belonging to each entry of X
#   markers - every author name once only, used for plotting
X, y, markers = [], [], []

In [4]:
from nltk.tokenize import sent_tokenize
import nltk
from numpy import std
from collections import Counter

def bigrams(words):
    """Return the number of distinct bigrams in a chunk, relative to its size.

    A bigram is a pair of adjacent tokens. The pairs are built with the
    standard library, so no function-local import is needed (the previous
    local import also shadowed this function's own name).

    Args:
        words: iterable of tokens making up one chunk.

    Returns:
        float: number of unique adjacent pairs divided by the number of
        tokens. ``0.0`` for an empty chunk instead of raising
        ZeroDivisionError.
    """
    tokens = list(words)
    if not tokens:
        return 0.0

    # zip() of the sequence against itself shifted by one yields each adjacent pair
    unique_pairs = set(zip(tokens, tokens[1:]))

    return float(len(unique_pairs)) / len(tokens)
    

def mean_word_length(words):
    """Return the mean length of the words in a text.

    Args:
        words: sequence of word strings.

    Returns:
        float: sum of ``len(word)`` over all words divided by the number of
        words. ``0.0`` for an empty sequence instead of raising
        ZeroDivisionError.
    """
    word_count = len(words)
    if word_count == 0:
        return 0.0

    total_length = sum(len(word) for word in words)
    # float() forces true division on interpreters where int / int floors,
    # matching the other ratio helpers in this file
    return float(total_length) / word_count


def mean_sentence_length(sentences):
    """Return the mean length of the given sentences.

    Length is whatever ``len()`` reports for each sentence, i.e. the number
    of characters when the sentences are strings.

    Args:
        sentences: sequence of sentences.

    Returns:
        float: sum of the sentence lengths divided by the number of
        sentences. ``0.0`` for an empty sequence instead of raising
        ZeroDivisionError.
    """
    sentence_count = len(sentences)
    if sentence_count == 0:
        return 0.0

    total_length = sum(len(sentence) for sentence in sentences)
    # float() forces true division on interpreters where int / int floors
    return float(total_length) / sentence_count


def sd_of_sentence_length(sentences):
    """Standard deviation of the sentence lengths.

    Each sentence is measured with ``len()``; the spread of those lengths is
    computed by numpy's ``std`` with its default settings.
    """
    return std([len(sentence) for sentence in sentences])


def count_words(words, word_to_count):
    """Return how often a word or punctuation mark occurs, per token of the chunk.

    Tokens come from whitespace splitting, so punctuation is still attached
    to them (e.g. ``"heart,"``). Two matching modes are therefore used:

    * If ``word_to_count`` starts and ends with an alphanumeric character it
      is treated as a word and only whole-word occurrences are counted:
      ``'man'`` matches ``"man,"`` but not ``"woman"``. A plain substring
      count would inflate short words such as 'wi', 'ye' or 'man'.
    * Otherwise (punctuation such as ``','``) every occurrence inside each
      token is counted, as a substring.

    Args:
        words: sequence of token strings making up one chunk.
        word_to_count: the word or character(s) to look for.

    Returns:
        float: total number of occurrences divided by ``len(words)``;
        ``0.0`` for an empty chunk instead of raising ZeroDivisionError.
    """
    import re

    token_count = len(words)
    if token_count == 0:
        return 0.0

    is_word = (bool(word_to_count)
               and word_to_count[0].isalnum()
               and word_to_count[-1].isalnum())

    if is_word:
        # Lookarounds reject matches that are glued to further word characters
        pattern = re.compile(
            r"(?<!\w)" + re.escape(word_to_count) + r"(?!\w)", re.UNICODE)
        total = sum(len(pattern.findall(word)) for word in words)
    else:
        total = sum(word.count(word_to_count) for word in words)

    return float(total) / token_count


def type_token_ratio(words):
    """Type-token ratio (TTR) of a text.

    The TTR is the number of distinct words in the text divided by the total
    number of words.
    """
    distinct_words = set(words)
    return float(len(distinct_words)) / len(words)

In [5]:
# Words and punctuation marks whose per-token frequency is used as a feature
common_elems = [',', ';', '"', '!', '-', 'and', 'but', 'however',
                'if', 'that', 'more', 'must', 'might', 'this', 'very',
                'ye', 'wi', 'thou', 'thy', 'may', 'man', 'sae', 'like',
               'thee', 'heart', 'love', 'day']

# Number of whitespace-separated tokens per chunk (the last chunk of a text
# may be shorter)
chunk_size = 5000

# Loop through each text file and append relevant data to lists.
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the rows and of `markers` (and so the plot colours) stable between runs.
for doc in sorted(os.listdir(text_files)):

    # os.path.join picks the right separator for the platform
    doc_path = os.path.join(text_files, doc)
    if not os.path.isfile(doc_path):
        # sub-directories and other non-file entries cannot be read as text
        continue

    # read the file as lower case; the handle is closed when the block exits
    with open(doc_path, 'r', encoding='utf-8') as handle:
        text = handle.read().lower()

    # get the author from the title (in format <Author> <volume no>.txt)
    author = doc.split()[0].title()

    # if author isn't already in list of markers, add it
    if author not in markers:
        markers.append(author)

    words = text.split()
    # split into chunks of chunk_size words
    words = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]

    # Sentence-level features (mean_sentence_length, sd_of_sentence_length)
    # are currently not part of the feature row, so the text is not
    # sentence-tokenised here.
    for chunk in words:
        # Feature row for this chunk: three global measures followed by one
        # frequency per entry of common_elems
        chunk_data = [
            mean_word_length(chunk),
            bigrams(chunk),
            type_token_ratio(chunk),
        ]
        chunk_data.extend(count_words(chunk, w) for w in common_elems)

        X.append(chunk_data)
        # add author of chunk to y
        y.append(author)
    



In [6]:
# Turn the accumulated Python lists X and y into numpy arrays
X, y = np.array(X), np.array(y)

In [7]:
# Rescale the data with StandardScaler, then reduce the standardised data to
# two principal components.
sklearn_pca = sklearnPCA(n_components=2)

X_std = StandardScaler().fit(X).transform(X)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [10]:
# Plotting client. Where plotly.plotly cannot be imported, fall back to the
# offline renderer bundled with plotly, which exposes the same iplot() call.
try:
    import plotly.plotly as py
except ImportError:
    import plotly.offline as py
# Explicit names instead of a star import, so it is clear where each one comes from
from plotly.graph_objs import Figure, Layout, Scatter
import plotly.tools as tls

# One scatter trace per author: the rows of the PCA output whose label in y
# equals that author, first component on x and second on y.
# Plain dicts/lists replace the Marker/Data/XAxis/YAxis wrapper classes.
traces = [
    Scatter(
        x=Y_sklearn[y == author, 0],
        y=Y_sklearn[y == author, 1],
        mode='markers',
        name=author,
        marker=dict(size=14))
    for author in markers
]

data = traces
layout = Layout(title='PCA of Burns and contemporaries',
                xaxis=dict(title='Principal Component 1'),
                yaxis=dict(title='Principal Component 2'))
fig = Figure(data=data, layout=layout)
py.iplot(fig)

In [9]:
from matplotlib import pyplot as plt

# Colour per author. The fixed palette is used while it has enough entries;
# with more authors than palette colours, evenly spaced colours are drawn from
# a colormap so that no author is silently left out of the plot.
base_colours = ('blue', 'red', 'green', 'black', 'orange', 'pink', 'brown')
if len(markers) <= len(base_colours):
    colours = base_colours[:len(markers)]
else:
    colours = [plt.cm.rainbow(v) for v in np.linspace(0.0, 1.0, len(markers))]

with plt.style.context('fivethirtyeight'):
    plt.figure(figsize=(18, 7), dpi=96)

    for lab, col in zip(markers, colours):
        # Rows of the PCA output belonging to this author.
        # `color=` is used rather than `c=` because a single RGBA tuple passed
        # as `c` can be mistaken for an array of values to colour-map.
        plt.scatter(Y_sklearn[y == lab, 0],
                    Y_sklearn[y == lab, 1],
                    label=lab,
                    color=col,
                    s=40)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='right')
    plt.tight_layout()
    plt.show()