# Principal Components Analysis (for authorship verification)

Simple implementation of Principal Components Analysis in analysing authorship, using 19 measurements from each text.

In [1]:
# import relevant libraries

from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
from io import open

In [2]:
# set directory for text input files
text_files = os.path.join(os.path.dirname("__file__"), 'texts')
files = os.listdir(text_files)

In [3]:
# lists for storing data for PCA processing
X = []  # Stores data on each input text
y = []  # Stores authors of input texts
markers = []  # Stores only unique author names for plotting purposes

# Functions to collect relevant data from text 

<b>N.B. Code was written separately so will likely need rewritten</b>

Relevant data: (see Hanlein, H. “Studies in Authorship Recognition: a Corpus-based Approach”)
<ul>
<li>Mean word length</li>
<li>Mean sentence length</li>
<li>Standard deviation of sentence length</li>
<li>Number of unique words in a text (Type Token Ratio)</li>
<li>Instances of ',' per 1000 tokens</li>
<li>Instances of ';' per 1000 tokens</li>
<li>Instances of '"' per 1000 tokens</li>
<li>Instances of '!' per 1000 tokens</li>
<li>Instances of '-' per 1000 tokens</li>
<li>Instances of 'and' per 1000 tokens</li>
<li>Instances of 'but' per 1000 tokens</li>
<li>Instances of 'however' per 1000 tokens</li>
<li>Instances of 'if' per 1000 tokens</li>
<li>Instances of 'that' per 1000 tokens</li>
<li>Instances of 'more' per 1000 tokens</li>
<li>Instances of 'must' per 1000 tokens</li>
<li>Instances of 'might' per 1000 tokens</li>
<li>Instances of 'this' per 1000 tokens</li>
<li>Instances of 'very' per 1000 tokens</li>
</ul>

In addition, it is suggested that chapter length and mean paragraph length should be considered. Given the goal of this project, these are likely to be irrelevant. (Although possibly chapter length becomes stanza length - context dependent)

- TODO will research additional possibilities - w/ emphasis on Scots language determiners/superlatives
- Include some sort of bigram calculation, eg. no. of unique bigrams

In [4]:
from nltk.tokenize import sent_tokenize
import nltk
from numpy import std
from collections import Counter

common_elems = [',', ';', '"', '!', '-', 'and', 'but', 'however',
                'if', 'that', 'more', 'must', 'might', 'this', 'very']


def initial_read(text):
    """Reads the text file and saves it as a string"""

    words = text.split()
    sentences = sent_tokenize(text)
    
    data = list()
    data.append(mean_word_length(words))
    data.append(mean_sentence_length(sentences))
    data.append(sd_of_sentence_length(sentences))
    data.append(type_token_ratio(words))
    
# CODE BELOW APPENDS NUMBER OF UNIQUE BIGRAMS PER TEXT, NOT SURE OF THE VALUE OF THIS AS YET, WILL RESEARCH FURTHER
# Brings some texts closer together

    from nltk import bigrams
    bg = bigrams(words)
    bg = set(bg)
    bg = Counter(bg)

    data.append(float(len(bg)) / len(words))

    for word in common_elems:
        data.append(count_words(text, word, len(words)))

    return data


def mean_word_length(words):
    """Calculates mean length of words in a text"""
    total_length = 0
    for word in words:
        total_length += len(word)
    return total_length/len(words)


def mean_sentence_length(sentences):
    """Calculates the mean length of each sentence"""
    total_length = 0
    for sentence in sentences:
        total_length += len(sentence)

    mean = total_length/len(sentences)
    return mean


def sd_of_sentence_length(sentences):
    """Returns the standard deviation in sentence length"""
    sentence_lengths = []
    for sentence in sentences:
        sentence_length = len(sentence)
        sentence_lengths.append(sentence_length)

    sd = std(sentence_lengths)
    return sd


def count_words(text, word, wordcount):
    """Returns the count of a given word/character per 1000 words"""
    total = text.count(word)
    thousands = wordcount / 500.0
    total /= thousands
    return total


def type_token_ratio(words):
    """Calculates the type token ratio of a text"""
    c = Counter(words)
    # TTR is the number of unique words in a text divided by the total word count
    #ttr = float(len(c)) / len(words)
    ttr = float(len(c)) / 500
    return ttr

# Test Data

Novels by: <ul>
    <li>Jane Austen</li>
    <li>Walter Scott</li>
    <li>Charles Dickens</li>
    <li>Rudyard Kipling</li>
    <li>Henry James</li>


In [5]:
# Loop through each text file and append relevant data to lists
for doc in os.listdir(text_files):

    text = open(text_files + '\\' + doc, 'r', encoding='utf-8')
    # open the file, store as lower case
    text = text.read().lower()
    
    
    
        
    X.append(initial_read(text))
    
    # Right now txt files are named "author <number>.txt", redesign?
    author = doc.split()

    y.append(author[0].title())
    if author[0].title() not in markers:
        markers.append(author[0].title())

In [6]:
###### Convert both X and Y to numpy arrays
X = np.array(X)
y = np.array(y)

In [7]:
# Standardise data, perform PCA

X_std = StandardScaler().fit_transform(X)
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)

In [8]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

traces = []

for author in markers:

    trace = Scatter(
        x=Y_sklearn[y==author,0],
        y=Y_sklearn[y==author,1],
        mode='markers',
        name=author,
        marker=Marker(
            size=14))
    traces.append(trace)


data = Data(traces)
layout = Layout(title='PCA of 5 authors',
                xaxis=XAxis(title='Principal Component 1'),
                yaxis=YAxis(title='Principal Component 2'))
fig = Figure(data=data, layout=layout)
py.iplot(fig)

# Get stats on Burns most common words

In [9]:
burns = ''

# Loop through each text file and append relevant data to lists
for doc in os.listdir(text_files):

    text = open(text_files + '\\' + doc, 'r', encoding='utf-8')
    # open the file, store as lower case
    text = text.read().lower()
    
    burns += text

In [10]:
def common_words(words):
    
    import regex as re
    
    import pandas as pd
    
    from nltk.tokenize import wordpunct_tokenize
    import string

    words = wordpunct_tokenize(words)
#     remove punctuation
#     words = re.sub(ur"\p{P}+", "", words)
    c = Counter(words)
    
    for p in string.punctuation:
        del c[p]
    
    from IPython.display import display, HTML
    print("Most common Burns words (including stopwords):")
    df = pd.DataFrame.from_dict(c.most_common(25))
    df.columns = ['Word', 'Frequency']
    display(df)

    # remove stopwords
    from nltk.corpus import stopwords
    for word in stopwords.words('english'):
        del c[word]
        
     
    print("Most common Burns words (without stopwords):")
    df = pd.DataFrame.from_dict(c.most_common(25))
    df.columns = ['Word', 'Frequency']
    display(df)

In [11]:
common_words(burns)

Most common Burns words (including stopwords):


Unnamed: 0,Word,Frequency
0,the,26634
1,and,15308
2,to,9050
3,of,8848
4,a,8572
5,s,6787
6,in,6356
7,i,6310
8,that,5036
9,his,4482


Most common Burns words (without stopwords):


Unnamed: 0,Word,Frequency
0,—,1944
1,er,1783
2,thy,1718
3,ye,1665
4,wi,1502
5,thou,1394
6,like,1128
7,may,1103
8,man,1058
9,heart,969
