In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
import re
import sklearn as sk
import nltk
import matplotlib.pyplot as plt
from __future__ import print_function
from __future__ import division

In [38]:
# Fetch the Punkt tokenizer data; presumably what the nltk.sent_tokenize /
# nltk.word_tokenize calls later in the notebook rely on (TODO confirm).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jharvill\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Import and Cleaning

### Importing raw text

In [39]:
# Load the complete raw text into a single string for the regex-based parsing below.
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm it matches the encoding of the text file.
raw_file_path = './federalist_papers_raw_gutenburg.txt'
with open(raw_file_path, 'r') as f:
    raw = f.read()

### Parsing individual papers into a dataframe

The dataframe contains the paper number (e.g. FEDERALIST No. X) and the text body of the paper.

In [40]:
# Character offsets of every literal 'FEDERALIST' in the raw text; each one is a
# candidate start of a paper.
indices = [match.start() for match in re.finditer('FEDERALIST', raw)]

# Some occurrences do not open a paper, so preview the 18 characters
# found at each offset to see which ones look like paper headings.
for i in indices:
    preview = raw[i:i + 18]
    print(preview)

FEDERALIST PAPERS 
FEDERALIST PAPERS

FEDERALIST No. 1


FEDERALIST No. 2


FEDERALIST No. 3


FEDERALIST No. 4


FEDERALIST No. 5


FEDERALIST No. 6


FEDERALIST No. 7


FEDERALIST No. 8


FEDERALIST No. 9


FEDERALIST No. 10

FEDERALIST No. 11

FEDERALIST No. 12

FEDERALIST No. 13

FEDERALIST No. 14

FEDERALIST No. 15

FEDERALIST No. 16

FEDERALIST No. 17

FEDERALIST No. 18

FEDERALIST No. 19

FEDERALIST No. 20

FEDERALIST No. 21

FEDERALIST No. 22

FEDERALIST No. 23

FEDERALIST No. 24

FEDERALIST No. 25

FEDERALIST No. 26

FEDERALIST No. 27

FEDERALIST No. 28

FEDERALIST No. 29

FEDERALIST No. 30

FEDERALIST No. 31

FEDERALIST No. 32

FEDERALIST No. 33

FEDERALIST No. 34

FEDERALIST No. 35

FEDERALIST No. 36

FEDERALIST No. 37

FEDERALIST No. 38

FEDERALIST No. 39

FEDERALIST No. 40

FEDERALIST No. 41

FEDERALIST No. 42

FEDERALIST No. 43

FEDERALIST No. 44

FEDERALIST No. 45

FEDERALIST No. 46

FEDERALIST No. 47

FEDERALIST No. 48

FEDERALIST No. 49

FEDERALIST No. 50

FEDERALIST N

In [41]:
### Parsing and creating dataframe
# Every occurrence of 'FEDERALIST' opens a candidate chunk that runs up to the next
# occurrence; the final chunk runs to the end of the text (end=None).
paper_records = []  # one {'num', 'body'} dict per paper actually found
for start, end in zip(indices, indices[1:] + [None]):
    full = raw[start:end] # full text belonging to this instance of 'FEDERALIST'

    # Searching for the salutation that is only found at the very beginning of a paper
    body_start = re.search('To the People of the State of New York', full)
    if not body_start:
        # no salutation, so this instance of 'FEDERALIST' does not begin a paper
        continue

    # the body starts right after the salutation (+1 skips the character following it)
    body = full[body_start.end() + 1:].strip()
    title = full[0:20] # heading of the chunk, which carries the paper number
    paper_num = int(re.findall(r'\d+', title)[0]) # first run of digits in the heading

    paper_records.append({'num': paper_num, 'body': body})

# Build the frame once from the collected records instead of appending row by row:
# DataFrame.append copies the whole frame on every call and no longer exists in pandas 2.x.
# The paper numbers become the index.
data = pd.DataFrame(paper_records, columns=['num', 'body']).set_index('num')

In [42]:
# Inspect the parsed papers: the index is the paper number, 'body' holds the text.
data

Unnamed: 0_level_0,body
num,Unnamed: 1_level_1
1,AFTER an unequivocal experience of the ineffic...
2,WHEN the people of America reflect that they a...
3,IT IS not a new observation that the people of...
4,MY LAST paper assigned several reasons why the...
5,"QUEEN ANNE, in her letter of the 1st July, 170..."
6,THE three last numbers of this paper have been...
7,"IT IS sometimes asked, with an air of seeming ..."
8,ASSUMING it therefore as an established truth ...
9,A FIRM Union will be of the utmost moment to t...
10,AMONG the numerous advantages promised by a we...


### Cleaning text bodies

In [43]:
def clean_text_body(body):
    '''
    Normalise the whitespace in the body of a federalist paper.

    Every run of consecutive whitespace characters (spaces, tabs, newlines, ...)
    is replaced by a single space. This is the only cleaning step so far;
    further steps can be added here.
    '''
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', body)

In [44]:
# Run the whitespace clean-up over the text of every paper
data['body'] = data['body'].map(clean_text_body)

### Saving CSV

In [45]:
# Write the parsed and cleaned papers to disk so later sections can start from the CSV.
save_file_path = './cleaned_papers_testing.csv'
data.to_csv(save_file_path)

### Reading CSV

In [46]:
# Reload the papers from disk. No index_col is passed, so the paper number comes back as an
# ordinary 'num' column and the frame gets a default integer index.
data = pd.read_csv(save_file_path)

# EDA

Borrowing liberally from two Kaggle NLP tutorials.

Spooky Authorship: https://www.kaggle.com/cgump3rt/spooky-eda

Movie Reviews: https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words

### Creating Labels and Training Data

Per Mosteller and Wallace (1963), 12 papers are disputed between Hamilton and Madison (49-58, 62, 63). Jay wrote just 5 (2-5, 64). Three were co-written by Hamilton and Madison (18-20), although the level of contribution from each is disputed. Madison wrote 14 and Hamilton wrote 43. Confusingly, these counts add up to only 77 (12 + 5 + 3 + 14 + 43), which is also the number of Federalist papers they report, yet there appear to be 85 in total. It is not clear where this discrepancy comes from. Wikipedia lists the papers as described in Douglass Adair's essay _The Disputed Federalist Papers_, with footnotes indicating which are disputed or joint. https://en.wikipedia.org/wiki/The_Federalist_Papers

From this, we have Madison as the sole author for 14 (10, 14, 37-48) and Hamilton as the sole author for the remaining 51 (1, 6-9, 11-13, 15-17, 21-36, 59-61, 65-85).

For labels, h=Hamilton, m=Madison, j=Jay, hm=Hamilton and Madison, d=disputed.

In [47]:
# Paper numbers (1-based) for every category other than Hamilton
j = np.array([2, 3, 4, 5, 64])
m = np.array([10, 14] + list(range(37, 49)))
hm = np.array([18, 19, 20])
d = np.array(list(range(49, 59)) + [62, 63])

# Start with every one of the 85 papers attributed to Hamilton, then overwrite the
# other categories; paper number n lives at position n-1
labels = np.full(85, 'h', dtype=object)
for code, papers in (('j', j), ('m', m), ('hm', hm), ('d', d)):
    labels[papers - 1] = code

data['author'] = labels # author label per paper, assigned by position

# integer encoding of the author labels
encoder = sk.preprocessing.LabelEncoder()
data['label'] = encoder.fit_transform(data['author'])

# Flag papers whose single author is known. Dual-authorship counts as unknown.
data['known'] = data['author'].isin(['j', 'h', 'm'])

In [62]:
# Adair believes that all of the joint and disputed texts were written by Madison, and that Hamilton had little if any input.
# Work on a copy: a plain `adair_labels = labels` only creates a second name for the same
# array, so the in-place assignment below would also overwrite the 'hm'/'d' entries of `labels`.
adair_labels = labels.copy()
adair_labels[np.concatenate([hm,d]) - 1] = 'm'

data['adair'] = adair_labels

In [63]:
# Inspect the frame now that the author, label, known and adair columns have been added.
data

Unnamed: 0,num,body,author,label,known,adair
0,1,AFTER an unequivocal experience of the ineffic...,h,1,True,h
1,2,WHEN the people of America reflect that they a...,j,3,True,j
2,3,IT IS not a new observation that the people of...,j,3,True,j
3,4,MY LAST paper assigned several reasons why the...,j,3,True,j
4,5,"QUEEN ANNE, in her letter of the 1st July, 170...",j,3,True,j
5,6,THE three last numbers of this paper have been...,h,1,True,h
6,7,"IT IS sometimes asked, with an air of seeming ...",h,1,True,h
7,8,ASSUMING it therefore as an established truth ...,h,1,True,h
8,9,A FIRM Union will be of the utmost moment to t...,h,1,True,h
9,10,AMONG the numerous advantages promised by a we...,m,4,True,m


For now, training data is all known papers

In [67]:
# Papers flagged as known go to the training set; everything else is held out.
# Both are copies, so adding columns to them leaves `data` untouched.
is_known = data['known']
train = data.loc[is_known].copy()
test = data.loc[~is_known].copy()

# Exploring characteristics of entire corpus

In [68]:
# Sanity check on the label counts:
# 12 disputed and 3 joint papers are withheld from the full 85, so 70 should remain
train.groupby('author')['label'].value_counts()

author  label
h       1        51
j       3         5
m       4        14
Name: label, dtype: int64

In [69]:
# Size of each paper: sentence and word counts come from the NLTK tokenizers,
# the character count is the length of the body string
train['n_sentences'] = train['body'].transform(lambda text: len(nltk.sent_tokenize(text)))
train['n_words'] = train['body'].transform(lambda text: len(nltk.word_tokenize(text)))
train['n_characters'] = train['body'].transform(lambda text: len(text))

# Totals per author
count_columns = ['n_sentences', 'n_words', 'n_characters']
train.groupby('author')[count_columns].sum()

Unnamed: 0_level_0,n_sentences,n_words,n_characters
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
h,3465,125577,667001
j,220,9328,50036
m,1176,43309,233306


In [80]:
# initialize count vectorizer (word-level bag of words)
cv = sk.feature_extraction.text.CountVectorizer(analyzer = "word")

# fitting bag of words model and learning the vocabulary
train_features = cv.fit_transform(train.body)

# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use the new method when it exists and fall back to the old one otherwise.
# list(...) keeps `vocab` a plain list in both cases.
if hasattr(cv, 'get_feature_names_out'):
    vocab = list(cv.get_feature_names_out())
else:
    vocab = cv.get_feature_names()

print('Learned vocab size:', len(vocab))
print('Shape of term-document matrix [n_samples, vocabulary_size]:', train_features.shape)

Learned vocab size: 8095
Shape of term-document matrix [n_samples, vocabulary_size]: (70, 8095)


In [101]:
# One indicator column per author, in the order the authors first appear in `train`.
authors = train.author.unique()
# `classes` is passed by keyword: it is keyword-only in scikit-learn >= 1.0, and the
# keyword form is also accepted by older releases.
binarized = sk.preprocessing.label_binarize(train.author, classes=authors)

In [102]:
# Per-author term totals. NOTE(review): train_features is a scipy sparse matrix, for which `*`
# appears to act as a matrix product (authors x papers times papers x vocabulary) rather than
# an elementwise product — confirm if the sparse type ever changes.
word_counts = binarized.T * train_features

In [103]:
# Flip to one row per vocabulary term and one column per author.
# NOTE(review): this rebinds `word_counts` to its own transpose, so running the cell a
# second time flips it back again.
word_counts = word_counts.T
word_counts

array([[ 2,  0,  0],
       [ 2,  0,  0],
       [ 2,  0,  0],
       ..., 
       [ 0,  0,  1],
       [12,  1,  9],
       [ 6,  0,  2]], dtype=int64)

In [107]:
# Shape check on the per-term totals (one value per vocabulary term).
# Uses `word_counts` rather than `count_df`: `count_df` is only created in a later cell,
# so referencing it here fails when the notebook is run top to bottom on a fresh kernel.
word_counts.sum(axis=1).shape

(8095,)

In [110]:
# Term-by-author count table with a per-term total, ordered from the most to the
# least frequent term
count_df = (
    pd.DataFrame(word_counts, columns=authors, index=vocab)
    .assign(total=lambda counts: counts.sum(axis=1))
    .sort_values(by='total', ascending=False)
)
count_df

Unnamed: 0,h,j,m,total
the,10351,516,3876,14743
of,7230,359,2307,9896
to,4547,288,1247,6082
and,2721,408,1164,4293
in,2829,164,808,3801
be,2300,160,754,3214
that,1717,150,542,2409
it,1549,138,497,2184
is,1329,57,481,1867
which,1245,56,424,1725


In [95]:
# NOTE(review): `temp` is not defined in any cell visible here, so this cell fails on a
# fresh kernel — define it in this cell or remove the cell. TODO confirm what it held.
np.sum(temp)

156457

In [81]:
# Summary repr of the term-document matrix (shape, dtype, number of stored elements).
train_features

<70x8095 sparse matrix of type '<class 'numpy.int64'>'
	with 50025 stored elements in Compressed Sparse Row format>

In [24]:
# Print the complete learned vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out() when
# available. list(...) makes both variants print as a plain, untruncated list.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
print(list(feature_names))

