In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re

### Importing raw text

In [2]:
raw_file_path = '../data/federalist_papers_raw_gutenburg.txt'
with open(raw_file_path, 'r') as f:
    raw = f.read()

### Parsing individual papers into a dataframe

The dataframe contains the paper number (e.g. FEDERALIST No. X) and the text body of the paper

In [3]:
# identifying potential paper starting indices
indices = [word.start() for word in re.finditer('FEDERALIST', raw)]

# But not all instances of 'FEDERALIST' are at the beginning of a paper.
for i in indices:
    print raw[i:(i+18)]

FEDERALIST PAPERS 
FEDERALIST PAPERS

FEDERALIST No. 1


FEDERALIST No. 2


FEDERALIST No. 3


FEDERALIST No. 4


FEDERALIST No. 5


FEDERALIST No. 6


FEDERALIST No. 7


FEDERALIST No. 8


FEDERALIST No. 9


FEDERALIST No. 10

FEDERALIST No. 11

FEDERALIST No. 12

FEDERALIST No. 13

FEDERALIST No. 14

FEDERALIST No. 15

FEDERALIST No. 16

FEDERALIST No. 17

FEDERALIST No. 18

FEDERALIST No. 19

FEDERALIST No. 20

FEDERALIST No. 21

FEDERALIST No. 22

FEDERALIST No. 23

FEDERALIST No. 24

FEDERALIST No. 25

FEDERALIST No. 26

FEDERALIST No. 27

FEDERALIST No. 28

FEDERALIST No. 29

FEDERALIST No. 30

FEDERALIST No. 31

FEDERALIST No. 32

FEDERALIST No. 33

FEDERALIST No. 34

FEDERALIST No. 35

FEDERALIST No. 36

FEDERALIST No. 37

FEDERALIST No. 38

FEDERALIST No. 39

FEDERALIST No. 40

FEDERALIST No. 41

FEDERALIST No. 42

FEDERALIST No. 43

FEDERALIST No. 44

FEDERALIST No. 45

FEDERALIST No. 46

FEDERALIST No. 47

FEDERALIST No. 48

FEDERALIST No. 49

FEDERALIST No. 50

FEDERALIST N

In [4]:
### Parsing and creating dataframe
data = pd.DataFrame(columns=['num','body'])
for i in range(len(indices)): # iterate over potential paper beginnings
    start = indices[i]
    if i == len(indices) - 1:
        end = None # used if this is the last element of indices
    else:
        end = indices[i+1]

    full = raw[start:end] # extract full text corresponding to this instance of 'FEDERALIST'
    
    # Searching for string that is only found at very beggining of a paper
    body_start = re.search('To the People of the State of New York', full)
    if body_start:
        # if found, then the paper starts immediately after
        body_start = body_start.end() + 1
    else:
        # no body, so this isn't an instance of 'FEDERALIST' that begins a paper
        # skip this iteration
        continue
    
    
    body = full[body_start: ].strip() # extract the body from the full text
    title = full[0:20] # extract the title from the full text
    paper_num = re.findall(r'\d+', title) # extract paper number from the title
    paper_num = int(paper_num[0]) # converting to integer

    # appending row to the dataframe
    data = data.append({'num':paper_num,'body':body}, ignore_index=True)
    
# setting the paper numbers to be the index
data.set_index('num', inplace=True)

In [5]:
data

Unnamed: 0_level_0,body
num,Unnamed: 1_level_1
1.0,AFTER an unequivocal experience of the ineffic...
2.0,WHEN the people of America reflect that they a...
3.0,IT IS not a new observation that the people of...
4.0,MY LAST paper assigned several reasons why the...
5.0,"QUEEN ANNE, in her letter of the 1st July, 170..."
6.0,THE three last numbers of this paper have been...
7.0,"IT IS sometimes asked, with an air of seeming ..."
8.0,ASSUMING it therefore as an established truth ...
9.0,A FIRM Union will be of the utmost moment to t...
10.0,AMONG the numerous advantages promised by a we...


### Cleaning text bodies

In [6]:
def clean_text_body(body):
    '''
    Function for cleaning the body of a federalist paper.
    It just cleans up the whitespace right now, but we can add more
    '''
    body = re.sub(r'\s+', ' ', body)
    
    return body

In [7]:
data['body'] = data['body'].apply(clean_text_body)

### Saving CSV

In [8]:
save_file_path = './cleaned_papers_testing.csv'
data.to_csv(save_file_path)

### Reading CSV

In [145]:
body = pd.read_csv(save_file_path)