In [75]:
import pandas as pd
import requests
import re
import itertools
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

In [76]:
# Load the witness index (Windows-1252 encoded CSV) and keep only the rows
# that have a value in every column.
index = pd.read_csv(
    "../raw/napier_index.csv",
    encoding='windows-1252',
).dropna()

In [77]:
# Preview the first rows of the loaded index to check its columns and values.
index.head()

Unnamed: 0,Vol,Page,Area,Witness,Location,Url
0,1,1,"Skye, Braes",Angus Stewart,Beinn-a-chorrain,http://napier-skye.blogspot.com/2010/07/braes-...
1,1,2,"Skye, Braes",Alexander Macdonald,Portree,http://napier-skye.blogspot.com/2010/07/braes-...
2,1,3,"Skye, Braes",Angus Stewart,Beinn-a-chorrain,http://napier-skye.blogspot.com/2010/07/braes-...
3,1,11,"Skye, Braes",Samuel Nicolson,Balmeanach,http://napier-skye.blogspot.com/2010/07/braes-...
4,1,16,"Skye, Braes",Neil Macpherson,Gedentailler,http://napier-skye.blogspot.com/2010/07/braes-...


In [None]:
def extract_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    t = soup.find(class_ = "post-body entry-content")
    t = t.text.strip()
    lines = [line.strip() 
            for line in t.split("\n\n") 
            if line.strip()]
    lines = [l.strip() 
            for line in lines 
            for l in re.split(r"(?<![^\n])(\d+)(?:\.)?(?=\s+)", line, flags=re.M) 
            if l.strip()]
    lines = [("".join([c for c in line_num if c.isdigit()]), line) 
            for line_num, line in itertools.pairwise(lines) 
            if line_num[0].isdigit()]
    
    records = []
    for line_num, line in lines:
        parts = line.split("\n—")
        # Remove line numbers
        parts = [re.sub("(\\d+\\.)", "", p) for p in parts]
        interviewer, utterance, question, answer = None, None, None, None
        # If 3 parts, assume interviewer identification, question, and witness answer
        if len(parts) == 3:
            interviewer, question, answer = parts
            interviewer = interviewer.strip()
            question = question.strip()
            answer = answer.strip()
        # If 2 parts, assume question and answer
        elif len(parts) == 2:
            utterance, answer = parts
            utterance = utterance.strip()
            answer = answer.strip()
        elif len(parts) == 1:
            utterance = parts[0].strip()
        else:
            print("error:", url)
            return (url, "ERROR", parts)
            raise ValueError("Encountered line with more than 3 parts")
        if utterance and utterance.endswith("?"):
            question = utterance
            utterance = None
        records.append([line_num, interviewer, utterance, question, answer])
    df = pd.DataFrame.from_records(records, columns=["LineNo", "Interviewer", "Statement", "Question", "Answer"])

    return df

In [169]:
# Spot-check the parser on one page: the URL of the row at position 60 of `index`.
extract_page(index.iloc[60]["Url"])

http://napier-skye.blogspot.com/2010/08/dunvegan-skye-15-may-1883-murdo-mclean.html


Unnamed: 0,LineNo,Interviewer,Statement,Question,Answer
0,3589,The Chairman.,,Have you been freely elected a delegate by you...,Yes.
1,3590,,,How long have you been on your croft ?,Twenty-nine years.
2,3591,,,On the same croft?,"Yes. I would like, before I say anything furth..."
3,3592,,,Who is the factor on the estate to whom you be...,"Mr Robertson, Grishornish."
4,3593,,,Is he here to-day?,He is not present.
...,...,...,...,...,...
179,3768,,,"Did Mr Robertson, or anybody in his behalf, as...",I cannot say that Mr Robertson did ask us out ...
180,3769,Sheriff Nicolson.,,Had you a brother in the army ?,Yes.
181,3770,,,Was your father a crofter?,Yes.
182,3771,,,What rank did your brother rise to?,Lieutenant-Colonel.


In [189]:
def make_record(row):
    """Parse the page a row points to and tag the result with the row's metadata.

    Parameters
    ----------
    row : pandas.Series
        One row of the index. It must have a "Url" entry, and its last label
        is assumed to be that URL column (as in the index frame).

    Returns
    -------
    pandas.DataFrame
        The frame returned by `extract_page`, with every field of `row`
        except the last one added as a constant column.
    tuple
        The error tuple from `extract_page`, passed through unchanged.
    """
    page = extract_page(row["Url"])
    if isinstance(page, tuple):
        # extract_page signals an unparseable page with a tuple; hand it on.
        return page
    # Take the labels from the row itself rather than from a notebook-level
    # frame, so the function does not depend on hidden global state.
    for col in row.index[:-1]:
        page[col] = row[col]
    return page

In [190]:
# Register tqdm's progress-bar variants of apply on pandas objects; the bar is labelled "Parse".
tqdm.pandas(desc="Parse")
# Run make_record once per row of `index`. Each element of `dfs` is whatever
# make_record returned for that row: a parsed DataFrame or an error tuple.
dfs = index.progress_apply(make_record, axis = 1)

Parse:   0%|          | 0/715 [00:00<?, ?it/s]

error: http://napier-skye.blogspot.com/2010/08/skeabost-skye-9-may-1883-john-bethune.html
error: http://napier-skye.blogspot.com/2010/08/uig-10-may-1883-hector-mckenzie.html
error: http://napier-skye.blogspot.com/2010/08/uig-10-may-1883-donald-beaton-herbosta.html
error: http://napier-skye.blogspot.com/2010/08/uig-10-may-1883-donald-mathieson.html
error: http://napier-skye.blogspot.com/2010/08/stenscholl-skye-11-may-1883-donald-ross.html
error: http://napier-skye.blogspot.com/2010/08/stenscholl-skye-11-may-1883-archibald.html
error: http://napier-skye.blogspot.com/2010/08/stenscholl-skye-11-may-1883-norman_6593.html
error: http://napier-skye.blogspot.com/2010/08/dunvegan-skye-15-may-1883-john-mcswan.html
error: http://napier-skye.blogspot.com/2010/08/isle-ornsay-skye-17-may-1883-donald_6530.html
error: http://napier-skye.blogspot.com/2010/08/isle-ornsay-skye-17-may-1883-john_30.html
error: http://napier-skye.blogspot.com/2010/08/isle-ornsay-skye-17-may-1883-donald_1961.html
error: http

In [192]:
# Inspect one parsed result.
# NOTE(review): `dfs[10]` presumably looks up the index label 10, which need not
# be the 11th row because `index` had rows dropped by dropna() — confirm.
dfs[10]

Unnamed: 0,LineNo,Interviewer,Statement,Question,Answer,Vol,Page,Area,Witness,Location
0,593,The Chairman.,,Will you describe to us the condition of the p...,The principal grievances of which they complai...,1,34,"Skye, Braes",John Mcintyre,Sconser
1,594,,,Might I ask whether you have yourself personal...,Yes.,1,34,"Skye, Braes",John Mcintyre,Sconser
2,595,,,If the proprietors were disposed to give to th...,Yes.,1,34,"Skye, Braes",John Mcintyre,Sconser
3,596,,,Would it be possible to put a wire fence round...,"Yes, but at a great deal of expense, which the...",1,34,"Skye, Braes",John Mcintyre,Sconser
4,597,,,What would be the extent of the wire fence nec...,About 3 miles.,1,34,"Skye, Braes",John Mcintyre,Sconser
5,598,,,"If such a fence could be put up, and the wande...",Yes.,1,34,"Skye, Braes",John Mcintyre,Sconser
6,599,,,Would an ordinary wire fence of six or seven w...,In some cases the deer would pass over such a ...,1,34,"Skye, Braes",John Mcintyre,Sconser
7,600,,,Are the houses of the place more than usually ...,They are very bad. Some of them are about the ...,1,34,"Skye, Braes",John Mcintyre,Sconser
8,601,,,Do you attribute the present prevalence of fev...,I could hardly answer that.,1,34,"Skye, Braes",John Mcintyre,Sconser
9,602,Mr Fraser-Mackintosh.,,What kind of fever is it?,Typhoid.,1,34,"Skye, Braes",John Mcintyre,Sconser
