In [1]:
# !python --version

Python 3.11.4


In [3]:
# !pip install selenium

# Crafting links

In [43]:
[i for i in range(1, 4)]

[1, 2, 3]

In [48]:
# Home page for Eyewitness Testimonies
root_url = 'https://www.testifyingtothetruth.co.uk/viewer/search/-/PI%3A*/1/SORT_SHELFMARK/BOOL_HASENGLISHTRANSLATION%3Atrue%3B%3B/'

In [53]:
# Build the URL of every search-result page (page numbers 1 through 61).
page_url_prefix = 'https://www.testifyingtothetruth.co.uk/viewer/search/-/PI%3A*/'
page_url_suffix = '/SORT_SHELFMARK/BOOL_HASENGLISHTRANSLATION%3Atrue%3B%3B/'

eyewitness_pages = [
    page_url_prefix + str(page_number) + page_url_suffix
    for page_number in range(1, 62)
]

In [55]:
eyewitness_pages[0]

'https://www.testifyingtothetruth.co.uk/viewer/search/-/PI%3A*/1/SORT_SHELFMARK/BOOL_HASENGLISHTRANSLATION%3Atrue%3B%3B/'

Next, for each page, extract each Report using BeautifulSoup.

# Beautiful Soup

In [2]:
import requests
from bs4 import BeautifulSoup
import pickle

In [63]:
# Function to scrape and process links
def get_page_links(url, timeout=30):
    """
    Collect the report links listed on one search-results page.

    Inputs:
        url (str): Address of the page to fetch.
        timeout (float): Seconds to wait for the server before requests
            gives up. Without a timeout a stalled connection blocks forever.

    Returns:
        (list): The href of the first <a> inside every
            <div class="search-list__hit-title">, in page order. Divs with no
            link, or whose link has no href, are skipped. The list is empty
            (and a warning is emitted) when the status code is not 200.

    Raises:
        Whatever requests.get raises on connection errors or timeouts.
    """
    import warnings

    link_urls = []
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Make the failure visible: an empty list alone is indistinguishable
        # from a results page that simply has no hits.
        warnings.warn(f"Skipping {url}: HTTP status {response.status_code}")
        return link_urls

    soup = BeautifulSoup(response.content, 'html.parser')

    for div in soup.find_all('div', class_='search-list__hit-title'):
        link = div.find('a')
        href = link.get('href') if link else None
        # Never hand None (or an empty string) to later string processing.
        if href:
            link_urls.append(href)

    return link_urls

In [64]:
# Transform links (get english version of pages)
def get_english_page(url):
    """
    Rewrite a report URL by applying two substring substitutions in order:
    "/metadata/" becomes "/fulltext/", then "1/eng/" becomes "en/".

    Inputs:
        url (str): URL to rewrite.

    Returns:
        (str): The rewritten URL (unchanged if neither substring occurs).
    """
    substitutions = (
        ("/metadata/", "/fulltext/"),
        ("1/eng/", "en/"),
    )
    for old_part, new_part in substitutions:
        url = url.replace(old_part, new_part)
    return url

In [65]:
# Visit every search-result page, take the report links found on it, and
# convert each one with get_english_page. The resulting list holds the pages
# that are scraped with bs4 afterwards.
urls_to_visit = [
    get_english_page(report_link)
    for results_page in eyewitness_pages
    for report_link in get_page_links(results_page)
]

In [66]:
# Resulting list should be of length 721
len(urls_to_visit)

721

In [67]:
# # Save list as pickle
# import pickle

# # File path for the pickle file
# file_path = 'all_urls_to_visit.pickle'

# # Write the list to the pickle file
# with open(file_path, 'wb') as f:
#     pickle.dump(urls_to_visit, f)

# print("List has been written to", file_path)

List has been written to all_urls_to_visit.pickle


In [5]:
import os

In [6]:
os.getcwd()

'/Users/ez/Desktop/text_memory_and_identity/data'

In [3]:
# Reload the list of report URLs that was previously saved as a pickle.
pickle_file_path = 'all_urls_to_visit.pickle'

with open(pickle_file_path, mode='rb') as pickle_handle:
    urls_to_visit = pickle.load(pickle_handle)

In [4]:
urls_to_visit[640:650]

['https://www.testifyingtothetruth.co.uk/viewer/fulltext/106396/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106397/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106398/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106399/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106405/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106408/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106410/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106413/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106415/en/',
 'https://www.testifyingtothetruth.co.uk/viewer/fulltext/106420/en/']

# Scrape Page Content

In [5]:
import regex as re

In [23]:
def get_metadata_text(soup):
    """
    Extract the numbered metadata entries (title, date, num pages, etc.)
    from a parsed page.

    Every <strong> element whose text starts with "<digits>." is treated as
    an entry label. The label is the text after the first ". "; the value is
    the first text sibling that follows the <strong> element, with
    surrounding whitespace, one trailing period and any leading ':' / ' '
    characters removed.

    Inputs:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        (list): A list of tuples containing: ('metadata type', 'value').
            The value is '' when no text sibling follows the label.
    """
    bullet_points = []

    for bullet_name in soup.find_all('strong'):
        bullet_name_text = bullet_name.get_text(strip=True)

        # Only labels of the form "<number>. <name>" are metadata entries.
        if not re.match(r'^\d+\.', bullet_name_text):
            continue

        # Drop the leading "<number>. "; a label with nothing after the
        # number (e.g. "3.") carries no name and is skipped.
        name_parts = bullet_name_text.split('. ', 1)
        if len(name_parts) < 2:
            continue
        bullet_name_text = name_parts[1]

        # `string=` replaces the deprecated `text=` keyword, which triggered
        # a warning on every call.
        # NOTE(review): only the first text node is captured, so a value that
        # continues after a nested tag is presumably cut short — TODO confirm.
        bullet_text_sibling = bullet_name.find_next_sibling(string=True)

        if bullet_text_sibling:
            bullet_text = bullet_text_sibling.strip()
            # Remove one trailing period (and whitespace after it).
            bullet_text = re.sub(r'\.\s*$', '', bullet_text)
            # Remove colons and spaces at the beginning of the value.
            bullet_text = bullet_text.lstrip(': ')
        else:
            bullet_text = ""

        bullet_points.append((bullet_name_text, bullet_text))

    return bullet_points


def get_main_text(soup):
    """
    Extract the main body of the eyewitness testimony from a parsed page.

    The body is taken from the second <seg> element of the page: the text of
    each <p> inside it is stripped and the pieces are joined with one space.

    Inputs:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        (tuple): formatted as ('Text', 'main body text here')

    Raises:
        IndexError: if the page has fewer than two <seg> elements.
    """
    # Index 1 selects the second <seg> on the page.
    body_segment = soup.find_all('seg')[1]

    body_text = ' '.join(
        paragraph.get_text(strip=True)
        for paragraph in body_segment.find_all('p')
    )

    return ('Text', body_text)

In [234]:
# # TESTING
# response = requests.get(urls_to_visit[200])

# bullet_points = []
# if response.status_code == 200:
#     soup = BeautifulSoup(response.content, 'html.parser')

#     # Find all <strong> elements
#     bullet_names = soup.find_all('strong')

#     # Extract the text of the bullet points and remove leading number and period
#     for bullet_name in bullet_names:
#         # Extract the text of the <strong> element
#         bullet_name_text = bullet_name.get_text(strip=True)
        
#         # Check if the text starts with a number followed by a period
#         if re.match(r'^\d+\.', bullet_name_text):
#             # If it does, remove leading number and period from bullet name
#             bullet_name_text = bullet_name_text.split('. ', 1)[1] if bullet_name_text[0].isdigit() else bullet_name_text
            
#             # Find the next sibling containing text
#             bullet_text_sibling = bullet_name.find_next_sibling(text=True)
            
#             # If bullet_text_sibling is not None, extract its text and strip leading/trailing whitespace
#             if bullet_text_sibling:
#                 bullet_text = bullet_text_sibling.strip()

#                 # Remove trailing bullet point using regular expressions
#                 bullet_text = re.sub(r'\.\s*$', '', bullet_text)
#                 # Remove colon and space at the beginning of bullet text
#                 bullet_text = bullet_text.lstrip(': ')
#             else:
#                 bullet_text = ""  # If bullet_text_sibling is None, set bullet_text to an empty string
            
#             # Add the bullet point to the list
#             bullet_points.append((bullet_name_text, bullet_text))

# print(len(bullet_points))
# bullet_points

  bullet_text_sibling = bullet_name.find_next_sibling(text=True)


In [239]:
# # Scrape page text - TEST

# urls_to_visit2 = urls_to_visit[:10]
# eyewitness_data = []

# for url in urls_to_visit2:
#     response = requests.get(url)

#     if response.status_code == 200:
#         soup = BeautifulSoup(response.content, 'html.parser')

#         metadata_text = get_metadata_text(soup)

#         main_text = get_main_text(soup)

#     # Since metadata_text is a list, append main_text to metadata_text, and 
#     # append both to new list
#     metadata_text.append(main_text)
#     eyewitness_data.append(metadata_text)

  bullet_text_sibling = bullet_name.find_next_sibling(text=True)


In [242]:
# # TEST output
# len(eyewitness_data)
# eyewitness_data[:1]

[[('Index Number', 'P.I.a. No. 62'),
  ('Title of Document', 'The Jews in Pomerania'),
  ('Date', ''),
  ('Number of pages', '10'),
  ('Author of Source', 'Dr. Ernst Alban'),
  ('Recorded by', 'as above, March 1955'),
  ('References',
   'Rabbiner Dr. Elk, Rabbiner Dr. Vogelstein, Rabbiner Dr. Max Wiener (p.10); Paul Hirschfeld, liaison officer between'),
  ('Form and Contents',
   'A report (in two parts: A. The Jews in Pomerania before 1933, B. The Jews in Pomerania since 1933) dealing with the remarkable economic prosperity of the Pomeranian Jews up to 1933 and their gradual well-planned elimination thereafter'),
  ('Text',
   'The Jews in Pomerania I, Dr. Ernst Alban, hereby transfer all literary copyright of the 12 attached handwritten pages “Die Juden in Pommern” [The Jews in Pomerania] to TheWiener Library, London, W.1. I further declare that I will not subject TheWiener Libraryor any individual to any pecuniary claims on my behalf for writing or taking minutes of the attached w

In [55]:
# # PUTTING IT ALL TOGETHER!
# # Export to pickle as data is collected
# eyewitness_data = []

# # Open the pickle file in append mode
# with open('text_data_TEST.pkl', 'ab') as file:
#     # Generate data

#     for url in urls_to_visit[3:6]:
#         response = requests.get(url)

#         if response.status_code == 200:
#             soup = BeautifulSoup(response.content, 'html.parser')

#             metadata_text = get_metadata_text(soup)

#             main_text = get_main_text(soup)

#         # Since metadata_text is a list, append main_text to metadata_text, and 
#         # append both to new list
#         metadata_text.append(main_text)
#         eyewitness_data.append(metadata_text)

#     pickle.dump(metadata_text, file)

  bullet_text_sibling = bullet_name.find_next_sibling(text=True)


In [None]:
# print(len(eyewitness_data))
# eyewitness_data

In [30]:
# NEW & Timed
from tqdm import tqdm

def save_to_pickle(data, filename):
    """
    Append one pickled object to the end of a file.

    The file is opened in append-binary mode, so repeated calls store the
    objects one after another instead of overwriting earlier ones.

    Inputs:
        data: Object to pickle.
        filename (str): Path of the file to append to.
    """
    with open(filename, mode='ab') as sink:
        pickle.Pickler(sink).dump(data)

# Index of the first URL handled by this run.
# NOTE(review): presumably earlier URLs were scraped in previous runs into the
# other eyewitness_testimonies_*.pickle files — confirm before changing.
START_INDEX = 514
OUTPUT_PICKLE = 'eyewitness_testimonies_514_to_end.pickle'
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever

eyewitness_data = []
failed_urls = []  # (url, reason) for every page that could not be fetched

# tqdm wraps the iterable directly and advances the bar once per URL.
for url in tqdm(urls_to_visit[START_INDEX:]):
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Record the failure and carry on instead of aborting a long run.
        failed_urls.append((url, repr(exc)))
        continue

    if response.status_code != 200:
        failed_urls.append((url, response.status_code))
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # One record = the metadata tuples followed by the ('Text', ...) tuple.
    record = get_metadata_text(soup)
    record.append(get_main_text(soup))
    eyewitness_data.append(record)

    # Save each record as soon as it is scraped, so an error later in the
    # run does not lose the pages already collected. The file is opened in
    # append mode: re-running this cell adds the records a second time.
    save_to_pickle(record, OUTPUT_PICKLE)

if failed_urls:
    print(f"{len(failed_urls)} page(s) could not be fetched; see failed_urls")

  bullet_text_sibling = bullet_name.find_next_sibling(text=True)
100%|██████████| 207/207 [1:03:09<00:00, 18.31s/it]


In [31]:
len(eyewitness_data)

207

In [28]:
# for inner_list in eyewitness_data:
#     save_to_pickle(inner_list, 'eyewitness_testimonies_514_to_end.pickle')

# Combine & make complete raw Eyewitness data

In [48]:
import pandas as pd
import pickle

In [34]:
# Load pickles 2
def load_pickle_file(pickle_file_path, loaded_data):
    '''
    Read every pickled object stored back-to-back in a file and append each
    one, in file order, to loaded_data.

    Inputs:
        pickle_file_path (str): Path to the pickle file
        loaded_data (list): List that receives the loaded objects (modified
            in place)

    Returns: None
    '''
    with open(pickle_file_path, 'rb') as source:
        while True:
            try:
                record = pickle.load(source)
            except EOFError:
                # No more objects in the file.
                return
            loaded_data.append(record)

# Pickle files to combine, in the order their records are appended.
pickle_file_paths = [
    'eyewitness_testimonies_44.pickle',
    'eyewitness_testimonies_44_to_131.pickle',
    'eyewitness_testimonies_131_to_242.pickle',
    'eyewitness_testimonies_242_to_296.pickle',
    'eyewitness_testimonies_297_to_315.pickle',
    'eyewitness_testimonies_316_to_514.pickle',
    'eyewitness_testimonies_514_to_end.pickle',
]

# Start from an empty list so re-running the cell does not accumulate copies,
# then let load_pickle_file extend it with the records of each file.
loaded_data = []
for pickle_file_path in pickle_file_paths:
    load_pickle_file(pickle_file_path, loaded_data)

In [None]:
# Display the loaded data
print(len(loaded_data))
loaded_data

In [41]:
# Each record is a list of (label, value) tuples: convert every record to a
# dict so that labels become DataFrame columns and each record becomes a row.
records_as_dicts = list(map(dict, loaded_data))
eyewitness_all_data = pd.DataFrame(records_as_dicts)

In [44]:
eyewitness_all_data.describe()

Unnamed: 0,Index Number,Title of Document,Date,Number of pages,Author of Source,Recorded by,References,Form and Contents,Text,Author or Source,...,Recoded by,Remarks :,Remark by the analyst,Remarks by analyst and further references,Contents and Form,Form and Contrats,index Number,Recorded :,Recorded:,Title of document :
count,700,692.0,684.0,514,14,553,163.0,684.0,721.0,650,...,1.0,1,1,1,1,1,1,1,1,1
unique,680,625.0,420.0,58,14,318,150.0,676.0,717.0,558,...,1.0,1,1,1,1,1,1,1,1,1
top,P.III.h. (,,,3,Dr. Ernst Alban,as above,,,,Anonymous,...,,This report is copied from the original photo-...,"Josef Klaber, a Jew, was the founder of the Gh...",The author of this report is the descendant of...,A statement by the author who was an eyewitnes...,A personal report by the composer R. H. on his...,P.III.i. (Sweden) No. 465,May 1957,"Melbourne, 12 March 1957",Wartime Conditions in Indonesia
freq,17,38.0,30.0,73,1,78,13.0,7.0,4.0,46,...,1.0,1,1,1,1,1,1,1,1,1


In [45]:
# Keep only the first row for each distinct 'Text' value (717 rows remain).
# NOTE(review): the describe() output above lists the most frequent 'Text'
# value as blank with freq 4, so this presumably also collapses rows whose
# text is empty rather than truly duplicated — TODO confirm those rows are
# not distinct reports whose text failed to scrape.
eyewitness_all_data = eyewitness_all_data.drop_duplicates(subset = ['Text'])

In [47]:
eyewitness_all_data.shape

(717, 109)

In [61]:
eyewitness_all_data[:2]

Unnamed: 0,Index Number,Title of Document,Date,Number of pages,Author of Source,Recorded by,References,Form and Contents,Text,Author or Source,...,Recoded by,Remarks :,Remark by the analyst,Remarks by analyst and further references,Contents and Form,Form and Contrats,index Number,Recorded :,Recorded:,Title of document :
0,P.I.a. No. 62,The Jews in Pomerania,,10,Dr. Ernst Alban,"as above, March 1955","Rabbiner Dr. Elk, Rabbiner Dr. Vogelstein, Rab...",A report (in two parts: A. The Jews in Pomeran...,"The Jews in Pomerania I, Dr. Ernst Alban, here...",,...,,,,,,,,,,
1,P.I.a. No. 115,Reminiscences Regarding the Youth of Dr. Josep...,,1,Dr. Johnston,"Dr. Berent, July 1955","Dr. Friedrich Gundolf (Germanist); Rheydt, Rhi...",A few - not very significant - recollections o...,Reminiscences Regarding the Youth of Dr. Josep...,,...,,,,,,,,,,


In [54]:
# Save
# eyewitness_all_data.to_csv("eyewitness_all_data.csv", header=True, index=False)

In [55]:
# Read in
# test = pd.read_csv("eyewitness_all_data.csv")

#### Test: reload the saved CSV and compute the average text length

In [2]:
import pandas as pd

In [4]:
test = pd.read_csv('/Users/ez/Desktop/text_memory_and_identity/data collection/eyewitness_testimonies_subcollection/eyewitness_all_data.csv')

In [5]:
# A cell only renders its last expression, so the shape has to be printed
# explicitly; otherwise it is computed and silently discarded.
print(test.shape)
test[:2]

Unnamed: 0,Index Number,Title of Document,Date,Number of pages,Author of Source,Recorded by,References,Form and Contents,Text,Author or Source,...,Recoded by,Remarks :,Remark by the analyst,Remarks by analyst and further references,Contents and Form,Form and Contrats,index Number,Recorded :,Recorded:,Title of document :
0,P.I.a. No. 62,The Jews in Pomerania,,10,Dr. Ernst Alban,"as above, March 1955","Rabbiner Dr. Elk, Rabbiner Dr. Vogelstein, Rab...",A report (in two parts: A. The Jews in Pomeran...,"The Jews in Pomerania I, Dr. Ernst Alban, here...",,...,,,,,,,,,,
1,P.I.a. No. 115,Reminiscences Regarding the Youth of Dr. Josep...,,1,Dr. Johnston,"Dr. Berent, July 1955","Dr. Friedrich Gundolf (Germanist); Rheydt, Rhi...",A few - not very significant - recollections o...,Reminiscences Regarding the Youth of Dr. Josep...,,...,,,,,,,,,,


In [6]:
from collections import Counter

In [7]:
def gen_avg_num_chrs(df, text_col='Text'):
    """
    Average number of characters per row in one text column.

    Inputs:
        df (pd.DataFrame): Frame holding the text column.
        text_col (str): Name of the column to measure. Defaults to 'Text',
            so existing calls that pass only the frame keep working.

    Returns:
        Sum of the string lengths in df[text_col] divided by the number of
        rows in df. Rows with a missing value add nothing to the sum but
        still count in the denominator.
    """
    total_characters = df[text_col].str.len().sum()
    num_cells = len(df)
    return total_characters / num_cells

In [8]:
gen_avg_num_chrs(test)

14757.87029288703

#### Average number of characters per abstract

In [9]:
test_abs = pd.read_csv('/Users/ez/Desktop/text_memory_and_identity/data collection/s2orc_abstracts_subcollection/abstracts_all_data.csv')

In [13]:
test_abs.shape
# test_abs[:2]

(766, 3)

In [11]:
def gen_avg_num_chrs(df, text_col='abstract'):
    """
    Average number of characters per row in one text column.

    NOTE: this redefines gen_avg_num_chrs from an earlier cell, which
    measures the 'Text' column. Once this cell has run, calls that pass only
    the frame measure 'abstract'; pass text_col explicitly to measure
    another column.

    Inputs:
        df (pd.DataFrame): Frame holding the text column.
        text_col (str): Name of the column to measure. Defaults to
            'abstract'.

    Returns:
        Sum of the string lengths in df[text_col] divided by the number of
        rows in df. Rows with a missing value add nothing to the sum but
        still count in the denominator.
    """
    total_characters = df[text_col].str.len().sum()
    num_cells = len(df)
    return total_characters / num_cells

In [12]:
gen_avg_num_chrs(test_abs)

1703.7232375979113