In [5]:
# Import dependencies
try:
    from splinter import Browser
    from bs4 import BeautifulSoup as soup
    import pprint
    import pandas as pd
    import configparser
    print("imported successfully")
except ImportError as e: # If failed to import, return notice
    print("Import faliure:", e)
    
# Load the site credentials from config.ini so they are never written in the notebook.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means config.ini was not found.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini could not be read from the working directory; it must provide "
        "a [credentials] section with 'username' and 'password' keys"
    )

username = config['credentials']['username']
password = config['credentials']['password']

imported successfully


In [6]:
# Initialize splinter/browser
browser = Browser('chrome')

# Visit hymnal
url = 'https://hymnal.pcusastore.com/Hymn/HymnSearch'
browser.visit(url)

# Log in with the credentials read from config.ini.
# Assuming 'UserName' and 'Password' are the names of the input fields
browser.fill('UserName', username)
browser.fill('Password', password)
browser.find_by_css('.linkbutton-login').first.click()  # click the login button

# Click on search page
browser.find_by_css('ul.nav a[href="/Hymn/HymnSearch"]').first.click()
# Find the 'Hymn #' link by XPath and click it
hymn_sort_link_xpath = "//a[contains(@class, 'k-link') and contains(text(), 'Hymn #')]"
browser.find_by_xpath(hymn_sort_link_xpath).first.click()

# Click on the first hymn (explicit .first, consistent with the other clicks above)
browser.find_by_css('a[href="/Hymn/Index/1"]').first.click()

# After logging in, parse the first hymn's HTML with Beautiful Soup
first_hymn_html = browser.html
html_soup = soup(first_hymn_html, 'html.parser')

# Find the 'mainContent' div
main_content = html_soup.find('div', class_='mainContent')
if main_content is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on the next line.
    raise RuntimeError(
        "No <div class='mainContent'> on the first hymn page; "
        "check that the login succeeded and the page finished loading"
    )

# Collect every non-empty, whitespace-stripped text fragment inside the div.
# stripped_strings walks all descendants, so text inside nested tags is included.
text_lines = list(main_content.stripped_strings)

In [3]:
# Initialize an empty list to store hymn data
hymns_data = []
# Hymn numbers whose page had no 'mainContent' div (reported after the loop)
missing_hymns = []

# Hymns are numbered 1 to 853. range() excludes its stop value, so the stop must be
# 854 to reach hymn 853. Hymn 1 is included here because no other cell adds it to
# hymns_data.
for hymn_number in range(1, 854):
    # Construct the URL for the hymn page
    url = f"https://hymnal.pcusastore.com/Hymn/Index/{hymn_number}"
    browser.visit(url)

    # Scrape the hymn text
    html = browser.html
    html_soup = soup(html, 'html.parser')
    main_content = html_soup.find('div', class_='mainContent')
    if main_content is None:
        # find() returns None when the div is absent; skip this page rather than
        # crash and lose everything scraped so far.
        missing_hymns.append(hymn_number)
        continue
    text_lines = ' '.join(main_content.stripped_strings)

    # Store the hymn number and text in a dictionary, then add it to the list
    hymns_data.append({'HymnNumber': hymn_number, 'Text': text_lines})

if missing_hymns:
    print("No mainContent found for hymn numbers:", missing_hymns)

# Once all hymns are processed, create a DataFrame
df_hymns = pd.DataFrame(hymns_data)

In [None]:
# Persist the raw scrape to disk (no index column) so cleaning can run later
# without repeating the crawl.
df_hymns.to_csv(
    path_or_buf='./GitHub/hymnal-web-scraping/hymns.csv',
    index=False,
)

In [None]:
# Read the raw (uncleaned) scrape back from disk into its own DataFrame
new_df_hymns = pd.read_csv(
    filepath_or_buffer='./GitHub/hymnal-web-scraping/hymns.csv',
)

# Define a function to extract text between "Lyrics" and "Informational Notes"
def extract_lyrics(text):
    """Return the text between the first 'Lyrics' marker and 'Informational Notes'.

    Parameters
    ----------
    text : str
        Full page text of one hymn. Non-string values (for example the float NaN
        that pandas.read_csv produces for an empty cell) are treated as having
        no lyrics.

    Returns
    -------
    str
        The whitespace-stripped text following the first 'Lyrics' marker, cut at
        the next 'Informational Notes' marker when one exists. Returns '' when
        `text` is not a string or contains no 'Lyrics' marker.
    """
    # An empty 'Text' cell comes back from CSV as NaN, which has no string methods.
    if not isinstance(text, str):
        return ''

    # Split once on the first 'Lyrics'; an empty separator means it was not found.
    _, lyrics_marker, after_marker = text.partition('Lyrics')
    if not lyrics_marker:
        return ''

    # Keep everything before 'Informational Notes'. If that marker is absent,
    # partition() returns the whole remainder, i.e. the text through to the end.
    lyrics, _, _ = after_marker.partition('Informational Notes')
    return lyrics.strip()

# Run extract_lyrics over each value of the 'Text' column and keep the result
# in a new 'CleanedText' column.
new_df_hymns['CleanedText'] = new_df_hymns['Text'].map(extract_lyrics)

# new_df_hymns now holds the raw 'Text' and the extracted 'CleanedText' side by side

In [None]:
# Write the frame, including the new 'CleanedText' column, to a separate CSV
# so the raw export is left untouched.
new_df_hymns.to_csv(
    path_or_buf='./GitHub/hymnal-web-scraping/cleaned_hymns.csv',
    index=False,
)

In [7]:
# Close the browser's connection.
# Ends the Chrome session started by Browser('chrome'); run this only after all
# scraping cells have finished, since they use the same `browser` object.
browser.quit()