In [1]:
import re
import os
from collections import Counter
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import numpy as np
import chardet

In [40]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Error downloading 'punkt' from
[nltk_data]     <https://raw.githubusercontent.com/nltk/nltk_data/gh-
[nltk_data]     pages/packages/tokenizers/punkt.zip>:   <urlopen error
[nltk_data]     [Errno -3] Temporary failure in name resolution>
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

By building a preprocessing text function we can ensure all text being used will be all in a uniform manner. Some of the text, when scraped, gets their spaces excluded. We can use a dictionary-based approach or natural language processing (**NLP**) techniques. However, for a simple regex-based fix, we can try adding spaces based on common English word patterns.

In [41]:
import re

def fix_spacing(text):
    """
    Fixes missing spaces in a text:
    - Adds spaces before uppercase letters following lowercase letters.
    - Splits improperly joined words.
    - Handles cases where words are merged without uppercase hints.
    """
    words = text.split()  # Split into words based on existing spaces
    fixed_words = []

    for word in words:
        if len(word) > 8:
            # Fix missing spaces before uppercase letters (e.g., "sinnersNor" → "sinners Nor")
            word = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', word)

            # Handle common merging issues (heuristic approach)
            word = re.sub(r'(\w)([A-Z][a-z])', r'\1 \2', word)  # "sinnersNor" → "sinners Nor"
            word = re.sub(r'([a-z])([A-Z])', r'\1 \2', word)  # "NorSitsIn" → "Nor Sits In"

            # Attempt to split words that might be missing spaces (fallback)
            word = re.sub(r'([a-z])([A-Za-z]{2,})', r'\1 \2', word)

        # Store the processed word
        fixed_words.append(word)

    # Rejoin the words into a corrected sentence
    return ' '.join(fixed_words)

# Example Test
text = "Blessed is the man Who walks not in the counsel of the ungodly Nor stands in the way of sinnersNorsitsintheseatofthetroublesome"
print(fix_spacing(text))


Blessed is the man Who walks not in the counsel of the ungodly Nor stands in the way of s inners No rsitsintheseatofthetroublesome


In [42]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Convert to lowercase
    tokens = [word.lower() for word in tokens]
    # Remove punctuation
    tokens = [word for word in tokens if word not in string.punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

After doing some preprocessing, the Psalms will be the easiest to access within a csv file. The following few code cels is going to work at putting the psalms in a CSV file. Since I am going to do this we are also going to separate the psalms by verse as well.

Creating a data frame to start to load the psalms into by their verses.

In [43]:
psalms_df = pd.DataFrame(columns=["tradition", "text", "psalm_num", "verse_num", "verse"])
psalms_df

Unnamed: 0,tradition,text,psalm_num,verse_num,verse


In [44]:
def organize_psalms(directory, trad, text, debug=True):
    psalm_num = 1
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            # Read a small part of the file to detect encoding, different file are being endcoded in a different way
            with open(os.path.join(directory, filename), 'rb') as file:
                raw_data = file.read(10000)  # Read first 10KB
                detected = chardet.detect(raw_data)
                encoding = detected['encoding']

            with open(os.path.join(directory, filename), 'r', encoding=encoding, errors="replace") as file:
                # While reading in a file, I am removing all line breaks
                psalm = file.read().replace("\n", " ").replace("\r", " ")
                # Handling words that are lacking their spaces.
                #psalm = add_missing_spaces(psalm)
                psalms_to_df(psalm, trad, text, psalm_num)
                # Debug statement to see if the code works for inputing the first psalm.
                #break
            psalm_num += 1

            if debug:
                break

In [45]:
def psalms_to_df(psalm, tradition, text, psalm_num):
    global psalms_df  # If you're using a global variable, declare it
    if 'psalms_df' not in globals():
        psalms_df = pd.DataFrame(columns=["tradition", "text", "psalm_num", "verse_num", "verse"])

    verses = psalm.split("[NEXT VERSE]")
    # Setting the verse number back to 1 incase it is something else.
    num = 1
    for verse in verses:
        new_verse = pd.DataFrame([{"tradition": tradition, "text": text, "psalm_num": psalm_num, "verse_num": num,
                                  "verse": verse}])
        
        # Adding the data of the new target verse to the existing psalms dataframe.
        psalms_df = pd.concat([psalms_df, new_verse])
        
        # incrementing the verse number
        num += 1
        
    # droping possible duplicates incase I have added the same data twice
    psalms_df.drop_duplicates()

    '''
    # Iterating through each verse to prefrom tasks on each verse.
        print(f"Verse {num}: {verse}.")
        num += 1
    '''

Splitting up psalms by verse and adding them to a data to further clean and gather data on them.
# Implementation
## Psalms from the Orthodox Bible

In [46]:
import os

os.getcwd()

'/opt/notebooks/website/scripts/cleaning'

In [47]:
organize_psalms("../../data/psalms/bible", "Orthodox", "Bible", debug=False)
# the above should give us 2722 rows of data
psalms_df

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,1,1,PSALM 152 1Alleluia Praise God in His saints P...
0,Orthodox,Bible,1,2,2 Praise Him for His mighty acts Praise Him a...
0,Orthodox,Bible,1,3,3 Praise Him with the sound of a trumpet Prai...
0,Orthodox,Bible,1,4,4 Praise Him with timbrel and dance Praise Hi...
0,Orthodox,Bible,1,5,5 Praise Him with resounding cymbals Praise H...
...,...,...,...,...,...
0,Orthodox,Bible,155,3,3 And whoshall tell my Lord The Lord Himself ...
0,Orthodox,Bible,155,4,4 He sent forth His Angel And took me from my...
0,Orthodox,Bible,155,5,5 My brothers were handsome and tall But the ...
0,Orthodox,Bible,155,6,6 I went out to meet the foreigner And he cur...


In [48]:
# Display all rows and columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(psalms_df)

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,1,1,PSALM 152 1Alleluia Praise God in His saints P...
0,Orthodox,Bible,1,2,2 Praise Him for His mighty acts Praise Him a...
0,Orthodox,Bible,1,3,3 Praise Him with the sound of a trumpet Prai...
0,Orthodox,Bible,1,4,4 Praise Him with timbrel and dance Praise Hi...
0,Orthodox,Bible,1,5,5 Praise Him with resounding cymbals Praise H...
0,Orthodox,Bible,1,6,6 Let everything that breathes praise the Lor...
0,Orthodox,Bible,2,1,PSALM 153 1This is a psalm written with Davids...
0,Orthodox,Bible,2,2,2 My hands built a musical instrument My fing...
0,Orthodox,Bible,2,3,3 And whoshall tell my Lord The Lord Himself ...
0,Orthodox,Bible,2,4,4 He sent forth His Angel And took me from my...


In [49]:
psalms_df = psalms_df.iloc[16:].reset_index(drop=True)


In [50]:
psalms_df

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,5,1,PSALM 1 Blessed is the man Who walks not in th...
1,Orthodox,Bible,5,2,2 But his will is in the law of the Lord And ...
2,Orthodox,Bible,5,3,3 He shall be like a tree Planted by streams ...
3,Orthodox,Bible,5,4,4 Not so are the ungodly not so But they are ...
4,Orthodox,Bible,5,5,5 Therefore the ungodly shall not rise in the...
...,...,...,...,...,...
2717,Orthodox,Bible,155,3,3 And whoshall tell my Lord The Lord Himself ...
2718,Orthodox,Bible,155,4,4 He sent forth His Angel And took me from my...
2719,Orthodox,Bible,155,5,5 My brothers were handsome and tall But the ...
2720,Orthodox,Bible,155,6,6 I went out to meet the foreigner And he cur...


Resetting the psalm numbering

In [51]:
psalms_df['psalm_num'] = psalms_df['psalm_num'] - 4


psalms_df

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,1,1,PSALM 1 Blessed is the man Who walks not in th...
1,Orthodox,Bible,1,2,2 But his will is in the law of the Lord And ...
2,Orthodox,Bible,1,3,3 He shall be like a tree Planted by streams ...
3,Orthodox,Bible,1,4,4 Not so are the ungodly not so But they are ...
4,Orthodox,Bible,1,5,5 Therefore the ungodly shall not rise in the...
...,...,...,...,...,...
2717,Orthodox,Bible,151,3,3 And whoshall tell my Lord The Lord Himself ...
2718,Orthodox,Bible,151,4,4 He sent forth His Angel And took me from my...
2719,Orthodox,Bible,151,5,5 My brothers were handsome and tall But the ...
2720,Orthodox,Bible,151,6,6 I went out to meet the foreigner And he cur...


## Psalms from and Orthodox Psalter 
<INSERT_LINK>

In [57]:
organize_psalms("../../data/psalms/psalter", "Orthodox", "Psalter", debug=False)
#psalms_df

In [58]:
# Display all rows and columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(psalms_df)

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,1,2,2 But his will is in the law of the Lord And ...
1,Orthodox,Bible,1,3,3 He shall be like a tree Planted by streams ...
2,Orthodox,Bible,1,4,4 Not so are the ungodly not so But they are ...
3,Orthodox,Bible,1,5,5 Therefore the ungodly shall not rise in the...
4,Orthodox,Bible,1,6,6 For the Lord knows the way of the righteous...
5,Orthodox,Bible,2,1,PSALM 2 Why do the nations rage And the people...
6,Orthodox,Bible,2,2,2 The kings of the earth set themselves And t...
7,Orthodox,Bible,2,3,3 Let us break their bands And cast away thei...
8,Orthodox,Bible,2,4,4 He who sits in the heavens shall laugh at t...
9,Orthodox,Bible,2,5,5 Then shall He speak to them in His wrath An...


In [56]:
# Display all rows and columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(psalms_df)

Unnamed: 0,tradition,text,psalm_num,verse_num,verse
0,Orthodox,Bible,1,2,2 But his will is in the law of the Lord And ...
1,Orthodox,Bible,1,3,3 He shall be like a tree Planted by streams ...
2,Orthodox,Bible,1,4,4 Not so are the ungodly not so But they are ...
3,Orthodox,Bible,1,5,5 Therefore the ungodly shall not rise in the...
4,Orthodox,Bible,1,6,6 For the Lord knows the way of the righteous...
5,Orthodox,Bible,2,1,PSALM 2 Why do the nations rage And the people...
6,Orthodox,Bible,2,2,2 The kings of the earth set themselves And t...
7,Orthodox,Bible,2,3,3 Let us break their bands And cast away thei...
8,Orthodox,Bible,2,4,4 He who sits in the heavens shall laugh at t...
9,Orthodox,Bible,2,5,5 Then shall He speak to them in His wrath An...


# Psalms, by verse, to CSV

In [54]:
# converting the file to a csv to do further steramline cleaning in another notebook or file. 
psalms_df.to_csv("../../data/csv/psalm_verse.csv", index=False)

In [31]:
'''import spacy

def add_spaces(text):
   nlp = spacy.load('en_core_web_sm')
   doc = nlp(text)
   words = []
   for token in doc:
      if not token.is_space:
         words.append(token.text)
      else:
         words.append(' ')
   return ''.join(words)

# Example usage
input_text = "Thisisatestsentencewithnospaces."
output_text = add_spaces(input_text)
print(output_text)
'''

'import spacy\n\ndef add_spaces(text):\n   nlp = spacy.load(\'en_core_web_sm\')\n   doc = nlp(text)\n   words = []\n   for token in doc:\n      if not token.is_space:\n         words.append(token.text)\n      else:\n         words.append(\' \')\n   return \'\'.join(words)\n\n# Example usage\ninput_text = "Thisisatestsentencewithnospaces."\noutput_text = add_spaces(input_text)\nprint(output_text)\n'

In [30]:
#spacy.load('en_core_web_sm')

# Playground Stuff

In [29]:
def analyze_psalms(directory):
    all_words = []
    for filename in os.listdir(directory):
        if filename.endswith('.txt'):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                tokens = preprocess_text(text)
                all_words.extend(tokens)


    # Counting word frequencies
    word_counts = Counter(all_words)

    # Returnig the 20 most common word
    return word_counts.most_common(20)