# Creating a Hawaiian Pidgin-English Parallel Corpus 

---

## Import packages

---

In [1]:
import pprint as pp
import re
import pandas as pd
import os
import requests
from bs4 import BeautifulSoup
import numpy as np

## Scraping verses from [biblegateway.com](https://www.biblegateway.com/)

---

According to [Christodouloupoulos and Steedman (2014)](https://link.springer.com/article/10.1007/s10579-014-9287-y), the King James Version of the Bible is the oldest translation, and older translations tend to be more literal. Therefore, I chose this translation for the English dataset.

In [2]:
# Bible Gateway version code for the English side of the corpus
translation = "KJV"

# Number of chapters in each New Testament book, keyed by the book name
# that is placed in the search URL. Insertion order is the scraping order.
book_chp_dict = dict([
    ("Matthew", 28),
    ("Mark", 16),
    ("Luke", 24),
    ("John", 21),
    ("Acts", 28),
    ("Romans", 16),
    ("1 Corinthians", 16),
    ("2 Corinthians", 13),
    ("Galatians", 6),
    ("Ephesians", 6),
    ("Philippians", 4),
    ("Colossians", 4),
    ("1 Thessalonians", 5),
    ("2 Thessalonians", 3),
    ("1 Timothy", 6),
    ("2 Timothy", 4),
    ("Titus", 3),
    ("Philemon", 1),
    ("Hebrews", 13),
    ("James", 5),
    ("1 Peter", 5),
    ("2 Peter", 3),
    ("1 John", 5),
    ("2 John", 1),
    ("3 John", 1),
    ("Jude", 1),
    ("Revelation", 22),
])

# Raw <p> elements from every chapter, collected in book/chapter order.
all_text = []

# A single Session reuses the connection across the per-chapter requests.
with requests.Session() as session:
    for book, chapter_max in book_chp_dict.items():
        for chapter_num in range(1, chapter_max + 1):
            # Example: https://www.biblegateway.com/passage/?search=Matthew+1&version=KJV&interface=print
            # Passing the query through `params` lets requests URL-encode
            # book names that contain spaces (e.g. "1 Corinthians").
            page = session.get(
                "https://www.biblegateway.com/passage/",
                params={
                    "search": f"{book} {chapter_num}",
                    "version": translation,
                    "interface": "print",
                },
                timeout=30,  # requests waits forever unless a timeout is given
            )
            # Stop on an HTTP error instead of silently parsing an error page
            # and ending up with a chapter that has no verses.
            page.raise_for_status()

            soup = BeautifulSoup(page.text, "html.parser")

            chapter_text = soup.find_all("p")  # All verses are in <p>
            all_text.extend(chapter_text)

# Alias consumed by the cleaning step below (same list object as all_text).
original_text = all_text


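To collect the Hawaiian Pidgin version, replace the ``translation`` variable and ``book_chp_dict`` dictionary with the following, then re-run the scraping and cleaning steps and save the result as ``hwp_bible_clean.csv`` instead of ``kjv_bible_clean.csv``: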

In [3]:
# Bible Gateway version code for the Hawaiian Pidgin side of the corpus
translation = "HWP"

# Number of chapters per book, keyed by the Hawaiian Pidgin book name
# that is placed in the search URL. Insertion order is the scraping order.
book_chp_dict = dict([
    ("Matthew", 28),
    ("Mark", 16),
    ("Luke", 24),
    ("John", 21),
    ("Jesus Guys", 28),
    ("Fo Da Rome Peopo", 16),
    ("Numba 1 Fo Da Corint Peopo", 16),
    ("Numba 2 Fo Da Corint Peopo", 13),
    ("Fo Da Galatia Peopo", 6),
    ("Fo Da Efesus Peopo", 6),
    ("Fo Da Philippi Peopo", 4),
    ("Fo Da Colosse Peopo", 4),
    ("Numba 1 Fo Da Tessalonika Peopo", 5),
    ("Numba 2 Fo Da Tessalonika Peopo", 3),
    ("1 Timoty", 6),
    ("2 Timoty", 4),
    ("Fo Titus", 3),
    ("Fo Filemon", 1),
    ("Fo Da Hebrew Peopo", 13),
    ("From James", 5),
    ("Numba 1 From Peter", 5),
    ("Numba 2 From Peter", 3),
    ("Numba 1 From John", 5),
    ("Numba 2 From John", 1),
    ("Numba 3 From John", 1),
    ("From Jude", 1),
    ("Wat Jesus Show John", 22),
])

## Cleaning the data

---

The code below splits the HTML of each ``<p>`` element at every ``>`` character, so that multiple verses that share the same ``<p>`` tag end up in separate fragments.

In [4]:
# Flatten every scraped element into the pieces obtained by cutting its
# HTML source at each ">" (the ">" itself is not kept in the pieces).
all_text_long = [
    piece
    for element in original_text
    for piece in str(element).split(">")
]

This is the start of preparing a two-column dataset: one column with **verse numbers** and one column with the actual **verses**. The verse numbers will be useful for joining the English and Hawaiian Pidgin datasets together later.

In [5]:
verse_num_list = []
verse_list = []
verse_temp = ""
verse_num = None  # identifier of the verse currently being read; None until the first one is seen

# Regex to find book-chapter-verse
verse_num_regex = re.compile(r"(\w+-\d+-(\d+))")

# Openings of the footnote / cross-reference <sup> tags that can trail a verse fragment
footnote_marker = '<sup class="footnote" data-fn="'
crossref_marker = '<sup class="crossreference" data-cr="'

# Remove surrounding whitespace, then drop a dangling "<sup" suffix only.
# str.rstrip("<sup") treats its argument as a set of characters, so it would
# also eat any trailing "s", "u", "p" or "<" that belongs to the fragment.
all_text_long = [x.strip() for x in all_text_long]
all_text_long = [x[:-len("<sup")] if x.endswith("<sup") else x for x in all_text_long]

# Removing beginning and ending tags
for text in all_text_long:
    test_text = str(text)

    # Opening <span class="text ..."> carries the verse identifier for the fragments that follow.
    if test_text[:17] == '<span class="text':
        result = verse_num_regex.search(test_text)
        # A span without a recognisable identifier leaves the current verse unchanged
        # instead of raising AttributeError on a None match.
        if result is not None:
            verse_num = result.group(0)
        continue

    # Text that shows up before any verse identifier cannot be attributed to a verse.
    if verse_num is None:
        continue

    if test_text[-6:] == "</span":
        verse_temp = test_text[:-6]
    elif footnote_marker in test_text:
        # Cut where the tag starts rather than a fixed number of characters from the end,
        # so tags of any length are removed completely and no verse text is lost.
        verse_temp = test_text[:test_text.index(footnote_marker)]
    elif crossref_marker in test_text:
        verse_temp = test_text[:test_text.index(crossref_marker)]
    else:
        continue

    verse_num_list.append(verse_num)
    verse_list.append(verse_temp)

In [6]:
# Map each curly quote to its plain ASCII counterpart in a single pass.
quote_table = str.maketrans({"“": '"', "”": '"', "’": "'", "‘": "'"})

# One row per extracted fragment; rows with missing values are discarded
# and every remaining cell is converted to a string.
kjv_df = (
    pd.DataFrame({"verse_num": verse_num_list, "verses": verse_list})
    .dropna()
    .astype(str)
)

# Replacing smart quotes with regular quotes, removing leftover "<sup"
# text and trimming surrounding whitespace
kjv_df["verses"] = (
    kjv_df["verses"]
    .str.translate(quote_table)
    .str.replace("<sup", "", regex=False)
    .str.strip()
)

This joins multi-line verses, such as those with many quotes or line breaks.

In [7]:
def _merge_verse_fragments(verse_nums, fragments):
    """Join consecutive fragments that share a verse number into one verse.

    Parameters
    ----------
    verse_nums : list
        Verse identifier of each fragment, in document order.
    fragments : list
        Text fragment belonging to the matching entry of ``verse_nums``.

    Returns
    -------
    tuple of (list, list)
        Verse identifiers and merged verse texts, one entry per run of
        equal consecutive identifiers. Fragments are joined with a single
        space and the result is stripped of surrounding whitespace.
    """
    merged_nums = []
    merged_verses = []
    current_num = None
    parts = []

    for position, (num, fragment) in enumerate(zip(verse_nums, fragments)):
        fragment = str(fragment)

        if position == 0 or num != current_num:
            # A new verse starts here: store the finished one first.
            if position != 0:
                merged_nums.append(current_num)
                merged_verses.append(" ".join(parts).strip())
            # The opening fragment always seeds the verse, including the
            # very first fragment of the data (previously it was dropped).
            current_num = num
            parts = [fragment]
        elif not fragment.isdigit():
            # Combine parts of same verse together; purely numeric
            # continuation fragments are skipped.
            parts.append(fragment)

    # Store the final verse as well; without this the last verse of the
    # data never reaches the output.
    if parts:
        merged_nums.append(current_num)
        merged_verses.append(" ".join(parts).strip())

    return merged_nums, merged_verses


verse_list = kjv_df["verses"].tolist()
verse_num_list = kjv_df["verse_num"].tolist()

verse_num_list_clean, verse_list_clean = _merge_verse_fragments(verse_num_list, verse_list)

kjv_df = pd.DataFrame({"verse_num": verse_num_list_clean,
                       "verses": verse_list_clean})
kjv_df.to_csv(r".\kjv_bible_clean.csv",
              index=False)

## Joining the datasets together

Once I ran the above steps with both the King James Version and Hawaiian Pidgin Version, I merged the two datasets on the verse numbers.

In [9]:
# Load the cleaned single-translation tables produced by the steps above.
hwp_df = pd.read_csv(r".\hwp_bible_clean.csv")
kjv_df = pd.read_csv(r".\kjv_bible_clean.csv")

# Keep only verses present in both translations, then discard the join key
# (verse numbers are no longer needed) and give the text columns short names.
full_df = (
    kjv_df
    .merge(hwp_df, how="inner", on="verse_num", suffixes=('_eng', '_hwp'))
    .drop(columns="verse_num")
    .rename(columns={"verses_eng": "eng", "verses_hwp": "hwp"})
)

full_df.to_csv(r".\full_data.csv", index=False)