This notebook takes as input a folder containing .txt Hathi book files. It replaces newline characters with spaces so that fasttext can process the text, strips the first and last 8% of each book's pages (as a rough front/back matter trimmer), fuses sentences of n or fewer characters with their neighbors, and applies fasttext English detection to each sentence. It also calculates each sentence's distance from the center of the book.

The final output is a tsv file that has the following columns:


*   Filename
*   previous two sentences
*   sentence (the sentence we are interested in)
*   next two sentences
*   whether or not fasttext determined the sentence was English (T/F)
*   the English probability for the sentence
*   The non-English language with the highest fasttext probability
*   The probability for that non-English language (this will exceed the English probability whenever the sentence is marked False for English)
*   Center-distance


In [1]:
# Mount Google Drive at /content/drive; every folder path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import nltk
# Download NLTK's Punkt tokenizer data (the sentence tokenizer imported on the next line is used further down).
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
#Function to remove the first and last 8% of pages (pages are delimited by <pb>) from each book

def remove_header_footer(input_folder, output_folder, trim_fraction=0.08):
    """Trim the leading and trailing pages of every .txt book in a folder.

    Each book is split into pages on the literal '<pb>' marker. The first
    ``int(n_pages * trim_fraction)`` pages and the same number of pages at the
    end are dropped as a rough front/back-matter filter. The remaining pages
    are re-joined with '<pb>' and written to ``output_folder`` under the same
    filename. Books so short that the page count times ``trim_fraction`` is
    below 1 are copied unchanged.

    Args:
        input_folder: Directory containing the source .txt files. Files with
            any other extension are ignored.
        output_folder: Directory for the trimmed copies; created if missing.
        trim_fraction: Fraction of pages removed from EACH end of the book.
            Defaults to 0.08 (the original hard-coded value). Must satisfy
            0 <= trim_fraction < 0.5 so that something is left to keep.

    Raises:
        ValueError: If ``trim_fraction`` is outside the range [0, 0.5).
    """
    if not 0 <= trim_fraction < 0.5:
        raise ValueError("trim_fraction must be in the range [0, 0.5)")

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_folder, exist_ok=True)

    for file in os.listdir(input_folder):
        if not file.endswith('.txt'):
            continue

        with open(os.path.join(input_folder, file), 'r', encoding="utf-8") as src:
            pages = src.read().split('<pb>')

        # Same count is removed from both ends; int() truncates toward zero.
        n_trim = int(len(pages) * trim_fraction)
        kept_pages = pages[n_trim:len(pages) - n_trim]

        with open(os.path.join(output_folder, file), 'w', encoding="utf-8") as dst:
            dst.write('<pb>'.join(kept_pages))

# Source folder of book .txt files, and the destination folder that receives the page-trimmed copies.
input_folder_path = "/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/headerless_all"
output_folder_path = "/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/headerless_all_nofrontback"
remove_header_footer(input_folder_path, output_folder_path)

In [4]:
# An extra preprocessing function that replaces newline characters with spaces. This is necessary because fasttext otherwise raises an error about the \n.

def preprocess_text(text):
    """Return ``text`` with every run of newline characters replaced by one space.

    Other whitespace is left untouched: runs of spaces are not collapsed and
    the ends of the text are not stripped.
    """
    # A run of one or more "\n" becomes a single " " in one pass.
    return re.sub(r'\n+', ' ', text)

In [5]:
# Running the preprocessing and saving the result

import re

def process_files_in_folder(folder_path):
    """Rewrite every .txt file in ``folder_path`` in place via ``preprocess_text``.

    Each file is read as UTF-8, passed through ``preprocess_text``, and then
    overwritten with the result. Entries without a .txt extension are skipped.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith(".txt"):
            continue

        txt_path = os.path.join(folder_path, entry)

        with open(txt_path, 'r', encoding='utf-8') as reader:
            cleaned = preprocess_text(reader.read())

        # The source file itself is replaced; no separate output copy is made.
        with open(txt_path, 'w', encoding='utf-8') as writer:
            writer.write(cleaned)

# Specify the path to your folder
# NOTE: the .txt files in this folder are overwritten in place by process_files_in_folder.
folder_path = "/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/headerless_all_nofrontback"

process_files_in_folder(folder_path)

In [6]:
# This function takes sentences as input and, if a sentence is 5 characters or fewer (the default threshold),
# fuses it with the next sentence, or with the preceding sentence when there is no next one.

def fuse_one_word_sentences(sentences, threshold=5):
    """Merge very short sentences into a neighboring sentence.

    A sentence counts as short when ``len(sentence) <= threshold``. A run of
    consecutive short sentences is joined with single spaces and attached to
    the front of the next longer sentence. A run of short sentences at the
    very end of the input is appended to the last emitted sentence instead,
    or emitted on its own when nothing has been emitted yet.

    Args:
        sentences: List of sentence strings.
        threshold: Maximum length (in characters) that still counts as short.

    Returns:
        A new list of sentences; the input list is not modified.
    """
    merged = []
    pending = []  # consecutive short sentences waiting for a longer neighbor

    for sentence in sentences:
        if len(sentence) <= threshold:
            pending.append(sentence)
        elif pending:
            # The longer sentence closes the run and absorbs it as a prefix.
            pending.append(sentence)
            merged.append(" ".join(pending))
            pending = []
        else:
            merged.append(sentence)

    if pending:
        # Trailing short run: no following sentence exists, so fall back to
        # the previous output sentence when there is one.
        tail = " ".join(pending)
        if merged:
            merged[-1] += " " + tail
        else:
            merged.append(tail)

    return merged

#try:
# Quick sanity check on hand-made fragments; the fused list is displayed as the cell output.
sentences = ["hi.", "hi.             ", "here's another.", "And...", "What about a third sentence.", ".", ",", ".", "9          4", "9.........4", "Now", "Now?", "Now!!", "But not now."]
fuse_one_word_sentences(sentences)

['hi. hi.             ',
 "here's another.",
 'And...',
 'What about a third sentence.',
 '. , . 9          4',
 '9.........4',
 'Now Now? Now!! But not now.']

In [7]:
# The new fasttext function. This function checks whether the first label (labels[0]) is English or not,
# for a T/F boolean. It also looks for the English label and assigns the probability for that label to
# "english_prob". Finally, it looks for the top probability of a non-English
# label, and assigns that label and probability to the non-English language/probability feature.

# NOTE(review): the install is unpinned; the recorded run below resolved fasttext 0.9.3 -- consider pinning for reproducibility.
!pip install fasttext
import fasttext

# Downloading and loading fasttext model
# The model file is saved as lid.176.bin in the current working directory.
!wget -O lid.176.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
# `model` is a module-level global; find_english_probability reads it directly.
model = fasttext.load_model('lid.176.bin')

def find_english_probability(sentence):
    """Score one sentence with the global fasttext ``model``.

    Args:
        sentence: Text to classify, passed straight to ``model.predict``.

    Returns:
        A 4-tuple ``(is_english, english_probability, top_non_english_lang,
        top_non_english_probability)``:

        * ``is_english`` -- True when the first returned label is
          '__label__en' (presumably the highest-ranked prediction; confirm
          against the fasttext documentation).
        * ``english_probability`` -- probability attached to '__label__en',
          or 0.0 if that label is not among the predictions.
        * ``top_non_english_lang`` / ``top_non_english_probability`` -- the
          non-English label with the largest probability and that
          probability, or ``''`` and 0.0 when no non-English label has a
          probability above 0.
    """
    english_label = '__label__en'

    # k=-1 asks for every label rather than only the best one.
    labels, probabilities = model.predict(sentence, k=-1)
    scored = list(zip(labels, probabilities))

    english_hits = [score for lang, score in scored if lang == english_label]
    english_probability = english_hits[-1] if english_hits else 0.0

    top_non_english_lang, top_non_english_probability = '', 0.0
    foreign = [(lang, score) for lang, score in scored if lang != english_label]
    if foreign:
        # max() keeps the earliest entry on ties, like a strict ">" scan does.
        best_lang, best_score = max(foreign, key=lambda pair: pair[1])
        if best_score > 0.0:
            top_non_english_lang, top_non_english_probability = best_lang, best_score

    is_english = labels[0] == english_label

    return is_english, english_probability, top_non_english_lang, top_non_english_probability


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.1-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.1-py3-none-any.whl (238 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4246766 sha256=46e8c151b88f97c29853186d9e3ec39fd4cf473e896bca2d2613c190d9f2ab9e
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [8]:
#Here's the master function that takes a folder with txt files that have had newline characters
# removed and 8% removed. The function tokenizes by sentence, fuses short sentences, and then
# applies fasttext. It also calculates
# the sentence's center distance (distance from the center of the book). And it gives the preceding TWO
# sentences and next TWO sentences, relative to the sentence in question. It outputs the
# first 9 columns of the tsv.

################################################# TEST #########################################

def find_noneng_sent(input_folder):
    """Run language detection over every sentence of every .txt book in a folder.

    For each .txt file in ``input_folder`` the text is split with
    ``sent_tokenize``, short sentences are merged by
    ``fuse_one_word_sentences``, and each resulting sentence is scored with
    ``find_english_probability``.

    Args:
        input_folder: Directory containing the preprocessed .txt files.

    Returns:
        A list holding one 9-tuple per sentence, in this order:
        filename, previous two sentences, the sentence, next two sentences,
        is_english, english_probability, non_english_lang,
        non_english_probability, center_distance.

        The context fields are the neighboring sentences joined by a space
        and stripped; they hold fewer than two sentences (or are empty) near
        the start or end of a book. ``center_distance`` is
        ``abs(i / n - 0.5)`` for the sentence at index ``i`` among the
        ``n`` fused sentences of its book.
    """
    rows = []

    for name in os.listdir(input_folder):
        if not name.endswith('.txt'):
            continue

        with open(os.path.join(input_folder, name), 'r', encoding="utf-8") as handle:
            units = fuse_one_word_sentences(sent_tokenize(handle.read()))
        n_units = len(units)

        for idx, unit in enumerate(units):
            lang_flag, en_prob, other_lang, other_prob = find_english_probability(unit)

            # Slices clamp at the list edges, so missing neighbors simply drop out.
            before = " ".join(units[max(idx - 2, 0):idx]).strip()
            after = " ".join(units[idx + 1:idx + 3]).strip()

            # Relative position in the book, measured from the midpoint.
            offset = abs((idx / n_units) - 0.5)

            rows.append((name, before, unit, after, lang_flag, en_prob, other_lang, other_prob, offset))

    return rows

# Running this on the folder of hathi txt files
# `results` holds one tuple per sentence and is what the TSV-writing cell consumes.
input_folder = "/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/headerless_all_nofrontback"
results = find_noneng_sent(input_folder)


In [9]:
#Writing the results to a tsv

import csv

# newline='' leaves line-ending handling to the csv writer, as the csv module documentation recommends.
with open("/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/all_hathi_sents_macro1.tsv", 'w', newline='', encoding="utf-8") as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    # Header row: one name per element of the tuples stored in `results`.
    writer.writerow(['file', 'prev_sent', 'sent', 'next_sent', 'is_english', 'english_prob', 'non_english_lang', 'non_english_prob', 'center_dist'])
    writer.writerows(results)


Function below is old and shouldn't be used, but I'm keeping it for reference's sake!

In [None]:
#Here's the master function that takes a folder with txt files that have had newline characters removed and 8% removed. The function
# tokenizes by sentence, fuses short sentences, and then applies fasttext. It outputs the first 8 columns of the tsv.

# DEPRECATED: legacy version kept for reference only.
# WARNING: it has the same name as the current find_noneng_sent defined earlier in
# this notebook, so executing this cell replaces that newer definition.
def find_noneng_sent(input_folder):
    """Legacy sentence scorer: one sentence of context per side, no center distance.

    Reads every .txt file in ``input_folder``, tokenizes it with
    ``sent_tokenize``, merges short sentences with
    ``fuse_one_word_sentences`` and scores each sentence with
    ``find_english_probability``.

    Returns:
        A list of 8-tuples: filename, previous sentence, sentence, next
        sentence, is_english, english_probability, non_english_lang,
        non_english_probability. The previous/next fields are empty strings
        at the first/last sentence of a book.
    """
    all_sentences = []

    for file in os.listdir(input_folder):
        if file.endswith('.txt'):
            file_path = os.path.join(input_folder, file)
            with open(file_path, 'r', encoding="utf-8") as j:
                book = j.read()
                sentences = sent_tokenize(book)
                fused_sentences = fuse_one_word_sentences(sentences)

                for i, sent in enumerate(fused_sentences):
                    is_english, english_probability, non_english_lang, non_english_probability = find_english_probability(sent)
                    # Only the immediately adjacent sentences are kept as context.
                    prev_sent = fused_sentences[i-1] if i > 0 else ""
                    next_sent = fused_sentences[i+1] if i < len(fused_sentences)-1 else ""
                    all_sentences.append((file, prev_sent, sent, next_sent, is_english, english_probability, non_english_lang, non_english_probability))

    return all_sentences

# Example usage
# NOTE: running this rebinds the module-level `input_folder` and `results` used by earlier cells.
input_folder = "/content/drive/MyDrive/UIUC_Summer2024/RA_Underwood/GPT1914/headerless_test_nofrontback"
results = find_noneng_sent(input_folder)