In [1]:
import csv
import pandas as pd
import re

from collections import defaultdict

In [2]:
def is_valid_word(token):
    """Return ``token`` reduced to its word characters, or ``''`` if none remain.

    Every character that is not a word character (``\\w``), one of the letters
    thorn/yogh/eth listed in the pattern, or a hyphen is removed. Hyphens are
    then stripped from both ends, so interior hyphens of compounds are kept
    while a token made only of dashes yields an empty string.

    Args:
        token: A raw piece of text, typically one whitespace-delimited token.

    Returns:
        The cleaned string. Callers use its truthiness to decide whether the
        token counts as a word.
    """
    cleaned = re.sub(r'[^\wþȝð-]', '', token)
    # A hyphen is only meaningful inside a compound; without this a stray "-"
    # (or "word-") would be reported as a word of its own.
    return cleaned.strip('-')

In [3]:
def analyze_middle_english_text(filepath):
    """Build a word index (frequency plus containing lines) for a text file.

    The file is read as UTF-8. Each line is lower-cased and split on runs of
    whitespace and on double hyphens; every piece is passed through
    ``is_valid_word`` and pieces that come back empty are ignored.

    Args:
        filepath: Path of the text file to index.

    Returns:
        A list of 4-item rows ``[word, frequency, line_number, line_content]``
        ordered by word. The first row of a word carries the word and its
        total number of occurrences; every further line containing that word
        adds a row whose first two items are empty strings. Line numbers are
        1-based, line content is stripped of surrounding whitespace, and a
        line is listed once per word even if the word occurs in it repeatedly.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (raised by ``open``).
    """
    word_freq = defaultdict(int)
    word_lines = defaultdict(list)

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, 1):
            line_content = line.strip()
            for token in re.split(r'\s+|--', line.lower()):
                word = is_valid_word(token)
                if not word:
                    continue
                word_freq[word] += 1
                occurrences = word_lines[word]
                # Lines are visited in increasing order, so a repeat within the
                # current line can only match the most recent entry; checking
                # it alone avoids rescanning the whole list for every token.
                if not occurrences or occurrences[-1][0] != line_number:
                    occurrences.append((line_number, line_content))

    # Flatten to rows: one full row per word, then continuation rows with the
    # word and frequency cells left blank.
    output_rows = []
    for word in sorted(word_freq):
        (first_number, first_content), *remaining = word_lines[word]
        output_rows.append([word, word_freq[word], first_number, first_content])
        output_rows.extend(["", "", number, content] for number, content in remaining)

    return output_rows

In [4]:
def export_to_csv(data, output_path):
    """Write word-index rows to a UTF-8 CSV file and report the destination.

    Args:
        data: Iterable of 4-item rows matching the columns
            Word, Frequency, Line Number, Line Content.
        output_path: Destination path of the CSV file.
    """
    column_names = ['Word', 'Frequency', 'Line Number', 'Line Content']
    table = pd.DataFrame(data, columns=column_names)
    # index=False keeps the positional row index out of the file.
    table.to_csv(output_path, encoding='utf-8', index=False)
    print(f"Output saved to {output_path}")

In [5]:
# Driver cell: index the corpus file and preview the first 50 rows of the result.
if __name__ == "__main__":
    # Input text to index, and the CSV file name used when the export below is enabled.
    filepath = '../translated_emotions/corpus/full-sggk.txt'
    output_csv = 'word_occurrences.csv'

    try:
        csv_output = analyze_middle_english_text(filepath)
        df = pd.DataFrame(csv_output, columns=['Word', 'Frequency', 'Line Number', 'Line Content'])
        # display is not imported in this file; presumably the IPython/Jupyter built-in.
        display(df.head(50))

        # Export is currently disabled; uncomment the call to write the rows to output_csv.
        # export_to_csv(csv_output, output_csv)
    except FileNotFoundError:
        # analyze_middle_english_text opens filepath, so a missing file surfaces here.
        print("File not found. Please check the filepath.")

Unnamed: 0,Word,Frequency,Line Number,Line Content
0,a,271.0,28,"Þat a selly in siȝt summe men hit holden,"
1,,,59,So hardy a here on hille.
2,,,76,"Smal sendal bisides, a selure hir ouer"
3,,,83,A semloker þat euer he syȝe
4,,,92,Vpon such a dere day er hym deuised were
5,,,110,And Agrauayn a la dure mayn on þat oþer syde s...
6,,,134,"For vneþe watz þe noyce not a whyle sesed,"
7,,,152,"A strayte cote ful streȝt, þat stek on his sides,"
8,,,153,"A meré mantile abof, mensked withinne"
9,,,175,"A grene hors gret and þikke,"
