<a href="https://colab.research.google.com/github/clotpoledollophead/SGGK-project/blob/main/middle_english_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install cltk  # analyzes historical and classical texts, including Middle English
%pip install nltk 
%pip install spacy  # advanced text processing (can be customized for Middle English).
%pip install unidecode  # normalizing text by converting non-standard characters to their closest ASCII equivalents
%pip install gensim  # topic modeling, text similarity analysis, and other advanced linguistic tasks
%pip install scikit-learn
%pip install matplotlib seaborn
%pip install pandas
%pip install pytesseract  # enabling digitization of scanned texts (OCR)

In [20]:
import cltk
import nltk
import spacy as sp
import unidecode as ud
import gensim as gm
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pytesseract as pt
from collections import Counter
import requests
import os

In [6]:
text_files = ["corpus/passus-i-sggk.txt", "corpus/passus-ii-sggk.txt", "corpus/passus-iii-sggk.txt", "corpus/passus-iv-sggk.txt"]
STOPWORDS_URL = "https://raw.githubusercontent.com/cltk/cltk/master/src/cltk/stops/enm.py"

In [7]:
def fetch_stopwords(url):
    response = requests.get(url)
    response.raise_for_status()
    content = response.text

    start_marker = "STOPS: list[str] = ["
    start_idx = content.index(start_marker) + len(start_marker)
    end_idx = content.index("]", start_idx)
    stopwords_content = content[start_idx:end_idx].strip()

    stopwords = [
        word.strip().strip('"')
        for word in stopwords_content.split(",")
        if word.strip()
    ]
    return stopwords

In [8]:
def process_file(filepath, stopwords):
    with open(filepath, "r", encoding="utf-8") as f:
        text = f.read().lower()

    tokens = text.split()

    word_counts_with_stopwords = Counter(tokens)

    tokens_no_stops = [word for word in tokens if word not in stopwords]
    word_counts_without_stopwords = Counter(tokens_no_stops)

    return word_counts_with_stopwords, word_counts_without_stopwords


In [22]:
def to_dataframe(counter):
    df = pd.DataFrame(counter.items(), columns = ['Word', 'Count'])
    df.sort_values(by = 'Count', ascending = False, inplace = True)
    return df

In [27]:
stopwords = fetch_stopwords(STOPWORDS_URL)
print(f"Fetched {len(stopwords)} stop words.")

total_with_stopwords = Counter()
total_without_stopwords = Counter()

dfs_with_stopwords = {}
dfs_without_stopwords = {}

for filepath in text_files:
    print(f"Processing: {filepath}")
    counts_with, counts_without = process_file(filepath, stopwords)
    
    total_with_stopwords.update(counts_with)
    total_without_stopwords.update(counts_without)

    file_name = os.path.splitext(os.path.basename(filepath))[0]
    dfs_with_stopwords[file_name] = to_dataframe(counts_with)
    dfs_with_stopwords
    dfs_without_stopwords[file_name] = to_dataframe(counts_without)
    dfs_without_stopwords

total_with_df = to_dataframe(total_with_stopwords)
total_without_df = to_dataframe(total_without_stopwords)
total_with_df
total_without_df

Fetched 275 stop words.
Processing: corpus/passus-i-sggk.txt
Processing: corpus/passus-ii-sggk.txt
Processing: corpus/passus-iii-sggk.txt
Processing: corpus/passus-iv-sggk.txt


Unnamed: 0,Word,Count
3,watz,180
64,ful,150
107,bot,112
152,so,110
132,ȝe,88
...,...,...
2947,fynde.,1
682,þer-vnder,1
2949,negh,1
2950,inwyth,1
