<a href="https://colab.research.google.com/github/clotpoledollophead/SGGK-project/blob/main/middle_english_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install cltk  # analyzes historical and classical texts, including Middle English
%pip install nltk  # Natural Language Toolkit: text preprocessing, tokenization, and linguistic analysis
%pip install spacy  # advanced text processing (can be customized for Middle English).
%pip install unidecode  # normalizing text by converting non-standard characters to their closest ASCII equivalents
%pip install gensim  # topic modeling, text similarity analysis, and other advanced linguistic tasks
%pip install scikit-learn  # machine learning, text classification, and clustering analysis
%pip install matplotlib seaborn  # creates visualizations.
%pip install pandas  # handles structured data
%pip install pytesseract  # enabling digitization of scanned texts (OCR)

Collecting cltk
  Downloading cltk-1.4.0-py3-none-any.whl.metadata (6.1 kB)
Collecting boltons<22.0.0,>=21.0.0 (from cltk)
  Downloading boltons-21.0.0-py2.py3-none-any.whl.metadata (1.5 kB)
Collecting greek-accentuation<2.0.0,>=1.2.0 (from cltk)
  Downloading greek_accentuation-1.2.0-py2.py3-none-any.whl.metadata (669 bytes)
Collecting rapidfuzz<4.0.0,>=3.4.0 (from cltk)
  Downloading rapidfuzz-3.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting scipy<1.13.0 (from cltk)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting stanza==1.9.2 (from cltk)
  Downloading stanza-1.9.2-py3-none-any.whl.metadata (13 kB)
Collecting stringcase<2.0,>=1.2 (from cltk)
  Downloading stringcase-1.2.0.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji 

In [None]:
import cltk
import nltk
import spacy as sp
import unidecode as ud
import gensim as gm
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pytesseract as pt
from google.colab import files
from collections import Counter

In [None]:
uploaded = files.upload()

Saving passus-iii-sggk.txt to passus-iii-sggk.txt
Saving passus-ii-sggk.txt to passus-ii-sggk.txt
Saving passus-i-sggk.txt to passus-i-sggk.txt
Saving passus-iv-sggk.txt to passus-iv-sggk.txt


In [None]:
p = PunktLanguageVars()
pipeline = MiddleEnglishPipeline()

In [None]:
individual_counts_with_stopwords = {}
individual_counts_without_stopwords = {}
total_with_stopwords = Counter()
total_without_stopwords = Counter()

In [None]:
for file_name, file_content in uploaded.items():
    text = file_content.decode('utf-8')

    processed = pipeline.run(text)

    word_counts_with_stopwords = Counter(processed.tokens)
    word_counts_without_stopwords = Counter(processed.filtered_tokens)

    individual_counts_with_stopwords[file_name] = word_counts_with_stopwords
    individual_counts_without_stopwords[file_name] = word_counts_without_stopwords

    total_with_stopwords.update(word_counts_with_stopwords)
    total_without_stopwords.update(word_counts_without_stopwords)

AttributeError: 'MiddleEnglishPipeline' object has no attribute 'run'

In [None]:
for file_name in uploaded.keys():
    print(f"\nWord Counts for {file_name} (Including Stopwords):")
    print(individual_counts_with_stopwords[file_name])

    print(f"\nWord Counts for {file_name} (Excluding Stopwords):")
    print(individual_counts_without_stopwords[file_name])

print("\nTotal Word Counts (Including Stopwords):")
print(total_with_stopwords)

print("\nTotal Word Counts (Excluding Stopwords):")
print(total_without_stopwords)

In [8]:
from cltk import NLP
from cltk.core.data_types import Doc
from collections import Counter
from google.colab import files
import re
import os
import pandas as pd
from cltk.utils.file_operations import open_pickle
from cltk.utils import CLTK_DATA_DIR
import os.path

class MiddleEnglishFileAnalyzer:
    def __init__(self):
        # Initialize NLP pipeline for Middle English
        self.nlp = NLP(language="enm")

        # Load CLTK stopwords for Middle English
        self.stops = self.load_cltk_stopwords()
        print(f"Loaded {len(self.stops)} stopwords from CLTK")

    def load_cltk_stopwords(self):
        """Load stopwords from CLTK data directory."""
        try:
            # Try to load Middle English stopwords
            stopwords_path = os.path.join(
                CLTK_DATA_DIR,
                'enm',
                'stops',
                'stops_enm.json'
            )

            if os.path.exists(stopwords_path):
                with open(stopwords_path, 'r', encoding='utf-8') as f:
                    stops = set(f.read().splitlines())
            else:
                # Fallback to pre-defined list if file not found
                stops = {
                    "the", "and", "of", "to", "in", "that", "for", "with", "is", "was",
                    "it", "this", "but", "they", "at", "by", "from", "be", "as", "on",
                    "not", "so", "what", "all", "were", "we", "when", "your", "said", "there",
                    "an", "my", "his", "their", "her", "he", "she", "or", "which", "me",
                    "him", "them", "such", "some", "upon", "ye", "thee", "thy", "thou"
                }
                print("Warning: Could not find CLTK stopwords, using fallback list")

            return stops

        except Exception as e:
            print(f"Error loading CLTK stopwords: {e}")
            print("Using fallback stopword list")
            return {
                "the", "and", "of", "to", "in", "that", "for", "with", "is", "was",
                "it", "this", "but", "they", "at", "by", "from", "be", "as", "on",
                "not", "so", "what", "all", "were", "we", "when", "your", "said", "there",
                "an", "my", "his", "their", "her", "he", "she", "or", "which", "me",
                "him", "them", "such", "some", "upon", "ye", "thee", "thy", "thou"
            }

    def clean_text(self, text):
        """Clean and normalize text."""
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def tokenize(self, text):
        """Tokenize text using CLTK's pipeline."""
        doc = self.nlp.analyze(text=self.clean_text(text))
        return [token.text.lower() for token in doc.tokens]

    def analyze_file(self, file_path):
        """Analyze a single file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            tokens = self.tokenize(text)

            # Count all words
            word_counts = Counter(tokens)

            # Count stopwords
            stopword_counts = Counter(word for word in tokens if word in self.stops)

            # Count non-stopwords
            non_stopword_counts = Counter(word for word in tokens if word not in self.stops)

            return {
                'filename': os.path.basename(file_path),
                'total_words': len(tokens),
                'unique_words': len(word_counts),
                'stopwords': dict(stopword_counts.most_common()),
                'non_stopwords': dict(non_stopword_counts.most_common()),
                'stopword_ratio': len([w for w in tokens if w in self.stops]) / len(tokens) if tokens else 0
            }

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            return None

    def create_summary_dataframe(self, results):
        """Create a pandas DataFrame summarizing the analysis results."""
        summary_data = []
        for result in results:
            if result:
                summary_data.append({
                    'Filename': result['filename'],
                    'Total Words': result['total_words'],
                    'Unique Words': result['unique_words'],
                    'Stopword Count': sum(result['stopwords'].values()),
                    'Non-stopword Count': sum(result['non_stopwords'].values()),
                    'Stopword Ratio': f"{result['stopword_ratio']:.2%}"
                })
        return pd.DataFrame(summary_data)

def analyze_uploaded_files():
    """Handle file uploads and analysis in Colab."""
    print("Please upload your text files...")
    uploaded = files.upload()

    analyzer = MiddleEnglishFileAnalyzer()
    results = []
    total_stopwords = Counter()
    total_non_stopwords = Counter()

    # Process each uploaded file
    for filename in uploaded.keys():
        result = analyzer.analyze_file(filename)
        if result:
            results.append(result)
            total_stopwords.update(result['stopwords'])
            total_non_stopwords.update(result['non_stopwords'])

    # Create summary DataFrame
    summary_df = analyzer.create_summary_dataframe(results)

    # Display summary statistics
    print("\n=== Summary Statistics ===")
    print(summary_df.to_string())

    # Display total stopword counts
    print("\n=== Most Common Stopwords Across All Files ===")
    for word, count in total_stopwords.most_common(10):
        print(f"{word}: {count}")

    # Display top non-stopwords
    print("\n=== Most Common Non-stopwords Across All Files ===")
    for word, count in total_non_stopwords.most_common(10):
        print(f"{word}: {count}")

    return results, summary_df, analyzer.stops

# Run the analysis
if __name__ == "__main__":
    results, summary_df, stopwords = analyze_uploaded_files()

ImportError: cannot import name 'Stops' from 'cltk.stops' (/usr/local/lib/python3.10/dist-packages/cltk/stops/__init__.py)