# TAHLR Week 9d: Text Classification Algorithms

Code notebook for the TAHLR course at ISAW (Fall 2023), including a streamlined example of notebook 9d.

In [None]:
# # installs
# # uncomment and install if necessary

# !python -m pip install lime
# !python -m pip install git+https://github.com/diyclassics/cltk_readers.git#egg=cltkreaders

In [None]:
# imports

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from lime.lime_text import LimeTextExplainer

from cltkreaders.grc import GreekTesseraeCorpusReader

In [None]:
# Utils script for working with AG text

import re
import html
import unicodedata


# Helper function for preprocessing
def preprocess(
    text,
    lower=True,
    normalize=True,
    punctuation=False,
    numbers=False,
    unhyphenate=False,
    remove_lines=False,
    remove_spaces=False,
    entities=False,
    diacriticals=True,
    fill=" ",
):
    """Clean a (Greek) text string for downstream tokenization.

    Steps are applied in this order; each flag describes what is *kept*:

    Args:
        text: The string to clean.
        lower: Lowercase the text.
        normalize: Apply Unicode NFC normalization (before the character
            filters and again at the end).
        punctuation: Keep punctuation. When False (default) every punctuation
            and miscellaneous symbol listed below is replaced with `fill`.
        numbers: Keep ASCII digits. When False (default) they are replaced
            with `fill`.
        unhyphenate: Join words broken across lines by deleting a hyphen,
            `»` or em dash (plus optional whitespace) followed by a newline.
        remove_lines: Replace newlines with single spaces.
        remove_spaces: Collapse every run of whitespace into `fill`.
        entities: Keep HTML entities. When False (default) they are unescaped.
        diacriticals: Keep diacritics. When False, combining marks (Unicode
            category Mn) are stripped.
        fill: Replacement string used for removed characters.

    Returns:
        The cleaned string, with runs of spaces collapsed to one space and
        leading/trailing whitespace stripped.
    """
    if not entities:
        text = html.unescape(text)

    if normalize:
        # Normalize *before* filtering: some code points are canonical
        # singletons of ASCII punctuation (e.g. Greek question mark U+037E -> ";").
        # Normalizing only at the end would re-introduce punctuation after
        # it had supposedly been removed.
        text = unicodedata.normalize("NFC", text)

    if unhyphenate:
        text = re.sub(r"[-»—]\s?\n", "", text)

    if lower:
        text = text.lower()

    if not punctuation:
        # Local names are used on purpose: the boolean `punctuation` flag must
        # not be overwritten by the character list.
        punct_chars = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
        misc_chars = "¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）"
        # Greek question mark and ano teleia, listed explicitly so they are
        # removed even when `normalize` is False.
        greek_chars = "\u037e\u0387"
        removable = misc_chars + punct_chars + greek_chars
        # Include the NFC form of every listed character so the filter still
        # matches after the early normalization step above.
        targets = set(removable) | set(unicodedata.normalize("NFC", removable))
        text = text.translate(str.maketrans(dict.fromkeys(targets, fill)))

    if not numbers:
        text = text.translate(str.maketrans(dict.fromkeys("0123456789", fill)))

    if remove_lines:
        text = " ".join(text.split("\n"))

    if remove_spaces:
        text = fill.join(text.split())

    if not diacriticals:
        # Decompose, drop combining marks, then recompose the remaining characters.
        decomposed = unicodedata.normalize("NFD", text)
        text = "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
        text = unicodedata.normalize("NFC", text)

    # Fix spacing
    text = re.sub(" +", " ", text)

    if normalize:
        # Lowercasing can in principle yield non-composed sequences; recompose.
        text = unicodedata.normalize("NFC", text)

    return text.strip()

In [None]:
# get corpus readers/files

GCR = GreekTesseraeCorpusReader()

# File ids per author, selected with the reader's `match` filter
# (presumably a match against the file name -- confirm in cltk_readers).
plato, aristotle, herodotus, thucydides = (
    GCR.fileids(match=author)
    for author in ('plato', 'aristotle', 'herodotus', 'thucydides')
)

# Materialize the sentences of each author's files as lists.
plato_sents, aristotle_sents, herodotus_sents, thucydides_sents = (
    list(GCR.sents(author_fileids))
    for author_fileids in (plato, aristotle, herodotus, thucydides)
)


In [None]:
# preprocess texts

def custom_preprocess(text):
    """Run `preprocess` with its default options, then strip diacritics.

    The cleaned text is decomposed (NFD), every combining mark (Unicode
    category 'Mn') is dropped, and the remainder is recomposed (NFC).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string without diacritics.
    """
    cleaned = preprocess(text)
    decomposed = unicodedata.normalize('NFD', cleaned)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return unicodedata.normalize('NFC', ''.join(base_chars))

# Only the first MAX_SENTS sentences of Plato and of Aristotle are kept.
# The cap is applied *before* preprocessing so that sentences beyond it are
# never processed; the resulting lists are the same as slicing afterwards.
MAX_SENTS = 5000

# NOTE: this cell rebinds the *_sents names, so re-running it preprocesses
# already-preprocessed text; re-run the corpus-loading cell first if needed.
plato_sents = [custom_preprocess(sent) for sent in plato_sents[:MAX_SENTS]]
aristotle_sents = [custom_preprocess(sent) for sent in aristotle_sents[:MAX_SENTS]]
herodotus_sents = [custom_preprocess(sent) for sent in herodotus_sents]
thucydides_sents = [custom_preprocess(sent) for sent in thucydides_sents]


In [None]:
# make dataframe
# Starts empty; the 'class' and 'texts' columns are assigned in the next cell.

df = pd.DataFrame()

In [None]:
# fill dataframe

# Human-readable names for the numeric labels: index 0 -> label 0, index 1 -> label 1.
class_names = ['philosophy', 'historiography']

# Label 0 = Plato + Aristotle, label 1 = Herodotus + Thucydides.
philosophy_sents = plato_sents + aristotle_sents
historiography_sents = herodotus_sents + thucydides_sents

# Labels and texts are built in the same order so the rows stay aligned.
df['class'] = [0] * len(philosophy_sents) + [1] * len(historiography_sents)
df['texts'] = philosophy_sents + historiography_sents

In [None]:
# get summary info for class
# Number of sentences per label (0 = Plato + Aristotle, 1 = Herodotus + Thucydides).

df['class'].value_counts()

In [None]:
# stops

from collections import Counter

# Every whitespace-delimited token of every preprocessed sentence, in corpus order.
all_words = ' '.join(df['texts']).split()

# Token frequencies over the whole dataframe (training and test rows alike).
word_counts = Counter(all_words)

# The 50 most frequent tokens are used as the stop-word list.
STOPWORDS = [token for token, _ in word_counts.most_common(50)]

In [None]:
# make train/test splits

# Hold out 20% of the rows for testing. The split is stratified on the label
# column and uses a fixed random_state so it is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['texts'],
    df['class'],
    stratify=df['class'],
    test_size=0.2,
    random_state=42,
)

print('Size of Training Data ', len(X_train))
print('Size of Test Data ', len(X_test))

In [None]:
# build classifier pipeline

# Two-step pipeline: TF-IDF features (at most 10000, with the corpus stop words
# removed) feeding a linear classifier trained with SGD.
# NOTE: the second step keeps its original name 'svm', but with loss='log_loss'
# SGDClassifier fits a logistic-regression model (see the scikit-learn docs);
# predict_proba is called on this pipeline further down in the notebook.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=10000, stop_words=STOPWORDS)),
    ('svm', SGDClassifier(loss='log_loss', tol=1e-3, max_iter=1000, random_state=42)),
])

pipeline.fit(X_train, Y_train)

In [None]:
# Get accuracy
# Predict labels for the held-out test texts and compare them with the true labels.

Y_pred = pipeline.predict(X_test)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))

In [None]:
# compute confusion matrix
# Built from the true test labels and the predictions; shown here as a raw
# array and plotted in the following cell.

cm = confusion_matrix(Y_test, Y_pred)
cm

In [None]:
# plot confusion matrix
# nb: plot_confusion_matrix as shown in *Blueprints* is deprecated; use ConfusionMatrixDisplay instead as shown below [PJB 11.3.2023]
# The tick labels come from pipeline.classes_, i.e. the labels the classifier
# was fitted on (0 and 1), not the names in class_names.

CMD = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipeline.classes_)
CMD.plot(cmap='Blues');

In [None]:
# create explainer

# Seed the explainer so its perturbation sampling, and therefore the
# explanations, are reproducible across runs (same seed as the split and the classifier).
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# change X_test and Y_test to plain lists
# so that test documents and their labels can be looked up by position.

X_test_list = X_test.tolist()
Y_test_list = Y_test.tolist()

In [None]:
# write function for "verbose" explanation

def generate_explanation(idx, class_names=class_names):
    """Explain the pipeline's prediction for one test document and print a summary.

    Args:
        idx: Position of the document in the test set (index into
            `X_test_list` / `Y_test_list`).
        class_names: Two display names; element 0 is printed for label 0 and
            element 1 for any other label. Only used for the printed summary;
            the module-level `explainer` keeps its own class names.

    Returns:
        The explanation object returned by `explainer.explain_instance`,
        requested with `num_features=5`.
    """
    document = X_test_list[idx]
    exp = explainer.explain_instance(document, pipeline.predict_proba, num_features=5)

    print(f'Document id: {idx}')

    # Second column of predict_proba = probability assigned to class 1.
    header = f'Probability (0 = {class_names[0]}, 1 = {class_names[1]}) ='
    print(header, pipeline.predict_proba([document])[0, 1])

    true_label = Y_test.iloc[idx]
    true_name = class_names[1] if Y_test_list[idx] != 0 else class_names[0]
    print(f'True class: {true_label} ({true_name})')

    return exp

In [None]:
# generate an explanation example
# idx is a position in the test set (an index into X_test_list).

idx = 6
exp = generate_explanation(idx)

In [None]:
# Show explanation as list
# Presumably (feature, weight) pairs for the 5 features requested in
# generate_explanation -- confirm against the LIME documentation.

exp.as_list()

In [None]:
# Show explanation as barplot
# The returned figure is bound to `fig` rather than left as the cell's last expression.

fig = exp.as_pyplot_figure()

In [None]:
# Show explanation as annotated text
# NOTE: idx and exp are set again here with the same idx as the earlier example
# cell, so the explanation is recomputed before it is rendered.

idx = 6
exp = generate_explanation(idx)
exp.show_in_notebook(text = True)

In [None]:
# Generate random explanation from test set

import random

# Use a dedicated, seeded generator so the same test documents are drawn on
# every Restart & Run All (42 matches the seed used for the split and the classifier).
sample_rng = random.Random(42)

# Never ask for more documents than the test set holds (sample() would raise).
n_examples = min(5, len(X_test))

for idx in sample_rng.sample(range(len(X_test)), n_examples):
    exp = generate_explanation(idx)
    exp.show_in_notebook(text = True)
