# TAHLR Week 9d: Text Classification Algorithms

Code notebook for the TAHLR course at ISAW (Fall 2023), including a streamlined example of notebook 9d.

In [None]:
# # installs
# # uncomment and install if necessary

# !python -m pip install -U scikit-learn
# !python -m pip install lime
# !python -m pip install git+https://github.com/diyclassics/cltk_readers.git#egg=cltkreaders
# !python -m pip install -U ipywidgets

In [None]:
# imports

import random

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from lime.lime_text import LimeTextExplainer

from cltkreaders.grc import GreekTesseraeCorpusReader

In [None]:
# get corpus readers/files
# one reader instance serves both epics; file ids are selected with the `match` argument

GCR = GreekTesseraeCorpusReader()

iliad, odyssey = (GCR.fileids(match=title) for title in ('iliad', 'odyssey'))

# unpack into concrete lists so the sentences can be counted and reused in later cells
iliad_sents = [*GCR.sents(iliad)]
odyssey_sents = [*GCR.sents(odyssey)]

In [None]:
# preprocess texts

def preprocess(text):
    """Return `text` with all combining marks (diacritics) removed.

    The string is decomposed with NFD so that each diacritic becomes a
    separate combining character, every character whose Unicode category is
    'Mn' (nonspacing mark) is dropped, and the remainder is recomposed with NFC.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return unicodedata.normalize('NFC', ''.join(base_chars))

# replace each sentence list with its diacritic-free counterpart
iliad_sents = list(map(preprocess, iliad_sents))
odyssey_sents = list(map(preprocess, odyssey_sents))

In [None]:
# make dataframe
# starts empty; the 'class' and 'texts' columns are assigned in the next cell

df = pd.DataFrame()

In [None]:
# fill dataframe
# label 0 marks an Iliad sentence and label 1 an Odyssey sentence,
# so class_names[label] gives the readable name for a label

class_names = ['iliad', 'odyssey']
df['class'] = [0] * len(iliad_sents) + [1] * len(odyssey_sents)
df['texts'] = iliad_sents + odyssey_sents

In [None]:
# get summary info for class
# number of sentences carrying each label (0 = iliad, 1 = odyssey), i.e. how balanced the two classes are

df['class'].value_counts()

In [None]:
# make train/test splits
# 20% of the sentences are held out for testing; stratifying on the label keeps
# the class proportions the same in both parts, and random_state fixes the shuffle

X_train, X_test, Y_train, Y_test = train_test_split(
    df['texts'],
    df['class'],
    stratify=df['class'],
    test_size=0.2,
    random_state=42,
)

print('Size of Training Data ', len(X_train))
print('Size of Test Data ', len(X_test))

In [None]:
# build classifier pipeline
# step 'tfidf': TF-IDF features; min_df=2 ignores terms seen in fewer than 2 training
#               documents and max_features=1000 caps the vocabulary size
# step 'svm':   linear classifier trained with stochastic gradient descent
# NOTE(review): despite the step name, loss='log_loss' fits a logistic-regression
# objective rather than an SVM (hinge) one; the later LIME cells call
# pipeline.predict_proba, which relies on this probabilistic loss

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(min_df = 2, max_features=1000)),
    ('svm', SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42))
])

# fitting trains the vectorizer and the classifier on the training sentences only
pipeline.fit(X_train, Y_train)

In [None]:
# Get accuracy
# Y_pred is kept as a variable because the confusion-matrix cell below reuses it

Y_pred = pipeline.predict(X_test)
print('Accuracy Score - ', accuracy_score(y_true=Y_test, y_pred=Y_pred))

In [None]:
# compute confusion matrix
# rows are the true labels and columns the predicted labels, both in label order (0, 1)

cm = confusion_matrix(Y_test, Y_pred)
cm

In [None]:
# make confusion matrix
# nb: plot_confusion_matrix as shown in *Blueprints* is deprecated; use ConfusionMatrixDisplay instead as shown below [PJB 11.3.2023]
# tick labels use the readable names from class_names (looked up for each fitted label)
# instead of the bare 0/1 codes, so the figure can be read on its own

CMD = ConfusionMatrixDisplay(confusion_matrix=cm,
                             display_labels=[class_names[label] for label in pipeline.classes_])
CMD.plot(cmap='Blues');

In [None]:
# create explainer
# LIME explains a prediction by sampling random perturbations of the text; random_state
# seeds that sampling so a fresh "Run All" reproduces the same explanations, in line with
# the random_state=42 already used for the train/test split and the classifier

explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# change X_test and Y_test to plain lists
# the explanation function indexes these by position (idx = 0 .. len - 1)
# NOTE(review): presumably needed because the Series returned by train_test_split keep
# the original dataframe index labels, so X_test[idx] would not be positional — confirm

X_test_list = X_test.tolist()
Y_test_list = Y_test.tolist()

In [None]:
# write function for "verbose" explanation

def generate_explanation(idx, class_names=class_names):
    """Explain the classifier's prediction for one test document and print a summary.

    Parameters
    ----------
    idx : int
        Position of the document in `X_test_list` / `Y_test_list`.
    class_names : list of str, optional
        Readable names indexed by class label. Used only for the printed
        summary; the module-level `explainer` was created with its own names.

    Returns
    -------
    The object returned by `explainer.explain_instance` for the document,
    requested with `num_features = 5`.

    Raises
    ------
    IndexError
        If `idx` is outside the test lists.
    """
    text = X_test_list[idx]
    true_label = Y_test_list[idx]

    exp = explainer.explain_instance(text, pipeline.predict_proba, num_features = 5)

    print(f'Document id: {idx}')
    # [0, 1] selects, for the single document passed in, the probability in column 1 (class 1)
    print(f'Probability (0 = {class_names[0]}, 1 = {class_names[1]}) =', pipeline.predict_proba([text])[0,1])
    # look the name up by label instead of testing for 0, so it stays correct with more than two classes
    print(f'True class: {true_label} ({class_names[true_label]})')
    return exp

In [None]:
# generate an example explanation
# idx is a position in the test lists; generate_explanation prints a short summary
# and returns the explanation object used by the following cells

idx = 0
exp = generate_explanation(idx)

In [None]:
# Show explanation as list
# uses the `exp` object produced by the previous cell

exp.as_list()

In [None]:
# Show explanation as barplot
# the returned figure is bound to `fig` so it can be reused after the cell runs

fig = exp.as_pyplot_figure()

In [None]:
# Show explanation as annotated text
# NOTE(review): this calls generate_explanation again for the same idx, so the summary is
# printed a second time and `exp` is replaced by a newly computed explanation —
# presumably so the cell can be run on its own; confirm

idx = 0
exp = generate_explanation(idx)
exp.show_in_notebook(text = True)

In [None]:
# Generate random explanation from test set
# A dedicated, seeded generator picks the same documents on every run (matching the
# random_state=42 used for the split and the classifier) without touching the global
# `random` state. `random` is imported in the imports cell at the top of the notebook.

rng = random.Random(42)

# min(...) guards against random.sample raising ValueError when fewer than 5 test documents exist
for idx in rng.sample(range(len(X_test)), min(5, len(X_test))):
    exp = generate_explanation(idx)
    exp.show_in_notebook(text = True)
