## Chapter 3A Code Examples

This Jupyter notebook builds the classifiers demonstrated in Chapter 3A.
Run this notebook on the data sets provided in the book's supporting material first, then feel free to experiment with your own data or your own classification algorithms!


In [None]:
# Install pre-requisites used by this notebook
# Every notebook starts by installing the modules it needs!
!pip install scikit-learn==0.23.2
!pip install eli5==0.10.1
!pip install pandas==1.0.5
!pip install scipy==1.4.1

In [None]:
# After installing the libraries, you also need to import them into the notebook.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
import eli5
import scipy.sparse
import math
from IPython.display import display

# The eli5 library causes a warning from scikit-learn when you import it.
# It is safe to ignore while exploring this notebook.

The next cell disables vertical scrollbars in the notebook, which is useful because the cells below produce a lot of output.

In [None]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Some functions we will reuse throughout all examples

def read_data_from_csv_file(path:str):
    '''
    Loads a header-less CSV file into a pandas dataframe.

    The columns are named "utterance" and "label", in that order.

    path: location of the CSV file to read.
    Returns the dataframe produced by pandas.read_csv.
    '''
    column_names = ["utterance", "label"]
    return pd.read_csv(path, names=column_names)

def convert_data_for_target_class(target_class: str, all_data: pd.DataFrame):
    '''
    Collapses a multi-class dataframe into a two-class one for a single target class.

    Every label equal to target_class is kept; every other label is rewritten to
    "negative_example". For example, with classes "reset_password", "store_hours"
    and "store_location" and target class "reset_password", the result contains
    only the labels "reset_password" and "negative_example".

    target_class: the label to keep as the positive class.
    all_data: dataframe with a "label" column; it is not modified.
    Returns a relabeled copy of all_data.
    '''
    original_labels = all_data["label"]
    is_target = original_labels == target_class

    relabeled = all_data.copy()
    # Keep the label where it matches the target, substitute the negative label elsewhere
    relabeled["label"] = original_labels.where(is_target, "negative_example")
    return relabeled

def train_binary_classifier_pipeline(target_class: str, data: pd.DataFrame):
    '''
    Trains a binary classifier that detects a single target_class.

    The training data is first relabeled so that every class other than
    target_class becomes "negative_example", then a bag-of-words vectorizer and a
    logistic regression classifier are fitted together as one pipeline.

    target_class: the label the classifier should learn to detect.
    data: training data with "utterance" and "label" columns.
    Returns a tuple (vectorizer, classifier, pipeline). The vectorizer and the
    classifier are the same objects that were placed in the pipeline.
    '''
    relabeled_data = convert_data_for_target_class(target_class, data)

    # CountVectorizer turns a text phrase into an array of numbers by counting how many times each word shows up.
    # Very common words ("stop words") are assumed to carry little signal for classification,
    # so they are often removed; here the stop word list contains only "the".
    vectorizer = CountVectorizer(stop_words=["the"])
    classifier = LogisticRegressionCV(max_iter=500, cv=3)

    binary_classifier_pipeline = make_pipeline(vectorizer, classifier)
    binary_classifier_pipeline.fit(relabeled_data["utterance"], relabeled_data["label"])

    return vectorizer, classifier, binary_classifier_pipeline

In [None]:
# Training the simplest binary classifier - just two classes and we only train one classifier (for reset_password)

training_data = read_data_from_csv_file("training_2class_small.csv")

print("Training data for this example")
display(training_data)

# First we demonstrate a single binary classifier before demonstrating several classifiers.
# Since each classifier has the same training data and pipeline construction, they will each have the same features.
vec, clf, pipe = train_binary_classifier_pipeline("reset_password", training_data)
print()
print("List of features used in the bag of words classifier:")
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
# (get_feature_names is the accessor available in the scikit-learn version pinned by this notebook.)
feature_names = vec.get_feature_names()
print([[index, name] for index, name in enumerate(feature_names)])
print()

# Exploring the features produced for a single utterance
# The vectorizer's transform returns a SciPy sparse matrix, which stores only the non-zero
# word counts and is therefore the memory-efficient representation for large vocabularies.
# A sparse matrix cannot be displayed as a table directly, so we wrap it in a pandas
# dataframe (backed by sparse columns) purely for ease of visualization.

reset_my_password_sparse_features = pd.DataFrame.sparse.from_spmatrix(
    vec.transform(["reset my password"])
)

how_late_are_you_open_sparse_features = pd.DataFrame.sparse.from_spmatrix(
    vec.transform(["how late are you open"])
)

# Stack the two single-row frames so both utterances appear in one table
combined_df = pd.concat(
    [reset_my_password_sparse_features, how_late_are_you_open_sparse_features]
)

# Hand-written target column shown next to the features
# (presumably 1 = the reset_password example, 0 = the other example)
combined_df["Output"] = [1, 0]
combined_df.insert(
    0, "Utterance", ["reset my password", "how late are you open"]
)

print()
print("Features extracted from multiple inputs:")
# Both rows carry index 0 after the concat, so renumber them for display
display(combined_df.reset_index(drop=True))

print("Feature weights")
display(eli5.show_weights(clf, vec=vec, top=20, target_names=pipe.classes_))

In [None]:
# Computing probabilities
print("Exploring the phrase 'reset my password' with a binary classifier trained on two examples")
# We add 0.189 for the bias parameter, and each word of "reset", "my", "password" has equal weight (0.378)
# These numbers are copied by hand from the feature weights displayed for the binary classifier.
reset_password_sigmoid_score = 0.189+0.378+0.378+0.378

# The score itself is the log-odds (the logit); exponentiating it gives the odds.
odds = math.exp(reset_password_sigmoid_score)

# First method of computing probability: the sigmoid function applied to the score
prob_from_sigmoid = 1 / (1 + math.exp(-1 * reset_password_sigmoid_score))
# Second method of computing probability: odds / (1 + odds)
prob_from_odds = odds / (1 + odds)

# Both formulas are algebraically the same, so they must agree up to rounding error
assert math.isclose(prob_from_sigmoid, prob_from_odds)
prob = prob_from_odds

print(f"sigmoid score: {reset_password_sigmoid_score}")
print(f"odds: {odds}")
print(f"probability: {prob}")

# The probabilities above should match the prediction generated from the classifier
print()
print("Predicting the phrase 'reset my password' with binary classifier")
display(eli5.show_prediction(clf, "reset my password", vec=vec, show_feature_values=True))

In [None]:
# Now let's train and test a series of binary classifiers.
# Three classifiers are trained here: reset_password, store_hours, store_location
# For each one we look at the learned weights and at its prediction for "reset my password"
print("Training three binary classifiers, each with one (repeated) example")
training_data = read_data_from_csv_file("training_3class_small.csv")

print("Training data for this example - just one example for each class (repeated multiple times)")
display(training_data)

print("Training three classifiers")
(reset_password_vec, reset_password_clf, reset_password_pipe) = train_binary_classifier_pipeline(
    "reset_password", training_data
)
(store_hours_vec, store_hours_clf, store_hours_pipe) = train_binary_classifier_pipeline(
    "store_hours", training_data
)
(store_location_vec, store_location_clf, store_location_pipe) = train_binary_classifier_pipeline(
    "store_location", training_data
)

# Group each classifier's parts so the reporting below can loop instead of repeating itself
trained_classifiers = [
    ("reset_password", reset_password_vec, reset_password_clf, reset_password_pipe),
    ("store_hours", store_hours_vec, store_hours_clf, store_hours_pipe),
    ("store_location", store_location_vec, store_location_clf, store_location_pipe),
]

print("Examining the feature weights learned by each classifier")
for class_name, class_vec, class_clf, class_pipe in trained_classifiers:
    display(eli5.show_weights(class_clf, vec=class_vec, top=20, target_names=class_pipe.classes_))

for class_name, class_vec, class_clf, class_pipe in trained_classifiers:
    print(f"Predicting the phrase 'reset my password' with {class_name} classifier")
    display(eli5.show_prediction(class_clf, "reset my password", vec=class_vec, show_feature_values=True))

In [None]:
# All-in-one classifier
print("Training an all-in-one classifier on a small data set")

#CH3A : Smallest 3 class
training_data = read_data_from_csv_file("training_3class_small.csv")

print("Training data for this example")
display(training_data)

# Inputs (utterances) and targets (labels) for a single multi-class model
X, Y = training_data["utterance"], training_data["label"]

# CountVectorizer converts each phrase into word counts. Very common words ("stop words")
# are assumed to add little signal for classification, so they are often dropped;
# here the stop word list contains only "the".
vec = CountVectorizer(stop_words=["the"])
clf = LogisticRegressionCV(max_iter=500, cv=3)
pipe = make_pipeline(vec, clf)
pipe.fit(X, Y)
print("An all-in-one classifier was trained.")


#WARNING - these weights are for an all-in-one classifier, NOT the binary classifiers!!
print("The all-in-one classifier learned these weights from the training data")
display(eli5.show_weights(clf, vec=vec, top=10, target_names=pipe.classes_))

# # If you are NOT in Jupyter notebook, you can print this way instead
# explanation = eli5.explain_weights(clf, vec=vec, top=10, target_names=pipe.classes_)
# print(eli5.formatters.text.format_as_text(explanation))

print()
print("Predictions from an all-in-one classifier.")
print("Since the classifier is all-in-one, the probabilities will add up to 1.")
for position, phrase in enumerate(["reset my password", "tell me a joke"]):
    # A blank line separates the second phrase's report from the first
    if position > 0:
        print()
    print(f"Predictions from '{phrase}'")
    display(eli5.show_prediction(clf, phrase, vec=vec, show_feature_values=True))

In [None]:
# Now let's retrain the three binary classifiers with more training data
# Rather than a single example per class (repeated several times),
# the training set used here is more varied.
# As before, we look at the learned weights and at each prediction for "reset my password"
print("Training three binary classifiers, each with several examples")
training_data = read_data_from_csv_file("training_3class_medium.csv")

print("Training data for this example")
display(training_data)

print("Training three classifiers")
(reset_password_vec, reset_password_clf, reset_password_pipe) = train_binary_classifier_pipeline(
    "reset_password", training_data
)
(store_hours_vec, store_hours_clf, store_hours_pipe) = train_binary_classifier_pipeline(
    "store_hours", training_data
)
(store_location_vec, store_location_clf, store_location_pipe) = train_binary_classifier_pipeline(
    "store_location", training_data
)

# Group each classifier's parts so the reporting below can loop instead of repeating itself
trained_classifiers = [
    ("reset_password", reset_password_vec, reset_password_clf, reset_password_pipe),
    ("store_hours", store_hours_vec, store_hours_clf, store_hours_pipe),
    ("store_location", store_location_vec, store_location_clf, store_location_pipe),
]

print("Examining the feature weights learned by each classifier")
for class_name, class_vec, class_clf, class_pipe in trained_classifiers:
    display(eli5.show_weights(class_clf, vec=class_vec, top=20, target_names=class_pipe.classes_))

for class_name, class_vec, class_clf, class_pipe in trained_classifiers:
    print(f"Predicting the phrase 'reset my password' with {class_name} classifier")
    display(eli5.show_prediction(class_clf, "reset my password", vec=class_vec, show_feature_values=True))

In [None]:
# Alternate exploration of the three classifiers on a variety of input phrases.
# Things to try: a different input file, different test utterances, different output types.
training_data = read_data_from_csv_file("training_3class_small.csv")

print("Training data for this example")
display(training_data)

# Phrases intended to match examples in the training data (per the variable name)
test_utterances_from_training = ["reset my password", "Where are you located?", "How late are you open"]

# Phrases intended to be new to the classifiers.
# Try adding new phrases to this list and see what happens!
test_unseen_utterances = ["I can't login", "I can't remember my password", "When do you close", "tell me a joke"]

def multi_predict_and_score(one_utterance: str, classifier_pipelines: dict):
    '''
    Runs a single text utterance through every classifier pipeline and prints,
    for each pipeline, the probability it assigns to each of its classes.

    one_utterance: the text phrase to classify.
    classifier_pipelines: maps a target class name to a fitted pipeline that
        provides predict_proba() and a classes_ attribute.
    Returns nothing; all results are printed.
    '''
    print(f"Utterance '{one_utterance}' predictions with multiple classifiers")
    # Only the pipelines are needed here; dicts iterate in insertion order
    for classifier_pipeline in classifier_pipelines.values():
        # One row of probabilities, one column per class known to this pipeline
        probabilities = pd.DataFrame(
            classifier_pipeline.predict_proba([one_utterance]),
            columns=classifier_pipeline.classes_,
        )

        # This makes the print-outs look nicer
        probabilities.columns = probabilities.columns.str.pad(
            width=14, side="left", fillchar=" "
        )

        print(
            probabilities.to_string(index=False), "\n",
        )
# Builds and tests multiple classifiers: one binary classifier per label found in the training data
target_classes = sorted(training_data["label"].unique())
classifier_pipelines = {}
for target_class in target_classes:
    print(
        "**********************************************************************",
        f"\nDemonstrating a binary classifier for class {target_class}",
    )

    vec, clf, pipe = train_binary_classifier_pipeline(target_class, training_data)
    print("Predicting the phrase 'reset my password'")
    display(eli5.show_prediction(clf, "reset my password", vec=vec, show_feature_values=True))

    # Keep the fitted pipeline so every utterance below can be scored by every classifier
    classifier_pipelines[target_class] = pipe

# The evaluations below run each utterance through ALL trained classifiers, so the
# headings name the full set of classes rather than the loop variable left over from
# the training loop (which would only ever name the last class).
all_class_names = ", ".join(str(name) for name in target_classes)

print(
    f"\nEvaluating examples from the original training data against all classifiers ({all_class_names})"
    + "\n(Note that scikit-learn does NOT perfectly remember training data.)",
    "\n",
    "------------------------------------------------------------------------",
)

for utterance in test_utterances_from_training:
    multi_predict_and_score(utterance, classifier_pipelines)

print(
    f"\nEvaluating brand new examples not found in the original training data against all classifiers ({all_class_names}).",
    "\n",
    "------------------------------------------------------------------------",
)

for utterance in test_unseen_utterances:
    multi_predict_and_score(utterance, classifier_pipelines)
