# Filter classes by accuracy order - Jeopardy dataset
- Object of this notebook is to Filter Classes
  - order classes by SVC class accuracy
  - drop most accurate classes
  - Get ballpark of 80% accuracy and 90 examples

In [1]:
from abc import ABC, abstractmethod
import gzip
from IPython.display import display, HTML
import collections
import json
import math
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import time
from typing import List

pd.options.display.max_colwidth = 100

%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

#### Load workspace dataset

In [2]:
# Load the Jeopardy workspace (one row per example with its intent).
# The CSV acts as a cache: when it is missing it is rebuilt from the raw
# gzipped JSON question dump and written back for the next run.
csv_file = '../../../data/jeopardy/jeopardy_200.csv'
if os.path.exists(csv_file):
    df_ws = pd.read_csv(csv_file)
else:
    print(f'file not found = {csv_file}')
    # Load the json file (text mode lets json read the stream directly)
    json_gzip_file = '../../../data/jeopardy/JEOPARDY_QUESTIONS1.json.gzip'
    with gzip.open(json_gzip_file, 'rt', encoding='utf-8') as fin:
        questions_list = json.load(fin)

    # Gather the most common categories
    categories = collections.Counter(q['category'] for q in questions_list)
    common_categories = categories.most_common(201)
    top_intents = [c[0] for c in common_categories]
    print(f'len(top_intents) = {len(top_intents)}')

    # Create the example and intent from the top most categories.
    # Note: category 'CROSSWORD CLUES "F"' is composed of questions with '<'
    #       thus get 201 categories to end up with 200
    # A set keeps the per-question category lookup constant-time.
    top_intent_set = set(top_intents)
    data = [
        {'intent': q['category'].replace('&', 'AND'), 'example': f'{q["question"]} {q["answer"]}'}
        for q in questions_list
        if '<' not in q['question'] and '<' not in q['answer'] and q['category'] in top_intent_set
    ]
    df_ws = pd.DataFrame(data)
    df_ws.to_csv(csv_file, index=False)

print(f'n_intents   = {len(df_ws["intent"].unique())}')
print(f'df_ws.shape = {df_ws.shape}')

n_intents   = 200
df_ws.shape = (32623, 2)


In [3]:
# Pull the raw example text (x) and the intent labels (y) out of the
# workspace frame as parallel numpy arrays.
x = df_ws['example'].to_numpy()
y = df_ws['intent'].to_numpy().ravel()
print(f'x.shape        = {x.shape}', f'y.shape        = {y.shape}', sep='\n')

x.shape        = (32623,)
y.shape        = (32623,)


#### Encode with the MiniLM sentence encoder (paraphrase-MiniLM-L6-v2)

In [4]:
%%time
class MiniLMEmbedding:
    """Sentence encoder built on the 'paraphrase-MiniLM-L6-v2' SentenceTransformer."""

    def __init__(self):
        # Construct the SentenceTransformer for the named model.
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def encode(self, input_sentences: List[str]) -> np.ndarray:
        """Embed a collection of sentences.

        Each sentence is lower-cased and embedded individually through
        ``embed_sentence``; the per-sentence results are stacked with
        ``np.array`` in input order.

        Args:
            input_sentences: iterable of strings to embed.

        Returns:
            Array whose first axis indexes the input sentences.
        """
        return np.array([self.embed_sentence(sentence.lower()) for sentence in input_sentences])

    def embed_sentence(self, sentence: str) -> np.ndarray:
        """Return the transformer's numpy embedding for a single sentence."""
        return self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)

# Embeddings are cached on disk as a header-less CSV. Compute and store them
# only when the cache file is absent; otherwise read them straight back.
encoded_file = '../../../data/jeopardy/x_encoded.csv'
if not os.path.exists(encoded_file):
    encoder = MiniLMEmbedding()
    x_encoded = encoder.encode(x)
    # Save to file
    df = pd.DataFrame(x_encoded)
    df.to_csv(encoded_file, header=False, index=False)
else:
    df = pd.read_csv(encoded_file, header=None)
    x_encoded = df.to_numpy()

print(f'x_encoded.shape = {x_encoded.shape}')

x_encoded.shape = (32623, 384)
CPU times: user 2.57 s, sys: 1.59 s, total: 4.17 s
Wall time: 6.08 s


#### Run a cross validation on SVM classifiers
- Split the combined (x_encoded) dataset into 7 splits
- Each train (x_trn) is ~4,660 (32,623/7)
- Each test (x_tst) is ~27,963 (32,623 * 6/7)
- Score the class accuracy of each cross split

In [5]:
# aimtk (AI Metrics Toolkit)
# Classes from aimtk which score class accuracy
class Metric(ABC):
    """Abstract interface for a metric evaluated on a fitted model and test data."""

    @abstractmethod
    def score(self, model, X_test, y_test, **kwargs):
        """Return the value of the metric with respect to the test data."""

    @abstractmethod
    def __str__(self):
        """Return a unique metric name."""


class PerClassAccuracyScoresMetric(Metric):
    """Per-class accuracy: for each true class, the fraction of its test
    examples that the model predicted correctly."""

    def __init__(self):
        super().__init__()

    def score(self, model, X_test, y_test, class_name=None, **kwargs):
        """Score the model's accuracy separately for every class.

        Args:
            model: object exposing ``predict(X_test)`` and a ``classes_``
                attribute.
            X_test: inputs handed to ``model.predict``.
            y_test: true labels, one per prediction.
            class_name: when given, only that class's score is returned.

        Returns:
            A dict mapping class label to accuracy when ``class_name`` is
            None, otherwise the single score for ``class_name``. Classes in
            ``model.classes_`` that do not occur in ``y_test`` score 0.

        Raises:
            ValueError: if the number of predictions differs from the number
                of true labels.
            KeyError: if ``class_name`` is neither in ``y_test`` nor in
                ``model.classes_``.
        """
        y_true = np.asarray(y_test).ravel()
        y_pred = np.asarray(model.predict(X_test)).ravel()
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f'Found {y_pred.shape[0]} predictions for {y_true.shape[0]} true labels')

        # Group by the TRUE label only. Building the label set from the true
        # labels keeps every score attached to its own class even when the
        # model predicts a class that never occurs in y_test.
        labels, inverse = np.unique(y_true, return_inverse=True)
        n_labels = len(labels)
        support = np.bincount(inverse, minlength=n_labels)
        hits = np.bincount(inverse, weights=(y_pred == y_true).astype(float), minlength=n_labels)

        # support is >= 1 for every label taken from y_true, so no NaN can arise.
        acc_scores = dict(zip(labels.tolist(), (hits / support).tolist()))

        # add any missing scores
        for c in model.classes_:
            acc_scores.setdefault(c, 0)

        if class_name is None:
            return acc_scores
        return acc_scores[class_name]

    def __str__(self):
        return "PerClassAccuracy"

In [6]:
%%time
# cross validation split to gather class accuracies
n_splits = 7
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

# One record per intent: {'intent': name, 'run <i> accuracy': score, ...}
runs = {}
run = 0
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
# (skf.split yields (large part, held-out fold); the large part is used as test)
for tst_index, trn_index in skf.split(x_encoded, y):
    x_trn, y_trn = x_encoded[trn_index], y[trn_index]
    x_tst, y_tst = x_encoded[tst_index], y[tst_index]

    start = time.time()
    # NOTE(review): only predict() is used in this cell; probability=True is
    # kept in case other cells need predict_proba on `model` - confirm before dropping.
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'run={run} fit()   dur={time.time() - start}')

    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    # setdefault creates the record on first sight of an intent, so a class
    # first reported by a later run cannot raise a KeyError.
    for k, v in scores.items():
        runs.setdefault(k, {'intent': k})[f'run {run} accuracy'] = v
    print(f'run={run} score() dur={time.time() - start}')
    run += 1

run=0 fit()   dur=50.18264102935791
run=0 score() dur=120.30399370193481
run=1 fit()   dur=45.836108922958374
run=1 score() dur=118.04988622665405
run=2 fit()   dur=46.50671672821045
run=2 score() dur=114.67674517631531
run=3 fit()   dur=51.217140674591064
run=3 score() dur=117.45987701416016
run=4 fit()   dur=42.91872429847717
run=4 score() dur=118.99607110023499
run=5 fit()   dur=48.0297646522522
run=5 score() dur=122.12489914894104
run=6 fit()   dur=43.84250044822693
run=6 score() dur=120.09338974952698
CPU times: user 19min 20s, sys: 600 ms, total: 19min 20s
Wall time: 19min 20s


#### Gather class accuracies

In [13]:
# Build one row per intent with its accuracy in every cross-validation run.
df_intent_runs = pd.DataFrame(list(runs.values()))
run_cols = [f'run {i} accuracy' for i in range(n_splits)]

# Add mean accuracy for each class across the runs
df_intent_runs['mean accuracy'] = df_intent_runs[run_cols].mean(axis=1)

# Sort high to low class accuracies (ties broken alphabetically by intent)
df_intent_runs = df_intent_runs.sort_values(
    by=['mean accuracy', 'intent'], ascending=[False, True], ignore_index=True)

# Display the per-class accuracies for every run.
# Swap in .head() / .tail() on the frame to shorten the rendered table.
fmt = {col: '{:,.2%}'.format for col in run_cols}
fmt['mean accuracy'] = '{:,.2%}'.format
display(HTML(df_intent_runs.to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy': '{:,.2%}'.format}
display(HTML(df_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run.
# Selecting a single column yields a Series whose .mean() is already a scalar,
# avoiding the deprecated float() call on a one-element Series.
data = [{'run': i, 'accuracy': f'{df_intent_runs[col].mean():.2%}'} for i, col in enumerate(run_cols)]
data.append({'run': 'mean', 'accuracy': f'{df_intent_runs["mean accuracy"].mean():.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,mean accuracy
0,LANGUAGES,89.12%,87.87%,87.03%,90.38%,89.54%,88.70%,90.42%,89.01%
1,SPORTS,82.76%,79.66%,79.66%,75.86%,80.00%,83.04%,86.16%,81.02%
2,COLLEGES AND UNIVERSITIES,76.85%,78.52%,77.52%,79.80%,83.50%,82.15%,78.11%,79.49%
3,ANIMALS,76.52%,78.41%,81.06%,81.44%,75.38%,76.89%,79.55%,78.46%
4,FIRST LADIES,77.55%,74.15%,79.59%,79.59%,71.92%,76.03%,71.23%,75.72%
5,THE BIBLE,73.57%,74.89%,66.52%,77.63%,70.93%,80.18%,74.45%,74.02%
6,MUSICAL INSTRUMENTS,71.51%,72.09%,70.93%,67.44%,77.33%,76.88%,73.99%,72.88%
7,BALLET,73.86%,72.20%,70.54%,75.10%,69.71%,70.42%,71.78%,71.94%
8,ASTRONOMY,76.40%,64.60%,66.88%,75.62%,71.88%,78.75%,63.12%,71.03%
9,BODIES OF WATER,73.26%,78.02%,64.34%,71.32%,71.69%,70.33%,63.37%,70.33%


Unnamed: 0,intent,mean accuracy
190,POP CULTURE,0.00%
191,PRESIDENTS,0.00%
192,QUASI-RELATED PAIRS,0.00%
193,QUOTES,0.00%
194,RIVERS,0.00%
195,THE 20th CENTURY,0.00%
196,THE OLD TESTAMENT,0.00%
197,U.S.A.,0.00%
198,WHERE AM I?,0.00%
199,WORLD LITERATURE,0.00%


Unnamed: 0,run,accuracy
0,0,19.10%
1,1,19.20%
2,2,19.29%
3,3,18.84%
4,4,19.15%
5,5,18.93%
6,6,18.71%
7,mean,19.03%


#### Determine which intents to keep and remove from jeopardy dataset
- The representative dataset needs 92 classes, so build a keep list of the 92 classes with the highest mean accuracy.
- Since the accuracies were so low, keep the highest 92.

In [8]:
keep_count = 92

# df_intent_runs is sorted from highest to lowest mean accuracy, so the first
# keep_count intents are kept and every intent after them is removed.
ranked_intents = df_intent_runs['intent'].to_numpy(copy=True)

# Gather the classes to keep.
# Dump into a list which can easily be copied into another notebook
keep_intents = ranked_intents[:keep_count]
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep
keep_indices = np.flatnonzero(np.isin(y, keep_intents)).tolist()
print(f'len(keep_indices) = {len(keep_indices)}')

# Gather the classes to be removed.
# Slicing stays empty when there are fewer than keep_count intents, whereas
# tail() with a negative count would hand back rows that are also kept.
remove_intents = ranked_intents[keep_count:]
print(f'len(remove_intents) = {len(remove_intents)}')
print(f'remove_intents = {list(remove_intents)}')
# Gather the example indices for the examples to remove
remove_indices = np.flatnonzero(np.isin(y, remove_intents)).tolist()
print(f'len(remove_indices) = {len(remove_indices)}')

len(keep_intents) = 92
keep_intents = ['LANGUAGES', 'SPORTS', 'COLLEGES AND UNIVERSITIES', 'ANIMALS', 'FIRST LADIES', 'THE BIBLE', 'MUSICAL INSTRUMENTS', 'BALLET', 'ASTRONOMY', 'BODIES OF WATER', 'RELIGION', 'BUSINESS AND INDUSTRY', 'MUSEUMS', 'SHAKESPEARE', 'POTENT POTABLES', 'LITERATURE', 'HOLIDAYS AND OBSERVANCES', 'WORLD CAPITALS', 'FRUITS AND VEGETABLES', 'ISLANDS', 'AMERICAN HISTORY', 'FOOD', 'OPERA', 'U.S. CITIES', 'MOUNTAINS', 'POP MUSIC', 'LIBRARIES', 'U.S. PRESIDENTS', 'SCIENCE', 'MYTHOLOGY', 'TRANSPORTATION', 'BEFORE AND AFTER', 'WORD ORIGINS', 'ARCHITECTURE', 'WEIGHTS AND MEASURES', 'ART AND ARTISTS', 'MAGAZINES', 'ORGANIZATIONS', 'U.S. GEOGRAPHY', 'WORLD HISTORY', 'STATE CAPITALS', 'POETS AND POETRY', 'CLASSICAL MUSIC', 'FASHION', 'THE CIVIL WAR', 'ANNUAL EVENTS', 'HOMOPHONES', 'AWARDS', 'TELEVISION', 'THE BODY HUMAN', 'EXPLORERS', 'FLAGS', 'BOTANY', 'TREES', 'RHYME TIME', 'MEDICINE', 'WORLD GEOGRAPHY', 'QUOTATIONS', 'BIRDS', 'NATIONAL PARKS', 'THE OSCARS', 'SCIENTISTS', '

#### Run a cross validation on the subset x and y to determine class accuracy

In [9]:
# Restrict the embeddings and labels to the examples of the kept intents.
x_sub_encoded = x_encoded[keep_indices]
y_sub = y[keep_indices]
print(f'y_sub.shape         = {y_sub.shape}', f'x_sub_encoded.shape = {x_sub_encoded.shape}', sep='\n')

y_sub.shape         = (20035,)
x_sub_encoded.shape = (20035, 384)


In [10]:
%%time
# Repeat the class-accuracy cross validation on the kept-intent subset.
n_sub_splits = 7
skf = StratifiedKFold(n_splits=n_sub_splits, random_state=42, shuffle=True)

# One record per intent: {'intent': name, 'run <i> accuracy': score, ...}
sub_runs = {}
run = 0
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
# (skf.split yields (large part, held-out fold); the large part is used as test)
for tst_index, trn_index in skf.split(x_sub_encoded, y_sub):
    x_trn, y_trn = x_sub_encoded[trn_index], y_sub[trn_index]
    x_tst, y_tst = x_sub_encoded[tst_index], y_sub[tst_index]

    start = time.time()
    # NOTE(review): only predict() is used in this cell; probability=True is
    # kept in case other cells need predict_proba on `model` - confirm before dropping.
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'fit() dur={time.time() - start}')

    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    # setdefault creates the record on first sight of an intent, so a class
    # first reported by a later run cannot raise a KeyError.
    for k, v in scores.items():
        sub_runs.setdefault(k, {'intent': k})[f'run {run} accuracy'] = v
    print(f'score()s dur={time.time() - start}')
    run += 1

fit() dur=19.63417148590088
score()s dur=33.547220945358276
fit() dur=17.06532597541809
score()s dur=33.53352928161621
fit() dur=20.032387733459473
score()s dur=34.92536926269531
fit() dur=17.508482456207275
score()s dur=31.681472063064575
fit() dur=17.591039657592773
score()s dur=33.22845983505249
fit() dur=18.841301202774048
score()s dur=34.899332761764526
fit() dur=18.868735313415527
score()s dur=36.901384592056274
CPU times: user 6min 8s, sys: 66.4 ms, total: 6min 8s
Wall time: 6min 8s


#### Gather the class accuracies on the subset x and y

In [11]:
# Build one row per kept intent with its accuracy in every cross-validation run.
df_sub_intent_runs = pd.DataFrame(list(sub_runs.values()))
sub_run_cols = [f'run {i} accuracy' for i in range(n_sub_splits)]

# Add mean accuracy for each class across the runs
df_sub_intent_runs['mean accuracy'] = df_sub_intent_runs[sub_run_cols].mean(axis=1)

# Sort high to low class accuracies (ties broken alphabetically by intent)
df_sub_intent_runs = df_sub_intent_runs.sort_values(
    by=['mean accuracy', 'intent'], ascending=[False, True], ignore_index=True)

# Display the top and low accuracies
fmt = {col: '{:,.2%}'.format for col in sub_run_cols}
fmt['mean accuracy'] = '{:,.2%}'.format
display(HTML(df_sub_intent_runs.head().to_html(formatters=fmt)))
display(HTML(df_sub_intent_runs.tail().to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy': '{:,.2%}'.format}
display(HTML(df_sub_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run.
# Selecting a single column yields a Series whose .mean() is already a scalar,
# avoiding the deprecated float() call on a one-element Series.
data = [{'run': i, 'accuracy': f'{df_sub_intent_runs[col].mean():.2%}'} for i, col in enumerate(sub_run_cols)]
data.append({'run': 'mean', 'accuracy': f'{df_sub_intent_runs["mean accuracy"].mean():.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,mean accuracy
0,LANGUAGES,90.38%,90.79%,89.12%,89.96%,90.38%,87.87%,93.33%,90.26%
1,SPORTS,82.76%,78.55%,80.28%,77.59%,83.45%,83.79%,82.76%,81.31%
2,COLLEGES AND UNIVERSITIES,81.88%,76.17%,79.87%,80.13%,78.79%,80.47%,78.45%,79.39%
3,ANIMALS,75.76%,70.45%,74.24%,79.92%,81.82%,78.79%,81.06%,77.44%
4,FIRST LADIES,76.19%,63.27%,76.03%,78.08%,80.82%,76.19%,69.39%,74.28%


Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,mean accuracy
87,SCULPTURE,7.37%,7.29%,2.11%,11.58%,3.16%,25.26%,13.68%,10.06%
88,FLOWERS,6.67%,10.00%,6.67%,18.33%,6.67%,16.67%,3.33%,9.76%
89,POTPOURRI,7.62%,10.37%,7.32%,8.23%,8.21%,5.17%,6.10%,7.57%
90,U.S. STATES,4.00%,8.06%,16.94%,2.42%,1.61%,11.29%,4.80%,7.02%
91,ROYALTY,5.21%,1.04%,1.04%,5.21%,5.21%,19.79%,9.38%,6.70%


Unnamed: 0,intent,mean accuracy
82,ARCHAEOLOGY,11.96%
83,TRAVEL AND TOURISM,11.56%
84,FOOD AND DRINK,11.31%
85,SIGNS AND SYMBOLS,10.62%
86,WORLD CITIES,10.59%
87,SCULPTURE,10.06%
88,FLOWERS,9.76%
89,POTPOURRI,7.57%
90,U.S. STATES,7.02%
91,ROYALTY,6.70%


Unnamed: 0,run,accuracy
0,0,40.29%
1,1,40.61%
2,2,40.86%
3,3,40.47%
4,4,40.17%
5,5,40.96%
6,6,41.72%
7,mean,40.73%
