# Filter classes by accuracy order - clinc 150 dataset
- clinc has 150 intents each with 150 examples
  - clinc has 3 datasets each with 150 intents
    - train (tn)
    - test (te)
    - val (va)
  - combining datasets get 150 examples for each intent
  - Pointer to dataset foun din Box note: 
    - https://ibm.ent.box.com/notes/643621749124
    - Intent Classification and Out-of-Scope Prediction
    - Original dataset
      - https://github.com/clinc/oos-eval/blob/master/data/data_full.json
- Object of this notebook is to Filter Classes
  - order classes by SVC class accuracy
  - drop most accurate classes
  - Get ballpark of 80% acuracy and 90 examples

In [1]:
import gzip
from abc import ABC, abstractmethod
from IPython.display import display, HTML
import json
import math
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import time
from typing import List

pd.options.display.max_colwidth = 100
pd.options.display.float_format = '{:,.2%}'.format

%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

#### Load workspace dataset

In [2]:
json_gzip_file = '../../../data/clinc150/data_full.json.gzip'
with gzip.open(json_gzip_file, 'r') as fin:
    data = json.loads(fin.read().decode('utf-8'))

datasets = {}
for k,v in data.items():
    datasets[k] = []
    for i in v:
        datasets[k].append({'intent': i[1], 'example':i[0]})

dfs = {}
for k,v in datasets.items():
    dfs[k] = pd.DataFrame(v)
#     print(f'{k} n_xmp {len(dfs[k])}')
#     print(f'{k} n_int {len(np.unique(dfs[k]["intent"].to_numpy()))}')
#     display(HTML(dfs[k].head(1).to_html()))

#### Verify intents are same in 3 datasets (train, test & val)

In [3]:
tr_intents = [v['intent'] for v in datasets['train']]
te_intents = [v['intent'] for v in datasets['test']]
va_intents = [v['intent'] for v in datasets['val']]
co_intents = tr_intents + te_intents + va_intents
print(f'tr unique = {len(np.unique(tr_intents))}')
print(f'te unique = {len(np.unique(te_intents))}')
print(f'va unique = {len(np.unique(va_intents))}')
print(f'co unique = {len(np.unique(co_intents))}')
print(f'tr min(count)  = {min(np.unique(tr_intents, return_counts=True)[1])}')
print(f'tr max(count)  = {max(np.unique(tr_intents, return_counts=True)[1])}')
print(f'te min(count)  = {min(np.unique(te_intents, return_counts=True)[1])}')
print(f'te max(count)  = {max(np.unique(te_intents, return_counts=True)[1])}')
print(f'va min(count)  = {min(np.unique(va_intents, return_counts=True)[1])}')
print(f'va max(count)  = {max(np.unique(va_intents, return_counts=True)[1])}')
print(f'co min(count)  = {min(np.unique(co_intents, return_counts=True)[1])}')
print(f'co max(count)  = {max(np.unique(co_intents, return_counts=True)[1])}')

tr unique = 150
te unique = 150
va unique = 150
co unique = 150
tr min(count)  = 100
tr max(count)  = 100
te min(count)  = 30
te max(count)  = 30
va min(count)  = 20
va max(count)  = 20
co min(count)  = 150
co max(count)  = 150


#### Extract the X and Y values for each of the datasets

In [4]:
x_tr = dfs['train']['example'].to_numpy()
x_te = dfs['test']['example'].to_numpy()
x_va = dfs['val']['example'].to_numpy()
y_tr = dfs['train']['intent'].to_numpy().ravel()
y_te = dfs['test']['intent'].to_numpy().ravel()
y_va = dfs['val']['intent'].to_numpy().ravel()
x_co = np.concatenate((x_tr, x_te, x_va))
y_co = np.concatenate((y_tr, y_te, y_va))

print(f'x_tr.shape  = {x_tr.shape}')
print(f'y_tr.shape  = {y_tr.shape}')
print(f'x_te.shape  = {x_te.shape}')
print(f'y_te.shape  = {y_te.shape}')
print(f'x_va.shape  = {x_va.shape}')
print(f'y_va.shape  = {y_va.shape}')
print(f'x_co.shape  = {x_co.shape}')
print(f'y_co.shape  = {y_co.shape}')

x_tr.shape  = (15000,)
y_tr.shape  = (15000,)
x_te.shape  = (4500,)
y_te.shape  = (4500,)
x_va.shape  = (3000,)
y_va.shape  = (3000,)
x_co.shape  = (22500,)
y_co.shape  = (22500,)


#### Encode with USE encoder

In [5]:
%%time
class MiniLMEmbedding:
    def __init__(self):
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    def encode(self, input_sentences: List[str]) -> np.array:
        sentences = [sentence.lower() for sentence in input_sentences]
        embedded_sentences = [self.embed_sentence(s) for s in sentences]
        return np.array(embedded_sentences)
    def embed_sentence(self, sentence: str) -> np.array:
        embedding = self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)
        return embedding

encoded_file = '../../../data/clinc150/x_co_encoded.csv'
if os.path.exists(encoded_file):
    df = pd.read_csv(encoded_file, header=None)
    x_co_encoded = df.to_numpy()
else:
    encoder = MiniLMEmbedding()

    start = time.time()
    x_tr_encoded = encoder.encode(x_tr)
    print(f'done tr dur={time.time() - start}')

    start = time.time()
    x_te_encoded = encoder.encode(x_te)
    print(f'done te dur={time.time() - start}')

    start = time.time()
    x_va_encoded = encoder.encode(x_va)
    print(f'done va dur={time.time() - start}')

    start = time.time()
    x_co_encoded = np.concatenate((x_tr_encoded, x_te_encoded, x_va_encoded))
    print(f'done co dur={time.time() - start}')
    start = time.time()

    print(f'x_tr_encoded.shape = {x_tr_encoded.shape}')
    print(f'x_te_encoded.shape = {x_te_encoded.shape}')
    print(f'x_va_encoded.shape = {x_va_encoded.shape}')

    # Save to file
    df = pd.DataFrame(x_co_encoded)
    df.to_csv(encoded_file, header=False, index=False)

print(f'x_co_encoded.shape = {x_co_encoded.shape}')

x_co_encoded.shape = (22500, 384)
CPU times: user 2 s, sys: 122 ms, total: 2.12 s
Wall time: 2.12 s


#### Run a cross validation on SVM classifiers
- Split the combined (x_co) dataset into 5 splits
- Each train (x_trn) is 4500 (22500/5)
- Each test (x_tst) is 18000 (22500 * 4/5)
- Score the class accuracy of each cross split

In [6]:
# aimtk (AI Metrics Toolkit) 
# Classes from aimtk which score class accuracy
class Metric(ABC):
    @abstractmethod
    def score(self, model, X_test, y_test, **kwargs):
        """Return the value of the metric with respect to the test data."""

    @abstractmethod
    def __str__(self):
        """Return a unique metric name."""

class PerClassAccuracyScoresMetric(Metric):

    def __init__(self):
        super().__init__()

    def score(self, model, X_test, y_test, class_name=None, **kwargs):
        # Use the confusion matrix to generate the accuracy per class
        from sklearn.metrics import confusion_matrix
        conf_mat = confusion_matrix(model.predict(X_test), y_test)
        acc_per_class = conf_mat.diagonal() / conf_mat.sum(axis=0)

        # Get uniques from test to get the labels
        unique, unique_indices = np.unique(y_test, return_index=True)

        # Change nan's into 0.  In the cases where there were no correct predictions
        acc_per_class = [0 if math.isnan(a) else a for a in acc_per_class]

        # Generate dictionary from classes and accuracy scores
        acc_scores = {y_test[ui]: a for ui, a in zip(unique_indices, acc_per_class)}

        # add any missing scores
        for c in model.classes_:
            if c not in acc_scores:
                acc_scores[c] = 0

        if class_name is None:
            return acc_scores
        else:
            return acc_scores[class_name]

    def __str__(self):
        return "PerClassAccuracy"

In [7]:
%%time
# cross validation split to gather class accuracies
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

runs = None
run = 0
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
for tst_index, trn_index in skf.split(x_co_encoded, y_co):
    x_trn = x_co_encoded[trn_index]
    x_tst = x_co_encoded[tst_index]
    y_trn = y_co[trn_index]
    y_tst = y_co[tst_index]
    start = time.time()
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'fit() dur={time.time() - start}')
    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    if runs == None:
        runs = {k: {'intent': k, f'run {run} accuracy': v} for k,v in scores.items()}
    else:
        for k,v in scores.items():
            runs[k][f'run {run} accuracy'] = v
    print(f'score()s dur={time.time() - start}')
    start = time.time()
    run += 1

fit() dur=32.769962787628174
score()s dur=58.239023208618164
fit() dur=34.74569034576416
score()s dur=63.00217938423157
fit() dur=32.43782377243042
score()s dur=65.9853847026825
fit() dur=32.066147327423096
score()s dur=61.907475233078
fit() dur=34.02885556221008
score()s dur=61.159531116485596
CPU times: user 7min 56s, sys: 242 ms, total: 7min 56s
Wall time: 7min 56s


#### Gather class accuracies

In [8]:
df_intent_runs = pd.DataFrame(runs.values())

# Add mean accuracy for each class across the runs
df_intent_runs['mean accuracy'] = df_intent_runs[[f'run {i} accuracy' for i in range(n_splits)]].mean(axis=1)

# Sort high to low class accuracies
df_intent_runs.sort_values(by=[f'mean accuracy', 'intent'], ascending=[False, True], inplace=True, ignore_index=True)

# Display the top and low accuracies
fmt = {f'run {i} accuracy': '{:,.2%}'.format for i in range(n_splits)}
fmt['mean accuracy'] = '{:,.2%}'.format
display(HTML(df_intent_runs.head().to_html(formatters=fmt)))
display(HTML(df_intent_runs.tail().to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy':'{:,.2%}'.format}
display(HTML(df_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run
data = [{'run': i, 'accuracy': f'{float(df_intent_runs[[f"run {i} accuracy"]].mean()):.2%}'} for i in range(n_splits)]
data.append({'run': 'mean', 'accuracy':  f'{float(df_intent_runs[["mean accuracy"]].mean()):.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,mean accuracy
0,routing,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%
1,vaccines,100.00%,100.00%,100.00%,100.00%,100.00%,100.00%
2,do_you_have_pets,100.00%,100.00%,99.17%,100.00%,100.00%,99.83%
3,lost_luggage,99.17%,100.00%,100.00%,100.00%,100.00%,99.83%
4,international_fees,100.00%,99.17%,99.17%,100.00%,100.00%,99.67%


Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,mean accuracy
145,change_user_name,70.00%,82.50%,90.83%,85.83%,71.67%,80.17%
146,shopping_list,76.67%,85.83%,85.00%,70.83%,74.17%,78.50%
147,pto_used,80.00%,69.17%,84.17%,80.00%,75.00%,77.67%
148,change_ai_name,86.67%,61.67%,74.17%,76.67%,80.83%,76.00%
149,todo_list,80.83%,80.83%,74.17%,60.00%,79.17%,75.00%


Unnamed: 0,intent,mean accuracy
140,make_call,84.67%
141,calendar,84.00%
142,todo_list_update,83.00%
143,ingredients_list,82.50%
144,goodbye,81.00%
145,change_user_name,80.17%
146,shopping_list,78.50%
147,pto_used,77.67%
148,change_ai_name,76.00%
149,todo_list,75.00%


Unnamed: 0,run,accuracy
0,0,93.77%
1,1,93.56%
2,2,93.69%
3,3,93.61%
4,4,93.49%
5,mean,93.62%


#### Determine which intents to keep and remove from clinc 150 dataset
- since the accuracies were so high, keep the lowest 90.

In [9]:
keep_count = 90

# Gather the classes to keep.
# Dump into a list which can easily be copied into another notebook
keep_intents = df_intent_runs[['intent']].tail(keep_count).to_numpy().flatten()
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep
keep_indices = [i for i in range(len(y_co)) if y_co[i] in keep_intents]
print(f'len(keep_indices) = {len(keep_indices)}')

# Gather the classes to be removed.
remove_intents = df_intent_runs[['intent']].head(len(df_intent_runs) - keep_count).to_numpy().flatten()
print(f'len(remove_intents) = {len(remove_intents)}')
print(f'remove_intents = {list(remove_intents)}')
# Gather the example indices for the examples to remove
remove_indices = [i for i in range(len(y_co)) if y_co[i] in remove_intents]
print(f'len(remove_indices) = {len(remove_indices)}')

len(keep_intents) = 90
keep_intents = ['next_holiday', 'report_lost_card', 'current_location', 'nutrition_info', 'min_payment', 'w2', 'text', 'mpg', 'repeat', 'smart_home', 'who_made_you', 'freeze_account', 'weather', 'payday', 'where_are_you_from', 'fun_fact', 'travel_suggestion', 'ingredient_substitution', 'balance', 'how_old_are_you', 'meaning_of_life', 'schedule_meeting', 'cancel_reservation', 'meeting_schedule', 'pto_request_status', 'accept_reservations', 'pto_request', 'card_declined', 'no', 'time', 'order_status', 'cancel', 'directions', 'income', 'maybe', 'schedule_maintenance', 'what_song', 'gas', 'account_blocked', 'confirm_reservation', 'next_song', 'credit_limit_change', 'restaurant_suggestion', 'what_are_your_hobbies', 'gas_type', 'how_busy', 'change_volume', 'restaurant_reviews', 'share_location', 'spelling', 'expiration_date', 'translate', 'who_do_you_work_for', 'timer', 'bill_due', 'restaurant_reservation', 'whisper_mode', 'pay_bill', 'user_name', 'meal_suggestion', 's

In [10]:
keep_count = 92

# Gather the classes to keep.
# Dump into a list which can easily be copied into another notebook
keep_intents = df_intent_runs[['intent']].tail(keep_count).to_numpy().flatten()
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep
keep_indices = [i for i in range(len(y_co)) if y_co[i] in keep_intents]
print(f'len(keep_indices) = {len(keep_indices)}')

# Gather the classes to be removed.
remove_intents = df_intent_runs[['intent']].head(len(df_intent_runs) - keep_count).to_numpy().flatten()
print(f'len(remove_intents) = {len(remove_intents)}')
print(f'remove_intents = {list(remove_intents)}')
# Gather the example indices for the examples to remove
remove_indices = [i for i in range(len(y_co)) if y_co[i] in remove_intents]
print(f'len(remove_indices) = {len(remove_indices)}')

len(keep_intents) = 92
keep_intents = ['credit_limit', 'food_last', 'next_holiday', 'report_lost_card', 'current_location', 'nutrition_info', 'min_payment', 'w2', 'text', 'mpg', 'repeat', 'smart_home', 'who_made_you', 'freeze_account', 'weather', 'payday', 'where_are_you_from', 'fun_fact', 'travel_suggestion', 'ingredient_substitution', 'balance', 'how_old_are_you', 'meaning_of_life', 'schedule_meeting', 'cancel_reservation', 'meeting_schedule', 'pto_request_status', 'accept_reservations', 'pto_request', 'card_declined', 'no', 'time', 'order_status', 'cancel', 'directions', 'income', 'maybe', 'schedule_maintenance', 'what_song', 'gas', 'account_blocked', 'confirm_reservation', 'next_song', 'credit_limit_change', 'restaurant_suggestion', 'what_are_your_hobbies', 'gas_type', 'how_busy', 'change_volume', 'restaurant_reviews', 'share_location', 'spelling', 'expiration_date', 'translate', 'who_do_you_work_for', 'timer', 'bill_due', 'restaurant_reservation', 'whisper_mode', 'pay_bill', 'user

#### Run a cross validation on the subset x and y to determine class accuracy

In [11]:
y_sub = y_co[keep_indices]
x_sub_encoded = x_co_encoded[keep_indices]
print(f'y_sub.shape         = {y_sub.shape}')
print(f'x_sub_encoded.shape = {x_sub_encoded.shape}')

y_sub.shape         = (13800,)
x_sub_encoded.shape = (13800, 384)


In [12]:
%%time
n_sub_splits = 3
skf = StratifiedKFold(n_splits=n_sub_splits, random_state=42, shuffle=True)

sub_runs = None
run = 0
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
for tst_index, trn_index in skf.split(x_sub_encoded, y_sub):
    x_trn = x_sub_encoded[trn_index]
    x_tst = x_sub_encoded[tst_index]
    y_trn = y_sub[trn_index]
    y_tst = y_sub[tst_index]
    start = time.time()
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'fit() dur={time.time() - start}')
    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    if sub_runs == None:
        sub_runs = {k: {'intent': k, f'run {run} accuracy': v} for k,v in scores.items()}
    else:
        for k,v in scores.items():
            sub_runs[k][f'run {run} accuracy'] = v
    print(f'score()s dur={time.time() - start}')
    start = time.time()
    run += 1

fit() dur=23.694291830062866
score()s dur=25.639363765716553
fit() dur=26.41112780570984
score()s dur=24.819358110427856
fit() dur=22.009212493896484
score()s dur=24.124150276184082
CPU times: user 2min 26s, sys: 46 ms, total: 2min 26s
Wall time: 2min 26s


#### Gather the class accuracies on the subset x and y

In [13]:
df_sub_intent_runs = pd.DataFrame(sub_runs.values())

# Add mean accuracy for each class across the runs
df_sub_intent_runs['mean accuracy'] = df_sub_intent_runs[[f'run {i} accuracy' for i in range(n_sub_splits)]].mean(axis=1)

# Sort high to low class accuracies
df_sub_intent_runs.sort_values(by=[f'mean accuracy', 'intent'], ascending=[False, True], inplace=True, ignore_index=True)

# Display the top and low accuracies
fmt = {f'run {i} accuracy': '{:,.2%}'.format for i in range(n_sub_splits)}
fmt['mean accuracy'] = '{:,.2%}'.format
display(HTML(df_sub_intent_runs.head().to_html(formatters=fmt)))
display(HTML(df_sub_intent_runs.tail().to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy':'{:,.2%}'.format}
display(HTML(df_sub_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run
data = [{'run': i, 'accuracy': f'{float(df_sub_intent_runs[[f"run {i} accuracy"]].mean()):.2%}'} for i in range(n_sub_splits)]
data.append({'run': 'mean', 'accuracy':  f'{float(df_sub_intent_runs[["mean accuracy"]].mean()):.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,mean accuracy
0,schedule_maintenance,100.00%,100.00%,100.00%,100.00%
1,nutrition_info,100.00%,97.00%,100.00%,99.00%
2,redeem_rewards,99.00%,99.00%,99.00%,99.00%
3,time,100.00%,97.00%,100.00%,99.00%
4,card_declined,98.00%,100.00%,98.00%,98.67%


Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,mean accuracy
87,pto_used,86.00%,88.00%,82.00%,85.33%
88,change_ai_name,88.00%,69.00%,90.00%,82.33%
89,todo_list,79.00%,85.00%,80.00%,81.33%
90,shopping_list,73.00%,74.00%,91.00%,79.33%
91,ingredients_list,72.00%,87.00%,76.00%,78.33%


Unnamed: 0,intent,mean accuracy
82,pto_balance,87.67%
83,reminder_update,87.67%
84,goodbye,87.00%
85,todo_list_update,86.33%
86,reminder,86.33%
87,pto_used,85.33%
88,change_ai_name,82.33%
89,todo_list,81.33%
90,shopping_list,79.33%
91,ingredients_list,78.33%


Unnamed: 0,run,accuracy
0,0,93.45%
1,1,93.51%
2,2,93.86%
3,mean,93.61%
