# Filter classes by accuracy order - Web Of Science dataset
- Web Of Science has 3 datasets:
  - Web of Science Dataset WOS-11967
    - This dataset contains 11,967 documents with 35 categories which include 7 parents categories.
  - Web of Science Dataset WOS-46985
    - This dataset contains 46,985 documents with 134 categories which include 7 parents categories.
  - Web of Science Dataset WOS-5736
    - This dataset contains 5,736 documents with 11 categories which include 3 parents categories.
  - Pointer to dataset found in Box note: 
    - https://ibm.ent.box.com/notes/643621749124
    - Web of Science dataset
    - Original datasets can be found at
      - https://data.mendeley.com/datasets/9rw3vkcfy4/2
  - Use WOS-46985 dataset
- The objective of this notebook is to filter classes
  - order classes by SVC class accuracy
  - drop most accurate classes
  - Get a ballpark of 80% accuracy and 90 examples

In [1]:
import gzip
from abc import ABC, abstractmethod
from IPython.display import display, HTML
import json
import math
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import time
from typing import List

# Limit the displayed width of DataFrame columns to 100 characters
pd.options.display.max_colwidth = 100
# Render every float in DataFrame output as a percentage with two decimals
pd.options.display.float_format = '{:,.2%}'.format

# IPython autoreload extension: mode 2 reloads imported modules before running code
%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

#### Load workspace dataset

In [2]:
%%time
# X   is the input data (one text sequence per line)
# Y   is the target value
# YL1 is the target value of level one (parent label)
# YL2 is the target value of level two (child label)
x_gzip_file = '../../../data/WebOfScience/WebOfScience/WOS46985/X.txt.gzip'
y_file = '../../../data/WebOfScience/WebOfScience/WOS46985/Y.txt'
yl1_file = '../../../data/WebOfScience/WebOfScience/WOS46985/YL1.txt'
yl2_file = '../../../data/WebOfScience/WebOfScience/WOS46985/YL2.txt'

# Read the gzip-compressed examples in text mode, one example per line.
# readlines() keeps the trailing newline of every example.
with gzip.open(x_gzip_file, 'rt') as f:
    lines = f.readlines()
df_x = pd.DataFrame(lines, columns=['example'])
df_y = pd.read_csv(y_file, header=None, names=['intent'])
df_yl1 = pd.read_csv(yl1_file, header=None, names=['yl1'])
df_yl2 = pd.read_csv(yl2_file, header=None, names=['yl2'])

# Summarise each label column: number of classes plus the smallest and
# largest class size.  np.unique is evaluated once per column.
data = []
for label_name, label_values in (('y', df_y['intent']), ('yl1', df_yl1['yl1'])):
    label_classes, label_counts = np.unique(label_values, return_counts=True)
    data.append({'intent': label_name,
                 'n unique': len(label_classes),
                 'min n uniq': min(label_counts),
                 'max n uniq': max(label_counts),
                 })
display(HTML(pd.DataFrame(data).to_html()))

# Put examples and their labels side by side (rows are aligned by position)
df_merge = pd.concat([df_x, df_y], axis=1, sort=False)
print(f'df_merge.shape = {df_merge.shape}')

x = df_merge['example'].to_numpy()
y = df_merge['intent'].to_numpy().ravel()
print(f'x.shape        = {x.shape}')
print(f'y.shape        = {y.shape}')

# display(HTML(df_merge.head(4).to_html()))

Unnamed: 0,intent,n unique,min n uniq,max n uniq
0,y,134,43,750
1,yl1,7,3297,14625


df_merge.shape = (46985, 2)
x.shape        = (46985,)
y.shape        = (46985,)
CPU times: user 210 ms, sys: 68.2 ms, total: 278 ms
Wall time: 305 ms


#### Encode with MiniLM sentence encoder (paraphrase-MiniLM-L6-v2)

In [3]:
%%time
class MiniLMEmbedding:
    """Sentence encoder backed by the 'paraphrase-MiniLM-L6-v2' SentenceTransformer."""

    def __init__(self):
        # Instantiate the sentence-transformers model by name
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def encode(self, input_sentences: List[str]) -> np.ndarray:
        """Lower-case every sentence, embed it and stack the embeddings.

        Args:
            input_sentences: the sentences to embed.

        Returns:
            An array with one row per input sentence, in input order.
        """
        # Sentences are embedded one at a time through embed_sentence()
        return np.array([self.embed_sentence(sentence.lower()) for sentence in input_sentences])

    def embed_sentence(self, sentence: str) -> np.ndarray:
        """Return the transformer embedding of a single sentence as a numpy array."""
        return self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)

# Cached sentence embeddings: one CSV row per example, no header row
encoded_file = '../../../data/WebOfScience/WebOfScience/WOS46985/X_encoded.csv'
if not os.path.exists(encoded_file):
    # No cache yet: embed every example, then persist the matrix for later runs
    encoder = MiniLMEmbedding()
    x_encoded = encoder.encode(x)
    df = pd.DataFrame(x_encoded)
    df.to_csv(encoded_file, header=False, index=False)
else:
    # Reuse the embeddings written by an earlier run
    df = pd.read_csv(encoded_file, header=None)
    x_encoded = df.to_numpy()

print(f'x_encoded.shape = {x_encoded.shape}')

x_encoded.shape = (46985, 384)
CPU times: user 4.84 s, sys: 217 ms, total: 5.06 s
Wall time: 5.06 s


#### Run a cross validation on SVM classifiers
- Split the combined (x_encoded) dataset into 10 splits
  - Each train (x_trn) is 4699 (46,985/10)
  - Each test (x_tst) is 42,287 (46,985 * 9/10)
- Score the class accuracy of each cross split

In [4]:
# aimtk (AI Metrics Toolkit) 
# Classes from aimtk which score class accuracy
class Metric(ABC):
    """Interface for a metric that scores a fitted model on test data."""

    @abstractmethod
    def score(self, model, X_test, y_test, **kwargs):
        """Return the value of the metric with respect to the test data."""

    @abstractmethod
    def __str__(self):
        """Return a unique metric name."""


class PerClassAccuracyScoresMetric(Metric):
    """Per-class accuracy: the fraction of each true class that is predicted correctly."""

    def __init__(self):
        super().__init__()

    def score(self, model, X_test, y_test, class_name=None, **kwargs):
        """Score the model's predictions separately for every class.

        Args:
            model: object exposing ``predict(X_test)`` and ``classes_``.
            X_test: test inputs, handed to ``model.predict`` unchanged.
            y_test: true labels, aligned one-to-one with the predictions.
            class_name: when given, return only that class's score.

        Returns:
            A dict mapping class label to accuracy when ``class_name`` is
            None, otherwise the accuracy of ``class_name``.  Classes in
            ``model.classes_`` that do not occur in ``y_test`` score 0.

        Raises:
            KeyError: if ``class_name`` is neither in ``y_test`` nor in
                ``model.classes_``.
        """
        y_true = np.asarray(y_test)
        y_pred = np.asarray(model.predict(X_test))
        is_correct = y_pred == y_true

        # Score each true label directly from its own examples.  This keeps
        # every accuracy attached to the right label even when the model
        # predicts a class that never occurs in y_test.
        acc_scores = {
            label: float(is_correct[y_true == label].mean())
            for label in np.unique(y_true)
        }

        # add any missing scores
        for c in model.classes_:
            if c not in acc_scores:
                acc_scores[c] = 0

        if class_name is None:
            return acc_scores
        return acc_scores[class_name]

    def __str__(self):
        return "PerClassAccuracy"

In [5]:
%%time
# cross validation split to gather class accuracies
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

# Per-class results keyed by class label:
#   {label: {'intent': label, 'run 0 accuracy': ..., 'run 1 accuracy': ..., ...}}
runs = {}
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
# (split() yields (train, test) index arrays; they are unpacked swapped on purpose)
for run, (tst_index, trn_index) in enumerate(skf.split(x_encoded, y)):
    x_trn = x_encoded[trn_index]
    x_tst = x_encoded[tst_index]
    y_trn = y[trn_index]
    y_tst = y[tst_index]

    start = time.time()
    # NOTE(review): only predict() is used on this model in this cell;
    # probability=True slows fit() - confirm whether it is still needed.
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'run={run} fit()   dur={time.time() - start}')

    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    for k, v in scores.items():
        # setdefault creates the row the first time a class is seen
        runs.setdefault(k, {'intent': k})[f'run {run} accuracy'] = v
    print(f'run={run} score() dur={time.time() - start}')

run=0 fit()   dur=71.295095205307
run=0 score() dur=426.88230562210083
run=1 fit()   dur=129.07576417922974
run=1 score() dur=238.8796887397766
run=2 fit()   dur=50.97539305686951
run=2 score() dur=213.93910670280457
run=3 fit()   dur=50.300012826919556
run=3 score() dur=239.77062106132507
run=4 fit()   dur=50.33560538291931
run=4 score() dur=206.83106327056885
run=5 fit()   dur=47.37153077125549
run=5 score() dur=212.40157532691956
run=6 fit()   dur=50.82289099693298
run=6 score() dur=183.28121161460876
run=7 fit()   dur=57.097898960113525
run=7 score() dur=231.46591663360596
run=8 fit()   dur=66.0179328918457
run=8 score() dur=188.16804909706116
run=9 fit()   dur=52.358171224594116
run=9 score() dur=209.29464173316956
CPU times: user 43min 27s, sys: 2.77 s, total: 43min 30s
Wall time: 49min 37s


#### Gather class accuracies

In [6]:
# One row per class, one column per cross validation run
df_intent_runs = pd.DataFrame(runs.values())
run_columns = [f'run {i} accuracy' for i in range(n_splits)]

# Add mean accuracy for each class across the runs
df_intent_runs['mean accuracy'] = df_intent_runs[run_columns].mean(axis=1)

# Sort high to low class accuracies (ties broken by ascending class label)
df_intent_runs.sort_values(by=['mean accuracy', 'intent'], ascending=[False, True], inplace=True, ignore_index=True)

# Display the top and low accuracies
pct = '{:,.2%}'.format
fmt = {column: pct for column in run_columns}
fmt['mean accuracy'] = pct
display(HTML(df_intent_runs.head().to_html(formatters=fmt)))
display(HTML(df_intent_runs.tail().to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy': pct}
display(HTML(df_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run.
# Series.mean() already yields a scalar, so no float() over a one-element
# Series is needed (that conversion is deprecated in pandas).
data = [{'run': i, 'accuracy': f'{df_intent_runs[column].mean():.2%}'} for i, column in enumerate(run_columns)]
data.append({'run': 'mean', 'accuracy': f'{df_intent_runs["mean accuracy"].mean():.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,run 7 accuracy,run 8 accuracy,run 9 accuracy,mean accuracy
0,64,80.35%,85.89%,74.56%,82.12%,81.61%,75.31%,85.14%,82.83%,83.63%,80.10%,81.15%
1,122,81.46%,78.96%,80.49%,84.45%,80.18%,81.10%,75.08%,82.98%,79.03%,76.29%,80.00%
2,12,79.78%,77.56%,78.73%,83.43%,83.70%,76.24%,82.60%,78.73%,76.80%,81.49%,79.91%
3,113,80.72%,82.03%,79.74%,80.07%,78.76%,79.74%,72.88%,82.68%,74.84%,75.16%,78.66%
4,62,76.84%,71.84%,72.63%,79.79%,80.31%,77.95%,70.60%,81.36%,79.79%,75.07%,76.62%


Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,run 7 accuracy,run 8 accuracy,run 9 accuracy,mean accuracy
129,120,2.83%,3.77%,0.00%,0.94%,1.87%,1.87%,1.89%,0.00%,0.00%,4.72%,1.79%
130,76,0.42%,0.00%,5.04%,1.68%,0.42%,2.11%,2.53%,0.84%,1.68%,0.42%,1.52%
131,18,0.00%,2.56%,0.00%,0.00%,2.50%,0.00%,0.00%,0.00%,0.00%,2.56%,0.76%
132,11,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
133,20,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%


Unnamed: 0,intent,mean accuracy
124,50,6.79%
125,123,4.69%
126,107,4.43%
127,16,2.65%
128,91,2.40%
129,120,1.79%
130,76,1.52%
131,18,0.76%
132,11,0.00%
133,20,0.00%


Unnamed: 0,run,accuracy
0,0,45.68%
1,1,44.98%
2,2,45.49%
3,3,45.63%
4,4,45.89%
5,5,45.31%
6,6,45.14%
7,7,45.66%
8,8,45.59%
9,9,45.02%


#### Determine which intents to keep and remove from the Web of Science (WOS-46985) dataset
- The representative dataset needs 92 classes, so get a keep list of the 92 highest-accuracy classes.
- since the accuracies were so low, keep the highest 90.

In [7]:
keep_count = 90

# Gather the classes to keep: the first keep_count rows of df_intent_runs.
# Dump into a list which can easily be copied into another notebook
keep_intents = df_intent_runs['intent'].head(keep_count).to_numpy()
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep.
# np.isin tests all labels at once instead of scanning keep_intents per example.
keep_indices = np.flatnonzero(np.isin(y, keep_intents)).tolist()
print(f'len(keep_indices) = {len(keep_indices)}')

# Gather the classes to be removed: every row after the first keep_count.
remove_intents = df_intent_runs['intent'].iloc[keep_count:].to_numpy()
print(f'len(remove_intents) = {len(remove_intents)}')
print(f'remove_intents = {list(remove_intents)}')
# Gather the example indices for the examples to remove
remove_indices = np.flatnonzero(np.isin(y, remove_intents)).tolist()
print(f'len(remove_indices) = {len(remove_indices)}')

len(keep_intents) = 90
keep_intents = [64, 122, 12, 113, 62, 49, 66, 2, 68, 45, 103, 97, 70, 48, 115, 98, 3, 57, 61, 8, 74, 47, 127, 112, 65, 31, 99, 9, 79, 114, 35, 63, 111, 94, 101, 92, 46, 100, 69, 93, 96, 42, 25, 60, 39, 106, 121, 44, 33, 109, 14, 130, 81, 53, 17, 58, 71, 132, 80, 0, 83, 37, 55, 90, 85, 32, 75, 105, 22, 38, 56, 41, 128, 5, 21, 84, 43, 54, 36, 77, 27, 131, 72, 73, 118, 7, 108, 23, 26, 124]
len(keep_indices) = 33620
len(remove_intents) = 44
remove_intents = [129, 10, 133, 40, 87, 4, 95, 67, 34, 126, 78, 1, 15, 52, 6, 29, 86, 30, 24, 104, 82, 88, 117, 125, 19, 51, 13, 110, 59, 102, 28, 116, 89, 119, 50, 123, 107, 16, 91, 120, 76, 18, 11, 20]
len(remove_indices) = 13365


In [8]:
keep_count = 92

# Gather the classes to keep: the first keep_count rows of df_intent_runs.
# Dump into a list which can easily be copied into another notebook
keep_intents = df_intent_runs['intent'].head(keep_count).to_numpy()
print(f'len(keep_intents) = {len(keep_intents)}')
print(f'keep_intents = {list(keep_intents)}')
# Gather the example indices for the examples to keep.
# np.isin tests all labels at once instead of scanning keep_intents per example.
keep_indices = np.flatnonzero(np.isin(y, keep_intents)).tolist()
print(f'len(keep_indices) = {len(keep_indices)}')

# Gather the classes to be removed: every row after the first keep_count.
remove_intents = df_intent_runs['intent'].iloc[keep_count:].to_numpy()
print(f'len(remove_intents) = {len(remove_intents)}')
print(f'remove_intents = {list(remove_intents)}')
# Gather the example indices for the examples to remove
remove_indices = np.flatnonzero(np.isin(y, remove_intents)).tolist()
print(f'len(remove_indices) = {len(remove_indices)}')

len(keep_intents) = 92
keep_intents = [64, 122, 12, 113, 62, 49, 66, 2, 68, 45, 103, 97, 70, 48, 115, 98, 3, 57, 61, 8, 74, 47, 127, 112, 65, 31, 99, 9, 79, 114, 35, 63, 111, 94, 101, 92, 46, 100, 69, 93, 96, 42, 25, 60, 39, 106, 121, 44, 33, 109, 14, 130, 81, 53, 17, 58, 71, 132, 80, 0, 83, 37, 55, 90, 85, 32, 75, 105, 22, 38, 56, 41, 128, 5, 21, 84, 43, 54, 36, 77, 27, 131, 72, 73, 118, 7, 108, 23, 26, 124, 129, 10]
len(keep_indices) = 34589
len(remove_intents) = 42
remove_intents = [133, 40, 87, 4, 95, 67, 34, 126, 78, 1, 15, 52, 6, 29, 86, 30, 24, 104, 82, 88, 117, 125, 19, 51, 13, 110, 59, 102, 28, 116, 89, 119, 50, 123, 107, 16, 91, 120, 76, 18, 11, 20]
len(remove_indices) = 12396


#### Run a cross validation on the subset x and y to determine class accuracy

In [9]:
# Restrict the encoded examples and their labels to the kept rows
x_sub_encoded, y_sub = x_encoded[keep_indices], y[keep_indices]
print(f'y_sub.shape         = {y_sub.shape}')
print(f'x_sub_encoded.shape = {x_sub_encoded.shape}')

y_sub.shape         = (34589,)
x_sub_encoded.shape = (34589, 384)


In [10]:
%%time
n_sub_splits = 7
skf = StratifiedKFold(n_splits=n_sub_splits, random_state=42, shuffle=True)

# Per-class results keyed by class label:
#   {label: {'intent': label, 'run 0 accuracy': ..., 'run 1 accuracy': ..., ...}}
sub_runs = {}
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
# (split() yields (train, test) index arrays; they are unpacked swapped on purpose)
for run, (tst_index, trn_index) in enumerate(skf.split(x_sub_encoded, y_sub)):
    x_trn = x_sub_encoded[trn_index]
    x_tst = x_sub_encoded[tst_index]
    y_trn = y_sub[trn_index]
    y_tst = y_sub[tst_index]

    start = time.time()
    # NOTE(review): only predict() is used on this model in this cell;
    # probability=True slows fit() - confirm whether it is still needed.
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'fit() dur={time.time() - start}')

    start = time.time()
    scores = PerClassAccuracyScoresMetric().score(model, x_tst, y_tst)
    for k, v in scores.items():
        # setdefault creates the row the first time a class is seen
        sub_runs.setdefault(k, {'intent': k})[f'run {run} accuracy'] = v
    print(f'score()s dur={time.time() - start}')

fit() dur=44.410767793655396
score()s dur=129.6825933456421
fit() dur=59.73698925971985
score()s dur=132.39018630981445
fit() dur=61.139333963394165
score()s dur=150.74867010116577
fit() dur=52.81402921676636
score()s dur=119.25608777999878
fit() dur=56.5437285900116
score()s dur=107.52755570411682
fit() dur=46.233163833618164
score()s dur=128.40847158432007
fit() dur=50.405569076538086
score()s dur=123.30989956855774
CPU times: user 19min 12s, sys: 987 ms, total: 19min 13s
Wall time: 21min 2s


#### Gather the class accuracies on the subset x and y

In [11]:
# One row per class, one column per cross validation run
df_sub_intent_runs = pd.DataFrame(sub_runs.values())
sub_run_columns = [f'run {i} accuracy' for i in range(n_sub_splits)]

# Add mean accuracy for each class across the runs
df_sub_intent_runs['mean accuracy'] = df_sub_intent_runs[sub_run_columns].mean(axis=1)

# Sort high to low class accuracies (ties broken by ascending class label)
df_sub_intent_runs.sort_values(by=['mean accuracy', 'intent'], ascending=[False, True], inplace=True, ignore_index=True)

# Display the top and low accuracies
pct = '{:,.2%}'.format
fmt = {column: pct for column in sub_run_columns}
fmt['mean accuracy'] = pct
display(HTML(df_sub_intent_runs.head().to_html(formatters=fmt)))
display(HTML(df_sub_intent_runs.tail().to_html(formatters=fmt)))

# Print out the bottom 10 accuracies (only display the class means)
fmt = {'mean accuracy': pct}
display(HTML(df_sub_intent_runs[['intent', 'mean accuracy']].tail(10).to_html(formatters=fmt)))

# Display the mean accuracies for each run.
# Series.mean() already yields a scalar, so no float() over a one-element
# Series is needed (that conversion is deprecated in pandas).
data = [{'run': i, 'accuracy': f'{df_sub_intent_runs[column].mean():.2%}'} for i, column in enumerate(sub_run_columns)]
data.append({'run': 'mean', 'accuracy': f'{df_sub_intent_runs["mean accuracy"].mean():.2%}'})
display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,mean accuracy
0,12,82.85%,79.07%,84.30%,84.93%,82.03%,80.00%,83.48%,82.38%
1,113,85.27%,78.77%,80.14%,85.22%,80.07%,83.85%,80.41%,81.96%
2,64,82.28%,84.39%,78.31%,79.63%,77.78%,85.98%,82.01%,81.48%
3,49,77.17%,83.60%,80.65%,75.16%,80.32%,83.87%,82.26%,80.43%
4,122,82.69%,78.59%,77.64%,78.27%,81.47%,78.59%,81.15%,79.77%


Unnamed: 0,intent,run 0 accuracy,run 1 accuracy,run 2 accuracy,run 3 accuracy,run 4 accuracy,run 5 accuracy,run 6 accuracy,mean accuracy
87,72,46.42%,48.30%,47.73%,36.60%,49.81%,47.92%,47.92%,46.39%
88,108,48.85%,44.70%,43.32%,49.77%,42.86%,49.31%,43.98%,46.11%
89,73,46.08%,46.73%,48.69%,46.73%,40.20%,41.50%,44.77%,44.96%
90,77,44.12%,44.85%,44.49%,39.71%,48.53%,48.34%,42.44%,44.64%
91,26,44.44%,44.44%,44.44%,47.84%,42.90%,43.21%,42.28%,44.22%


Unnamed: 0,intent,mean accuracy
82,129,49.85%
83,84,49.57%
84,27,49.11%
85,23,48.26%
86,10,47.77%
87,72,46.39%
88,108,46.11%
89,73,44.96%
90,77,44.64%
91,26,44.22%


Unnamed: 0,run,accuracy
0,0,62.18%
1,1,62.45%
2,2,62.71%
3,3,61.93%
4,4,62.38%
5,5,62.28%
6,6,62.62%
7,mean,62.37%
