# Cross validation of the clinc150 dataset
- Determine if the clinc150 dataset can be used to mimic what is seen in the representative dataset when running Performance Predictor
- clinc has 150 intents (need 92 to mimic representative dataset)
- clinc has 3 datasets with 150 intents
  - train (tn)
  - test (te)
  - val (va)
- clinc also has 3 oos datasets but they only have 1 intent so they are ignored
- run a cross validation on the combined (co) dataset to get the accuracies on a trained svm classifier.
- check the accuracies across the splits to see if it is consistent.

In [1]:
import gzip
from IPython.display import display, HTML
import json
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import time
from typing import List

# Show up to 100 characters per column value when DataFrames are rendered
pd.options.display.max_colwidth = 100

# Reload imported modules automatically before each cell executes
%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

#### Load workspace dataset

In [2]:
# Relative path of the gzip-compressed JSON file holding all clinc150 splits.
json_gzip_file = '../../../data/clinc150/data_full.json.gzip'

# Open in text mode so the UTF-8 decoding happens in the file object and
# json can parse straight from the stream.
with gzip.open(json_gzip_file, 'rt', encoding='utf-8') as fin:
    data = json.load(fin)

In [3]:
# For every split, turn each item (element 0 = example text, element 1 = intent)
# into a record with explicit 'intent' and 'example' keys.
datasets = {
    split_name: [{'intent': item[1], 'example': item[0]} for item in items]
    for split_name, items in data.items()
}

In [4]:
# One DataFrame per split, with a short summary of each:
# number of examples, number of distinct intents, and the first row.
dfs = {}
for name, records in datasets.items():
    frame = pd.DataFrame(records)
    dfs[name] = frame
    print(f'{name} n_xmp {len(frame)}')
    n_intents = len(np.unique(frame["intent"].to_numpy()))
    print(f'{name} n_int {n_intents}')
    display(HTML(frame.head(1).to_html()))

oos_val n_xmp 100
oos_val n_int 1


Unnamed: 0,intent,example
0,oos,set a warning for when my bank account starts running low


val n_xmp 3000
val n_int 150


Unnamed: 0,intent,example
0,translate,"in spanish, meet me tomorrow is said how"


train n_xmp 15000
train n_int 150


Unnamed: 0,intent,example
0,translate,what expression would i use to say i love you if i were an italian


oos_test n_xmp 1000
oos_test n_int 1


Unnamed: 0,intent,example
0,oos,how much has the dow changed today


test n_xmp 4500
test n_int 150


Unnamed: 0,intent,example
0,translate,how would you say fly in italian


oos_train n_xmp 100
oos_train n_int 1


Unnamed: 0,intent,example
0,oos,how much is an overdraft fee for bank


#### Verify intents are same in 3 datasets (train, test & val)

In [5]:
# Intent label of every example in each original split, plus all three combined.
tr_intents = [record['intent'] for record in datasets['train']]
te_intents = [record['intent'] for record in datasets['test']]
va_intents = [record['intent'] for record in datasets['val']]
co_intents = tr_intents + te_intents + va_intents

intents_by_tag = (
    ('tr', tr_intents),
    ('te', te_intents),
    ('va', va_intents),
    ('co', co_intents),
)

# Number of distinct intents per set.
for tag, intents in intents_by_tag:
    print(f'{tag} unique = {len(np.unique(intents))}')

# Smallest and largest number of examples for any single intent, per set.
for tag, intents in intents_by_tag:
    counts = np.unique(intents, return_counts=True)[1]
    print(f'{tag} min(count)  = {min(counts)}')
    print(f'{tag} max(count)  = {max(counts)}')

tr unique = 150
te unique = 150
va unique = 150
co unique = 150
tr min(count)  = 100
tr max(count)  = 100
te min(count)  = 30
te max(count)  = 30
va min(count)  = 20
va max(count)  = 20
co min(count)  = 150
co max(count)  = 150


#### Extract the X and Y values for each of the datasets

In [6]:
# Example text (x) and intent label (y) of each original split as flat numpy arrays.
x_tr, y_tr = dfs['train']['example'].to_numpy(), dfs['train']['intent'].to_numpy().ravel()
x_te, y_te = dfs['test']['example'].to_numpy(), dfs['test']['intent'].to_numpy().ravel()
x_va, y_va = dfs['val']['example'].to_numpy(), dfs['val']['intent'].to_numpy().ravel()

# Combined set, always in train, test, val order; the cached-embedding slicing
# further down relies on this order.
x_co = np.concatenate((x_tr, x_te, x_va))
y_co = np.concatenate((y_tr, y_te, y_va))

shape_report = (
    ('x_tr', x_tr), ('y_tr', y_tr),
    ('x_te', x_te), ('y_te', y_te),
    ('x_va', x_va), ('y_va', y_va),
    ('x_co', x_co), ('y_co', y_co),
)
for label, values in shape_report:
    print(f'{label}.shape  = {values.shape}')

x_tr.shape  = (15000,)
y_tr.shape  = (15000,)
x_te.shape  = (4500,)
y_te.shape  = (4500,)
x_va.shape  = (3000,)
y_va.shape  = (3000,)
x_co.shape  = (22500,)
y_co.shape  = (22500,)


#### Encode with MiniLM sentence encoder (`paraphrase-MiniLM-L6-v2`)

In [7]:
%%time
class MiniLMEmbedding:
    """Sentence encoder backed by the ``paraphrase-MiniLM-L6-v2`` SentenceTransformer."""

    def __init__(self):
        # Creates the SentenceTransformer for the named pre-trained model.
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def encode(self, input_sentences: List[str]) -> np.ndarray:
        """Lower-case every sentence, embed each one and stack the embeddings.

        Args:
            input_sentences: Sentences to embed.

        Returns:
            Array holding one embedding per input sentence, in input order.
        """
        lowered = [sentence.lower() for sentence in input_sentences]
        # Sentences are embedded one at a time through embed_sentence().
        return np.array([self.embed_sentence(sentence) for sentence in lowered])

    def embed_sentence(self, sentence: str) -> np.ndarray:
        """Return the transformer's embedding of a single sentence as a numpy array."""
        return self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)

# Build the encoder exactly once. It is only used for encoding on a cache miss,
# but the name stays defined at notebook level on both paths.
encoder = MiniLMEmbedding()

encoded_file = '../../../data/clinc150/x_co_encoded.csv'
if os.path.exists(encoded_file):
    # Cache hit: load the combined embeddings written by an earlier run.
    df = pd.read_csv(encoded_file, header=None)
    x_co_encoded = df.to_numpy()

    # The cache is only usable when it holds one row per loaded example;
    # otherwise the slices below would silently misalign features and labels.
    expected_rows = len(x_tr) + len(x_te) + len(x_va)
    if len(x_co_encoded) != expected_rows:
        raise ValueError(
            f'{encoded_file} has {len(x_co_encoded)} rows but {expected_rows} examples '
            'are loaded; delete the file to regenerate the embeddings'
        )

    # Rows are stored in train, test, val order, so slice them back out that way.
    x_tr_encoded = x_co_encoded[:len(x_tr)]
    x_te_encoded = x_co_encoded[len(x_tr):len(x_tr) + len(x_te)]
    x_va_encoded = x_co_encoded[len(x_tr) + len(x_te):]
else:
    # Cache miss: encode each split, timing every step.
    start = time.time()
    x_tr_encoded = encoder.encode(x_tr)
    print(f'done tr dur={time.time() - start}')

    start = time.time()
    x_te_encoded = encoder.encode(x_te)
    print(f'done te dur={time.time() - start}')

    start = time.time()
    x_va_encoded = encoder.encode(x_va)
    print(f'done va dur={time.time() - start}')

    # Same train, test, val order as x_co / y_co.
    start = time.time()
    x_co_encoded = np.concatenate((x_tr_encoded, x_te_encoded, x_va_encoded))
    print(f'done co dur={time.time() - start}')

    # Save to file so later runs can skip the encoding
    df = pd.DataFrame(x_co_encoded)
    df.to_csv(encoded_file, header=False, index=False)

print(f'x_tr_encoded.shape = {x_tr_encoded.shape}')
print(f'x_te_encoded.shape = {x_te_encoded.shape}')
print(f'x_va_encoded.shape = {x_va_encoded.shape}')
print(f'x_co_encoded.shape = {x_co_encoded.shape}')

x_tr_encoded.shape = (15000, 384)
x_te_encoded.shape = (4500, 384)
x_va_encoded.shape = (3000, 384)
x_co_encoded.shape = (22500, 384)
CPU times: user 4.45 s, sys: 178 ms, total: 4.63 s
Wall time: 3.04 s


#### Run a cross validation on SVM classifiers
- Split the combined (x_co) dataset into 5 splits
- Each train (x_trn) is 4500 (22500/5)
- Each test (x_tst) is 18000 (22500 * 4/5)
- Score the accuracy of each cross split
  - Normally you'd test against the test of each split (x_tst)
  - But in this case test against each dataset 
     - split train (x_trn)
     - split test (x_tst)
     - original train (x_tr)
     - original test (x_te)
     - original val (x_va)
     - original combined (x_co)
  - This is done to see if any of the datasets have problems, e.g. one has a very low score compared to the others.
  - split train (x_trn) has a high accuracy 99%
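  - Other datasets have similar accuracies 93% to 95%. The original and combined datasets overlap with the split train (x_trn), which is why their accuracies sit slightly above the split test (x_tst) accuracy.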

In [8]:
%%time
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

runs = []
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
# (split() yields the larger index set first, so it is deliberately bound to tst_index)
for run, (tst_index, trn_index) in enumerate(skf.split(x_co_encoded, y_co)):
    x_trn, y_trn = x_co_encoded[trn_index], y_co[trn_index]
    x_tst, y_tst = x_co_encoded[tst_index], y_co[tst_index]

    start = time.time()
    # NOTE(review): only score() is called on the model in this loop;
    # probability=True is kept as-is - confirm whether probability estimates are needed elsewhere.
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'run {run} fit() dur={time.time() - start}')

    # Every dataset the fitted model is scored against, in reporting order:
    # the two parts of this split, then the original train/test/val and the combined set.
    eval_sets = {
        'trn_acc': (x_trn, y_trn),
        'tst_acc': (x_tst, y_tst),
        'tr_acc':  (x_tr_encoded, y_tr),
        'te_acc':  (x_te_encoded, y_te),
        'va_acc':  (x_va_encoded, y_va),
        'co_acc':  (x_co_encoded, y_co),
    }

    start = time.time()
    record = {'run': run}
    for acc_name, (features, labels) in eval_sets.items():
        record[acc_name] = model.score(features, labels)
    runs.append(record)
    print(f'score()s    dur={time.time() - start}')

run 0 fit() dur=33.82995367050171
score()s    dur=226.800518989563
run 1 fit() dur=31.592586040496826
score()s    dur=216.28667426109314
run 2 fit() dur=29.467215538024902
score()s    dur=223.49970602989197
run 3 fit() dur=28.677660942077637
score()s    dur=209.384206533432
run 4 fit() dur=28.410287380218506
score()s    dur=209.28026461601257
CPU times: user 20min 37s, sys: 326 ms, total: 20min 37s
Wall time: 20min 37s


In [9]:
# Show the per-run accuracies as an HTML table, each accuracy as a whole-number percentage.
pct_fmt = '{:,.0%}'.format
acc_columns = ('trn_acc', 'tst_acc', 'tr_acc', 'te_acc', 'va_acc', 'co_acc')
fmt = dict.fromkeys(acc_columns, pct_fmt)

runs_html = pd.DataFrame(runs).to_html(formatters=fmt)
display(HTML(runs_html))

Unnamed: 0,run,trn_acc,tst_acc,tr_acc,te_acc,va_acc,co_acc
0,0,99%,94%,95%,95%,95%,95%
1,1,99%,94%,95%,94%,95%,95%
2,2,99%,94%,95%,94%,95%,95%
3,3,99%,94%,95%,95%,94%,95%
4,4,99%,93%,95%,94%,94%,95%
