# Cross validation of the Jeopardy dataset
- Determine if the Jeopardy dataset can be used similar to mimic what is seen in representative dataset when running Performance Predictor

In [7]:
import collections
import gzip
from IPython.display import display, HTML
import json
import numpy as np
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import time
from typing import List

pd.options.display.max_colwidth = 100

%load_ext autoreload
%autoreload 2

# Increase the width of the notebook so that it is the width of the browser 
# which allows larger size for the dashboard
display(HTML('<style>.container { width:100% !important; }</style>'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Load workspace dataset

In [8]:
csv_file = '../../../data/jeopardy/jeopardy_200.csv'
if os.path.exists(csv_file):
    df_ws = pd.read_csv(csv_file)
else:
    print(f'file not found = {csv_file}')
    # Load the json file
    json_gzip_file = '../../../data/jeopardy/JEOPARDY_QUESTIONS1.json.gzip'
    with gzip.open(json_gzip_file, 'r') as fin:
        questions_list = json.loads(fin.read().decode('utf-8'))
    
    # Gather the most common categories
    categories = collections.Counter([q["category"] for q in questions_list])
    common_categories = categories.most_common(201)
    top_intents = [c[0] for c in common_categories]
    print(f'len(top_intents) = {len(top_intents)}')

    # Create the example and intent from the top most categories.
    # Note: category 'CROSSWORD CLUES "F"' is composed of questions with '<' 
    #       thus get 201 categories to end up with 200
    data = []
    for q in questions_list:
        if '<' not in q['question'] and '<' not in q['answer'] and q['category'] in top_intents:
            data.append({'intent': q['category'].replace('&', 'AND'), 'example': f'{q["question"]} {q["answer"]}'})
    df_ws = pd.DataFrame(data)
    df_ws.to_csv(csv_file, index=False)

print(f'n_intents   = {len(df_ws["intent"].unique())}')
print(f'df_ws.shape = {df_ws.shape}')
# display(HTML(df_ws.head().to_html()))

file not found = ../../../data/jeopardy/jeopardy_200.csv
len(top_intents) = 201
n_intents   = 200
df_ws.shape = (32623, 2)


In [3]:
x = df_ws['example'].to_numpy()
y = df_ws['intent'].to_numpy().ravel()
print(f'x.shape        = {x.shape}')
print(f'y.shape        = {y.shape}')

x.shape        = (32623,)
y.shape        = (32623,)


#### Encode with USE encoder

In [4]:
%%time
class MiniLMEmbedding:
    def __init__(self):
        self.transformer = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    def encode(self, input_sentences: List[str]) -> np.array:
        sentences = [sentence.lower() for sentence in input_sentences]
        embedded_sentences = [self.embed_sentence(s) for s in sentences]
        return np.array(embedded_sentences)
    def embed_sentence(self, sentence: str) -> np.array:
        embedding = self.transformer.encode(sentence, show_progress_bar=False, convert_to_numpy=True)
        return embedding

encoded_file = '../../../data/jeopardy/jeopardy_200_x_encoded.csv'
if os.path.exists(encoded_file):
    df = pd.read_csv(encoded_file, header=None)
    x_encoded = df.to_numpy()
else:
    encoder = MiniLMEmbedding()
    x_encoded = encoder.encode(x)
    # Save to file
    df = pd.DataFrame(x_encoded)
    df.to_csv(encoded_file, header=False, index=False)

print(f'x_encoded.shape = {x_encoded.shape}')

x_encoded.shape = (32623, 384)
CPU times: user 1h 45min 38s, sys: 10.2 s, total: 1h 45min 48s
Wall time: 18min 31s


#### Run a cross validation on SVM classifiers
- Split the combined (x_encoded) dataset into 7 splits
- Each train (x_trn) is 4650 (32,549/7)
- Each test (x_tst) is 27,899 (32,549 * 6/7)
- Score the accuracy of each cross split
  - Normally you'd test against the test of each split (x_tst)
  - But in this case test against each dataset 
     - split train (x_trn)
     - split test (x_tst)
     - original x (x)
  - This is done to see if any of the datasets have problems, e.g. one has a very low score compared to the others.

In [5]:
%%time

skf = StratifiedKFold(n_splits=7, random_state=42, shuffle=True)

runs = []
run = 0
# reverse the normal train/test split sizes.
# Keep the train small and the test large
# So the trains are similar in size to the representative dataset
for tst_index, trn_index in skf.split(x_encoded, y):
    x_trn = x_encoded[trn_index]
    x_tst = x_encoded[tst_index]
    y_trn = y[trn_index]
    y_tst = y[tst_index]
    start = time.time()
    model = SVC(probability=True, random_state=42)
    model.fit(x_trn, y_trn)
    print(f'{run} fit()  dur={time.time() - start}')
    start = time.time()
    runs.append({
        'run':     run,
        'trn_acc': f'{model.score(x_trn, y_trn):.0%}',
        'tst_acc': f'{model.score(x_tst, y_tst):.0%}',
        'x_acc':  f'{model.score(x_encoded, y):.0%}',
    })
    print(f'{run} scores dur={time.time() - start}')
    run += 1

0 fit()  dur=49.67365837097168
0 scores dur=253.6580126285553
1 fit()  dur=47.113123178482056
1 scores dur=262.2241997718811
2 fit()  dur=48.70595026016235
2 scores dur=270.25928950309753
3 fit()  dur=42.8117880821228
3 scores dur=268.46670508384705
4 fit()  dur=49.04321646690369
4 scores dur=275.95526814460754
5 fit()  dur=48.880475759506226
5 scores dur=280.79315996170044
6 fit()  dur=54.80375599861145
6 scores dur=252.6621551513672
CPU times: user 36min 44s, sys: 595 ms, total: 36min 45s
Wall time: 36min 45s


In [6]:
df = pd.DataFrame(runs)
display(HTML(df.to_html()))
# 7 split cross validation
# run trn_acc tst_acc x_acc
# --- ------- ------- -----
#   0     82%     28%   36%
#   1     82%     28%   36%
#   2     81%     28%   36%
#   3     81%     28%   35%
#   4     83%     28%   36%
#   5     83%     28%   36%
#   6     81%     28%   36%

Unnamed: 0,run,trn_acc,tst_acc,x_acc
0,0,82%,28%,36%
1,1,82%,28%,36%
2,2,81%,28%,36%
3,3,81%,28%,35%
4,4,83%,28%,36%
5,5,83%,28%,36%
6,6,81%,28%,36%
