# This notebook enables training and testing of Sherlock.
The procedure is:
- Load train, val, test datasets (should be preprocessed)
- Initialize the model, either by loading the pretrained weights or by training a new model from scratch.
- Evaluate and analyse the model predictions.

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell executes, so local changes are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Identifier under which the retrained model is stored.
# Further down, predictions can also be made with the original model, whose ID is "sherlock".
model_id = "retrained_sherlock"

In [4]:
from ast import literal_eval
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, classification_report

from sherlock.deploy.model import SherlockModel

## Load datasets for training, validation, testing

In [5]:
# Read the preprocessed training features and the raw training labels, timing the load.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')

# Flatten the label frame to 1-D and lower-case every label in a single pass.
y_train = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
])

print(X_train.shape)

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-04-20 16:20:00.808649
(412059, 1588)
Load data (train) process took 0:00:12.494049 seconds.


In [6]:
# Sanity check: show the set of column dtypes present in the training frame.
print(
    'Distinct types for columns in the Dataframe (should be all float32):',
    set(X_train.dtypes),
    sep='\n',
)

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [1]:
# Read the preprocessed validation features and the raw validation labels, timing the load.
# Requires the import cell to have run first (`datetime`, `pd`, `np`).
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')

# Flatten the label frame to 1-D and lower-case every label in a single pass.
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()
])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

NameError: name 'datetime' is not defined

In [7]:
# Read the preprocessed test features and the raw test labels, timing the load.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Lower-case the labels; predictions are lower-cased the same way before scoring.
y_test = np.array([x.lower() for x in y_test])

# Take one end timestamp so the reported finish time and duration agree
# (two separate datetime.now() calls yield slightly different instants).
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-04-06 09:54:52.079637
Finished at 2022-04-06 09:54:55.915934, took 0:00:03.836310 seconds


In [22]:
from sherlock.deploy import helpers

# Feature column names grouped by category; keys used here: char / word / par / rest.
feature_cols = helpers.categorize_features()
print(feature_cols["rest"])

# Select only the "char" feature columns of the training frame and display how many there are.
X_train_char = X_train.loc[:, feature_cols["char"]]
len(X_train_char.columns)

['col_entropy', 'frac_unique', 'frac_numcells', 'frac_textcells', 'avg_num_cells', 'std_num_cells', 'avg_text_cells', 'std_text_cells', 'avg_spec_cells', 'std_spec_cells', 'avg_word_cells', 'std_word_cells', 'n_values', 'length-agg-any', 'length-agg-all', 'length-agg-mean', 'length-agg-var', 'length-agg-min', 'length-agg-max', 'length-agg-median', 'length-agg-sum', 'length-agg-kurtosis', 'length-agg-skewness', 'none-agg-has', 'none-agg-percent', 'none-agg-num', 'none-agg-all']


960

## Initialize the model
Two options:
- Load Sherlock model with pretrained weights
- Fit Sherlock model from scratch

### Option 1: load Sherlock with pretrained weights

In [8]:
#'''
# Option 1: initialize a SherlockModel from the stored "sherlock" model files,
# loading its weights as well (with_weights=True).
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
model.initialize_model_from_json(with_weights=True, model_id="sherlock")

print('Initialized model.')

# Take one end timestamp so the reported finish time and duration agree
# (two separate datetime.now() calls yield slightly different instants).
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

#'''

Started at 2022-04-06 09:54:55.919779
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


2022-04-06 09:54:56.329225: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-06 09:54:56.381408: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 4001000000 Hz
2022-04-06 09:54:56.383827: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fffc9554520 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-04-06 09:54:56.383907: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
OMP: Info #155: KMP_AFFINITY: Initial OS proc set respected: 0-7
OMP: Info #217: KMP_AFFINITY: decoding x2APIC ids.
OMP: Info #217: KMP_AFFINITY: x2APIC ids not unique.
OMP: Info #217: KMP_AFFINITY: decoding legacy APIC ids.

Initialized model.
Finished at 2022-04-06 09:54:56.828820, took 0:00:00.909051 seconds


### Option 2: fit Sherlock from scratch (and save for later use)

In [3]:
# ID under which the model trained below is stored (same value as the setting at the top of the notebook).
model_id = 'retrained_sherlock'

In [32]:
# Option 2: train a fresh SherlockModel on the training split, using the
# validation split during fitting, and time the whole run.
start = datetime.now()
print(f'Started at {start}')

model = SherlockModel()
# Model will be stored with ID `model_id`
model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

print('Trained and saved new model.')

# Take one end timestamp so the reported finish time and duration agree
# (two separate datetime.now() calls yield slightly different instants).
finished = datetime.now()
print(f'Finished at {finished}, took {finished - start} seconds')

Started at 2022-04-04 15:22:51.043972
Train on 412059 samples, validate on 137353 samples
Epoch 1/2
Epoch 2/2
Trained and saved new model.
Finished at 2022-04-04 15:24:59.291637, took 0:02:08.247681 seconds


In [10]:
# Store the current model's weights under `model_id`.
# NOTE(review): the destination path is decided inside SherlockModel.store_weights — confirm where files land.
model.store_weights(model_id=model_id)

In [33]:
# serialize model to JSON
# NOTE(review): `to_json2` is not defined in this notebook; presumably a SherlockModel
# helper that writes the model definition to disk (the recorded output is
# "Saved model to disk") — confirm it exists in the installed sherlock package.
model.to_json2()



Saved model to disk


In [23]:

# Re-initialize the existing `model` object from the files stored under `model_id`,
# including weights. Uses the `model_id` variable (rather than a repeated string
# literal) so this cell always reloads the model that was fitted and stored above.
# The call's return value is not needed, so it is no longer bound to a name.
model.initialize_model_from_json(with_weights=True, model_id=model_id);


None


### Make prediction

In [None]:
# Predict a label for every test row and lower-case each one, matching the lower-cased y_test.
predicted_labels = np.array([label.lower() for label in model.predict(X_test)])

In [None]:
# Number of labelled test rows; both arrays are sliced to this length before scoring.
size = len(y_test)

print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# Should be fully deterministic too.
f1_score(y_true=y_test[:size], y_pred=predicted_labels[:size], average="weighted")

In [None]:
# To evaluate the original model instead, set model_id = "sherlock" before running this cell.
# NOTE(review): allow_pickle=True unpickles objects from the file — only load class files you trust.
classes = np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)

report = classification_report(y_test, predicted_labels, output_dict=True)

# Keep only report entries that are per-class score dicts for a known class;
# this drops entries whose value is not a dict or whose key is not in `classes`.
class_scores = [
    (label, scores)
    for label, scores in report.items()
    if isinstance(scores, dict) and 'f1-score' in scores and label in classes
]

# Best F1 first.
class_scores.sort(key=lambda entry: entry[1]['f1-score'], reverse=True)

### Top 5 Types

In [None]:
# Header row, then the five entries at the front of class_scores (sorted by F1, descending).
print("\t\tf1-score\tprecision\trecall\t\tsupport")

for label, scores in class_scores[:5]:
    # Labels of 8+ characters get one tab, shorter ones two, to keep columns aligned
    # (assumes 8-column tab stops).
    pad = '\t' if len(label) >= 8 else '\t\t'

    print(f"{label}{pad}{scores['f1-score']:.3f}\t\t{scores['precision']:.3f}\t\t{scores['recall']:.3f}\t\t{scores['support']}")

### Bottom 5 Types

In [None]:
# Header row, then the five entries at the end of class_scores (sorted by F1, descending),
# i.e. the five lowest-scoring classes.
print("\t\tf1-score\tprecision\trecall\t\tsupport")

# `[-5:]` rather than `[len - 5:len]`: with fewer than five classes the explicit
# form yields a negative start index and silently drops entries, whereas
# `[-5:]` returns everything that is available.
for key, value in class_scores[-5:]:
    # Labels of 8+ characters get one tab, shorter ones two, to keep columns aligned
    # (assumes 8-column tab stops).
    tabs = '\t' if len(key) >= 8 else '\t\t'

    print(f"{key}{tabs}{value['f1-score']:.3f}\t\t{value['precision']:.3f}\t\t{value['recall']:.3f}\t\t{value['support']}")

### All Scores

In [None]:
# Text report for every class, formatted with three decimal places.
report_text = classification_report(y_test, predicted_labels, digits=3)
print(report_text)

## Review errors

In [None]:
# Collect the expected label of every misclassified test row, print the rows for
# the label(s) under investigation, and summarise mismatch counts per label.
size = len(y_test)
mismatches = list()

for idx, k1 in enumerate(y_test[:size]):
    k2 = predicted_labels[idx]

    if k1 != k2:
        mismatches.append(k1)

        # zoom in to specific errors. Use the index in the next step
        # The trailing comma matters: ('address') is just the string 'address',
        # which would turn `in` into a substring test instead of tuple membership.
        if k1 in ('address',):
            print(f'[{idx}] expected "{k1}" but predicted "{k2}"')

f1 = f1_score(y_test[:size], predicted_labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

In [None]:
# Load the raw test values so individual rows can be inspected next to their predictions.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [None]:
# Positional index of the test row to inspect (e.g. one printed by the mismatch review).
idx = 1001
original = test_samples.iloc[idx]

# Parse each stored cell with literal_eval (presumably a stringified Python literal — verify against the data).
converted = [literal_eval(cell) for cell in original]

print(f'Predicted "{predicted_labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')