# Train and test Sherlock when ensembled with a RF classifier
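Sherlock's performance can be boosted by combining its predictions with those of a tree-based ensemble: a soft-voting classifier built from a random forest (RF) and an extra-trees (ET) classifier.

The cells below show the procedure for doing so.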

In [3]:
# Identifier of the Sherlock model; used further down to build the path of the class-name file.
model_id = 'sherlock'

In [4]:
# If you need fully deterministic results between runs, set the following environment variable before launching Jupyter.

# See comment in sherlock.features.paragraph_vectors.infer_paragraph_embeddings_features for more info.
#%env PYTHONHASHSEED

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import itertools

from ast import literal_eval
from collections import Counter
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score

from sherlock.deploy.model import SherlockModel

### Load training and validation sets

In [7]:
# Load the training features and labels, timing the whole step.
start = datetime.now()
print(f'Started at {start}')

X_train = pd.read_parquet('../data/data/processed/train.parquet')

# Labels are flattened to a 1-D array and lower-cased in a single pass.
y_train = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/train_labels.parquet').values.flatten()
])

print(f'Load data (train) process took {datetime.now() - start} seconds.')

Started at 2022-03-24 14:50:34.415070
Load data (train) process took 0:00:13.766619 seconds.


In [8]:
# Sanity check: show the set of column dtypes found in the training features.
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

Distinct types for columns in the Dataframe (should be all float32):
{dtype('float32')}


In [9]:
# Load the validation features and labels, timing the whole step.
start = datetime.now()
print(f'Started at {start}')

X_validation = pd.read_parquet('../data/data/processed/validation.parquet')

# Labels are flattened to a 1-D array and lower-cased in a single pass.
y_validation = np.array([
    label.lower()
    for label in pd.read_parquet('../data/data/raw/val_labels.parquet').values.flatten()
])

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

Started at 2022-03-24 14:50:49.100031
Load data (validation) process took 0:00:04.417602 seconds.


In [10]:
# Fold the validation features into the training set, renumbering the index from 0.
# NOTE: this rebinds X_train, so re-running the cell appends the validation rows again.
X_train = pd.concat([X_train, X_validation], ignore_index=True)

In [11]:
# Append the validation labels to the training labels so they stay aligned with the
# concatenated feature frame. Both arrays were already lower-cased when they were
# loaded, so a plain array concatenation is sufficient.
# NOTE: this rebinds y_train, so re-running the cell appends the validation labels again.
y_train = np.concatenate([y_train, y_validation])

### Train Voting Classifier using RFC and ETC

In [12]:
# n_estimators=300 gives a slightly better result (0.1%), but triples the fit time
# Soft-voting ensemble of a random forest ('rf') and an extra-trees ('et') classifier.
# Both members use a fixed random_state and n_jobs=-1.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=13, n_jobs=-1)),
        ('et', ExtraTreesClassifier(n_estimators=100, random_state=13, n_jobs=-1))
    ],
    voting='soft'
)

# Time the fit on the combined train + validation data.
start = datetime.now()
print(f'Started at {start}')

voting_clf.fit(X_train, y_train)

# The value printed after "took" is a timedelta (H:MM:SS.ffffff), not a number of seconds.
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

Started at 2022-03-24 14:50:58.349528
Finished at 2022-03-25 03:32:41.677354, took 12:41:43.328609 seconds


In [13]:
# Make individual (trained) estimators available.
# They are looked up by the names given to VotingClassifier rather than by position,
# so the assignments stay correct if the estimator list is reordered or extended.
rf_clf = voting_clf.named_estimators_['rf']
et_clf = voting_clf.named_estimators_['et']

### Load test set

In [14]:
# Load the test features and labels, timing the whole step.
start = datetime.now()
print(f'Started at {start}')

X_test = pd.read_parquet('../data/data/processed/test.parquet')
y_test = pd.read_parquet('../data/data/raw/test_labels.parquet').values.flatten()

# Lower-case the labels so they compare equal to the lower-cased class names.
y_test = np.array([x.lower() for x in y_test])

# This cell only loads data, so report the load time (nothing is trained or saved here).
print(f'Load data (test) process took {datetime.now() - start} seconds.')

Started at 2022-03-25 03:32:42.549990
Trained and saved new model.
Finished at 2022-03-25 03:32:46.986988, took 0:00:04.437011 seconds


In [3]:
# Inspect the distinct label values present in the validation set.
np.unique(y_validation)


NameError: name 'np' is not defined

### Make predictions

In [16]:
# Read the class names stored for this model and lower-case them in one pass.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only load
# class files that come from a trusted source.
classes = np.array([
    name.lower()
    for name in np.load(f"../model_files/classes_{model_id}.npy", allow_pickle=True)
])

In [17]:
# The class names must already be in sorted order: predicted column indices are
# decoded by position into this array.
assert np.array_equal(classes, np.sort(classes))

In [18]:
def predicted_labels(y_pred_proba, classes):
    """Map each row of class probabilities to the name of its most likely class.

    Parameters
    ----------
    y_pred_proba : array-like of shape (n_samples, n_classes)
        Per-class scores for every sample. Column ``j`` must correspond to
        ``classes[j]``. A list of equally sized 1-D arrays is accepted.
    classes : array-like of shape (n_classes,)
        Class names, ordered like the columns of ``y_pred_proba``.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        ``classes[argmax(row)]`` for every row.

    Raises
    ------
    ValueError
        If ``y_pred_proba`` is not 2-D or its column count differs from
        ``len(classes)``; decoding such input would yield misaligned labels.
    """
    proba = np.asarray(y_pred_proba)
    class_names = np.asarray(classes)

    if proba.ndim != 2 or proba.shape[1] != len(class_names):
        raise ValueError(
            f'expected probabilities of shape (n_samples, {len(class_names)}), '
            f'got {proba.shape}'
        )

    # Direct integer indexing does what the label decoder did, without having to
    # fake a fitted encoder.
    return class_names[np.argmax(proba, axis=1)]


def prediction_summary(y_test, predicted_labels):
    """Print the number of predictions, their container type and the weighted F1 score.

    Both sequences are cut to their first ``len(y_test)`` entries before scoring.
    """
    print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

    n_expected = len(y_test)
    weighted_f1 = f1_score(
        y_test[:n_expected],
        predicted_labels[:n_expected],
        average="weighted",
    )

    print(f'f1 score {weighted_f1}')

### Predict: RFC

In [19]:
# Class probabilities from the random forest member of the ensemble.
predicted_rfc_proba = rf_clf.predict_proba(X_test)

In [20]:
# Decode the random forest probabilities to labels and report the weighted F1 score.
prediction_summary(y_test, predicted_labels(predicted_rfc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8907835900890803


### Predict: ETC

In [21]:
# Class probabilities from the extra-trees member of the ensemble.
predicted_etc_proba = et_clf.predict_proba(X_test)

In [22]:
# Decode the extra-trees probabilities to labels and report the weighted F1 score.
prediction_summary(y_test, predicted_labels(predicted_etc_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8883310076542335


### Predict: Voting Classifier (RFC + ETC)

In [23]:
# Class probabilities from the full voting classifier (RFC + ETC).
predicted_voting_proba = voting_clf.predict_proba(X_test)

In [24]:
# Decode the voting classifier probabilities to labels and report the weighted F1 score.
prediction_summary(y_test, predicted_labels(predicted_voting_proba, classes))

prediction count 137353, type = <class 'numpy.ndarray'>
f1 score 0.8933560975251762


### Predict: Sherlock NN

In [1]:
# Initialise the Sherlock network (with_weights=True) and score the test features.
# The notebook-level model_id is passed instead of a second hard-coded "sherlock"
# literal, so the network and the class-name file always refer to the same model.
model = SherlockModel()
model.initialize_model_from_json(with_weights=True, model_id=model_id)
predicted_sherlock_proba = model.predict_proba(X_test)

NameError: name 'SherlockModel' is not defined

In [2]:
# Decode the Sherlock NN probabilities to labels and report the weighted F1 score.
prediction_summary(y_test, predicted_labels(predicted_sherlock_proba, classes))

NameError: name 'prediction_summary' is not defined

### Predict: Combined

In [None]:
# Unweighted mean of the Sherlock NN and voting-classifier probabilities, computed
# for all rows in one vectorised step instead of a per-row Python loop.
# Only the first len(y_test) rows of each matrix are used.
combined = (
    np.asarray(predicted_sherlock_proba)[:len(y_test)]
    + np.asarray(predicted_voting_proba)[:len(y_test)]
) / 2

In [None]:
# Decode the averaged probabilities to labels; `labels` is reused by the report cells below.
labels = predicted_labels(combined, classes)

prediction_summary(y_test, labels)

In [None]:
# Metrics as a dictionary rather than a formatted string.
report = classification_report(y_test, labels, output_dict=True)

# Keep only entries that hold a metrics dict with an F1 score and whose key is one of
# the known classes, then rank them from highest to lowest F1 score.
class_scores = sorted(
    (
        (name, metrics)
        for name, metrics in report.items()
        if isinstance(metrics, dict) and 'f1-score' in metrics and name in classes
    ),
    key=lambda entry: entry[1]['f1-score'],
    reverse=True,
)

### Top 5 Types

In [None]:
def score_table(class_scores):
    """Print a tab-separated table of per-class metrics.

    ``class_scores`` is an iterable of ``(class_name, metrics)`` pairs, where
    ``metrics`` is a mapping with the keys ``'f1-score'``, ``'precision'``,
    ``'recall'`` and ``'support'``.
    """
    print("\t\tf1-score\tprecision\trecall\t\tsupport")

    for label, metrics in class_scores:
        # Names of 8 or more characters get one tab fewer, which keeps the columns
        # lined up (assuming 8-column tab stops).
        padding = '\t' if len(label) >= 8 else '\t\t'

        cells = [
            f"{metrics['f1-score']:.3f}",
            f"{metrics['precision']:.3f}",
            f"{metrics['recall']:.3f}",
            f"{metrics['support']}",
        ]

        print(f"{label}{padding}" + "\t\t".join(cells))

In [None]:
# class_scores is ordered by descending F1 score, so the head holds the best classes.
score_table(class_scores[:5])

### Bottom 5 Types

In [None]:
# class_scores is ordered by descending F1 score, so the tail holds the weakest classes.
# A negative slice also behaves correctly when there are fewer than five entries,
# where an explicit `len(...) - 5` start would go negative and drop entries.
score_table(class_scores[-5:])

### All Scores (by class)

In [None]:
# Full per-class report for the combined predictions, with three decimal places.
print(classification_report(y_test, labels, digits=3))

## Review errors

In [None]:
# Collect the true label of every misclassified test column.
size = len(y_test)
mismatches = [
    expected
    for expected, predicted in zip(y_test[:size], labels[:size])
    if expected != predicted
]

f1 = f1_score(y_test[:size], labels[:size], average="weighted")
print(f'Total mismatches: {len(mismatches)} (F1 score: {f1})')

# Misclassification counts per true label, most frequent first.
data = Counter(mismatches)
data.most_common()   # Returns all unique items and their counts

In [None]:
# Raw test values, loaded so that individual predictions can be inspected.
test_samples = pd.read_parquet('../data/data/raw/test_values.parquet')

In [None]:
# Position of the test row to inspect; change it to look at a different example.
idx = 541
original = test_samples.iloc[idx]
# Every cell of the row is parsed with literal_eval — presumably the raw values are
# stored as Python-literal strings; verify against the data file.
converted = original.apply(literal_eval).to_list()

# Show the predicted and true label next to the parsed raw values.
print(f'Predicted "{labels[idx]}", actual label "{y_test[idx]}". Actual values:\n{converted}')

In [None]:
# Timestamp marking the end of the notebook run.
print(f'Completed at {datetime.now()}')