In [None]:
import json
import os
from importlib import reload 

import pandas as pd
from thefuzz import fuzz, process
from rapidfuzz.distance import Levenshtein

import democratizing_data_ml_algorithms.data.kaggle_repository as kr
import democratizing_data_ml_algorithms.evaluate.model as em

In [None]:
class MockRepo:
    """Minimal in-memory repository that serves an already-loaded data object.

    Exposes ``get_validation_data`` and ``copy``; the wrapped object only
    needs to provide a ``copy()`` method of its own.
    """

    def __init__(self, df):
        """Keep ``df`` as the object handed back by ``get_validation_data``."""
        self.df = df

    def get_validation_data(self):
        """Return the stored object itself (no copy is made)."""
        return self.df

    def copy(self):
        """Return a new ``MockRepo`` wrapping ``self.df.copy()``."""
        duplicated = self.df.copy()
        return MockRepo(duplicated)
        
# Fetch the validation data from KaggleRepository and wrap it in MockRepo;
# each evaluation below is handed its own `repo.copy()`.
repo = MockRepo(kr.KaggleRepository().get_validation_data())

In [None]:
# Make sure the directory that the evaluation JSON files are written to exists
# (exist_ok=True makes re-running this cell harmless).
os.makedirs("eval_cache", exist_ok=True)

### String Matching

In [None]:
import democratizing_data_ml_algorithms.models.kaggle_model3 as km3
import democratizing_data_ml_algorithms.models.kaggle_model3_regex_inference as km3r
import democratizing_data_ml_algorithms.models.regex_model as rm

#### Simple String Matching

In [None]:
# Kaggle Model 3 configuration: just the path to its parameter file.
km3_config = dict(model_path="../models/kaggle_model3/baseline/params.txt")

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
km3_eval = em.evaluate_model(repo.copy(), km3.KaggleModel3(), km3_config, scorer=fuzz.partial_ratio)

# Write the evaluation to the JSON cache.
with open("./eval_cache/km3_pr.json", "w") as f:
    json.dump(km3_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
km3_eval

#### Regex Enhanced Match

In [None]:
# Read the keyword list from the Kaggle Model 3 parameter file, one keyword
# per line with surrounding whitespace stripped. Blank (or whitespace-only)
# lines are skipped so that no empty-string keyword ends up in the list handed
# to the regex model.
with open(km3_config["model_path"], "r") as f:
    keywords = [keyword for keyword in (line.strip() for line in f) if keyword]

In [None]:
# Regex model configuration: the `keywords` list read from the Model 3
# parameter file plus an empty "regex_pattern".
regex_config = dict(keywords=keywords, regex_pattern="")

# The configuration goes to the RegexModel constructor; evaluate_model itself
# receives an empty config dict.
regex_eval = em.evaluate_model(repo.copy(), rm.RegexModel(regex_config), {}, scorer=fuzz.partial_ratio)

# Write the evaluation to the JSON cache.
with open("./eval_cache/regex_pr.json", "w") as f:
    json.dump(regex_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
regex_eval

### Entity Classification

In [None]:
import democratizing_data_ml_algorithms.models.kaggle_model2 as km2
import democratizing_data_ml_algorithms.models.schwartz_hearst_model as shm

#### Submitted Model 2

In [None]:
# Submitted Kaggle Model 2: baseline weights with a Schwartz-Hearst extractor.
km2_config = {
    "batch_size": 16,
    "model_tokenizer_name": "../models/kaggle_model2/baseline/model",
    "min_prob": 0.9,
    "extractor": shm.SchwartzHearstModel(),
    "extractor_config": {},
    "tokenizer_call_kwargs": {
        "return_tensors": "pt",
        "padding": True,
    },
}

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
km2_eval = em.evaluate_model(repo.copy(), km2.KaggleModel2(), km2_config, scorer=fuzz.partial_ratio)

# Write the evaluation to the JSON cache.
with open("./eval_cache/km2_pr.json", "w") as f:
    json.dump(km2_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
km2_eval

#### km2 with updates: new model weights and a regular-expression extractor

In [None]:
# Kaggle Model 2 with a different weights directory, a RegexModel extractor
# (constructed with an empty config) and tokenizer truncation switched on.
km2_update_config = {
    "batch_size": 16,
    "model_tokenizer_name": "../models/kaggle_model2/km2_ad4c81a2cb1d449dbf2ae89c940d5eb1",
    "min_prob": 0.9,
    "extractor": rm.RegexModel({}),
    "extractor_config": {},
    "tokenizer_call_kwargs": {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
    },
}

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
km2_update_eval = em.evaluate_model(
    repo.copy(), km2.KaggleModel2(), km2_update_config, scorer=fuzz.partial_ratio
)

# Write the evaluation to the JSON cache.
with open("./eval_cache/km2_update_pr.json", "w") as f:
    json.dump(km2_update_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
km2_update_eval

### Token Classification

In [None]:
import democratizing_data_ml_algorithms.models.kaggle_model1 as km1
import democratizing_data_ml_algorithms.models.generic_model1 as gm1
import democratizing_data_ml_algorithms.models.ner_model as nm

#### Submitted SciBERT

Note: this model can have high variance.

In [None]:
# All paths in the SciBERT configuration are derived from this directory.
model_base_path = "../models/kaggle_model1/sub_scibert"
km1_scibert_config = {
    "support_mask_embedding_path": os.path.join(model_base_path, "embeddings/support_embeddings.npy"),
    "support_no_mask_embedding_path": os.path.join(model_base_path, "embeddings/support_nomask_embeddings.npy"),
    "n_support_samples": 100,
    "model_tokenizer_name": model_base_path,
    "weights_path": os.path.join(model_base_path, "embeddings/"),
    "batch_size": 128,
    "seq_len": 320,
    "overlap": 200,
    "is_roberta": False,
    "threshold": 0.7,
    "inference_progress_bar": True,
}

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
km1_scibert_eval = em.evaluate_model(
    repo.copy(), km1.KaggleModel1(), km1_scibert_config, scorer=fuzz.partial_ratio
)

# Write the evaluation to the JSON cache.
with open("./eval_cache/km1_scibert_pr.json", "w") as f:
    json.dump(km1_scibert_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
km1_scibert_eval

In [None]:
# Read the cached SciBERT evaluation back from disk and rebuild it with
# ModelEvaluation.from_json; the rebuilt object is the cell's displayed output.
with open("./eval_cache/km1_scibert_pr.json", "r") as f:
    rec = json.load(f)
em.ModelEvaluation.from_json(rec)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim (same cache
# file, same from_json call) — it looks like an accidental duplicate; consider
# removing one of the two.
with open("./eval_cache/km1_scibert_pr.json", "r") as f:
    rec = json.load(f)
em.ModelEvaluation.from_json(rec)

#### Submitted RoBERTa

Note this model can have high variance.

In [None]:
# All paths in the RoBERTa configuration are derived from this directory.
model_base_path = "../models/kaggle_model1/sub_biomed_roberta"
km1_roberta_config = {
    "support_mask_embedding_path": os.path.join(model_base_path, "embeddings/support_embeddings.npy"),
    "support_no_mask_embedding_path": os.path.join(model_base_path, "embeddings/support_nomask_embeddings.npy"),
    "n_support_samples": 100,
    "model_tokenizer_name": model_base_path,
    "weights_path": os.path.join(model_base_path, "embeddings/"),
    "batch_size": 128,
    "seq_len": 320,
    "overlap": 200,
    "is_roberta": True,
    "threshold": 0.7,
    "inference_progress_bar": True,
}

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
km1_roberta_eval = em.evaluate_model(
    repo.copy(), km1.KaggleModel1(), km1_roberta_config, scorer=fuzz.partial_ratio
)

# Write the evaluation to the JSON cache.
with open("./eval_cache/km1_roberta_pr.json", "w") as f:
    json.dump(km1_roberta_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
km1_roberta_eval

In [None]:
# Read the cached RoBERTa evaluation back from disk and rebuild it with
# ModelEvaluation.from_json; the rebuilt object is the cell's displayed output.
with open("./eval_cache/km1_roberta_pr.json", "r") as f:
    rec = json.load(f)
em.ModelEvaluation.from_json(rec)

#### NER

In [None]:
# Identifier of the NER model directory under ../models/ner_model/baseline/.
key = "b74f96f240ad41198d046232220ee024"
ner_config = {
    "batch_size": 16,
    "threshold": 0.7,
    "inference_progress_bar": True,
    "model_tokenizer_name": f"../models/ner_model/baseline/{key}",
    "model_kwargs": {},
    "tokenizer_kwargs": {"add_prefix_space": True},
    "tokenizer_call_kwargs": {
        "max_length": 512,
        "truncation": True,
        "is_split_into_words": True,
    },
}

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
ner_eval = em.evaluate_model(repo.copy(), nm.NERModel_pytorch(), ner_config, scorer=fuzz.partial_ratio)

# Write the evaluation to the JSON cache.
with open("./eval_cache/ner_pr.json", "w") as f:
    json.dump(ner_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
ner_eval

In [None]:
# Read the cached NER evaluation back from disk and rebuild it with
# ModelEvaluation.from_json; the rebuilt object is the cell's displayed output.
with open("./eval_cache/ner_pr.json", "r") as f:
    rec = json.load(f)
em.ModelEvaluation.from_json(rec)

#### Retrained Model (RoBERTa) -- Implemented as GenericModel1


In [None]:
# Identifier of the GenericModel1 directory under ../models/generic_model1/baseline/.
key = "gm1_88ba3fdc63c646308ad15b648acd4843"
gm1_config = dict(
    model_tokenizer_name=f"../models/generic_model1/baseline/{key}",
    tokenizer_kwargs=dict(),
    tokenizer_call_kwargs=dict(
        max_length=256,
        truncation=True,
        is_split_into_words=True,
    ),
    model_kwargs=dict(),
    optimizer="torch.optim.AdamW",
    optimizer_kwargs=dict(lr=1e-5),
    metric_optimizer="torch.optim.SGD",
    metric_optimizer_kwargs=dict(lr=1e-3),
    batch_size=32,
    epochs=1,
    n_support_samples=10000,
    support_mask_embedding_path=f"../models/generic_model1/baseline/{key}/embeddings/support_mask_embeddings.npy",
    support_no_mask_embedding_path=f"../models/generic_model1/baseline/{key}/embeddings/support_nomask_embeddings.npy",
    steps_per_epoch=5000,
    steps_per_eval=10,
    balance_labels=True,
    n_query=2,
    save_model=True,
    scheduler="torch.optim.lr_scheduler.CosineAnnealingLR",
    scheduler_kwargs=dict(T_max=100),
    model_path="baseline",
    inference_progress_bar=True,
    # added
    threshold=0.7,
)

# Evaluate on a separate copy of the validation repo, scored with partial_ratio.
gm1_eval = em.evaluate_model(repo.copy(), gm1.GenericModel1(), gm1_config, scorer=fuzz.partial_ratio)

# Write the evaluation to the JSON cache.
with open("./eval_cache/gm1_pr.json", "w") as f:
    json.dump(gm1_eval.to_json(), f)

# Last expression of the cell: display the evaluation.
gm1_eval

In [None]:
# Read the cached GenericModel1 evaluation back from disk and rebuild it with
# ModelEvaluation.from_json; the rebuilt object is the cell's displayed output.
with open("./eval_cache/gm1_pr.json", "r") as f:
    rec = json.load(f)
em.ModelEvaluation.from_json(rec)

### Ensemble

In [None]:

# Cached evaluations keyed by the labels used in the ensemble headings:
# "2" = regex, "4" = retrained Model 2 (km2_update), "7" = NER.
pr = {}

for label, cache_path in (
    ("2", "./eval_cache/regex_pr.json"),
    ("4", "./eval_cache/km2_update_pr.json"),
    ("7", "./eval_cache/ner_pr.json"),
):
    with open(cache_path, "r") as f:
        pr[label] = em.ModelEvaluation.from_json(json.load(f))

#### Regex + Retrained Model 2 = (2) + (4)

In [None]:
# Apply `|` to the cached regex (2) and retrained Model 2 (4) evaluations and
# display the result. NOTE(review): presumably `|` merges the two evaluations'
# predictions — confirm against em.ModelEvaluation.
pr["2"] | pr["4"]

#### Regex + NER = (2) + (7)

In [None]:
# Apply `|` to the cached regex (2) and NER (7) evaluations and display the
# result. NOTE(review): presumably `|` merges the two evaluations'
# predictions — confirm against em.ModelEvaluation.
pr["2"] | pr["7"]

#### Retrained Model 2 + NER = (4) + (7)

In [None]:
# Apply `|` to the cached retrained Model 2 (4) and NER (7) evaluations and
# display the result. NOTE(review): presumably `|` merges the two evaluations'
# predictions — confirm against em.ModelEvaluation.
pr["4"] | pr["7"]

#### Regex + Retrained Model 2 + NER = (2) + (4) + (7)

In [None]:
# Chain `|` over the cached regex (2), retrained Model 2 (4) and NER (7)
# evaluations (evaluated left to right) and display the result.
# NOTE(review): presumably `|` merges the evaluations' predictions — confirm
# against em.ModelEvaluation.
pr["2"] | pr["4"] | pr["7"]