In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from util import *

In [2]:
# from sklearn import set_config
# set_config(display='diagram') 

In [3]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")

testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

subset = list(sorted(set(all_tiers_100)-set(["PersonalizedProduct"])))

In [6]:
count = training_set.shape[0]

In [7]:
pos_weights = (1 / (training_set[subset].sum() / count).values)

In [8]:
subset = list(sorted(set(all_tiers_100)-set(["PersonalizedProduct"])))
subset

['AnalysisAndModeling',
 'AnalysisAndModeling_3DModeling',
 'AnatomicalTarget',
 'AnatomicalTarget_LowerExtremity',
 'AnatomicalTarget_LowerExtremity_Hip',
 'AnatomicalTarget_LowerExtremity_Knee',
 'AnatomicalTarget_Torso',
 'AnatomicalTarget_Torso_Spine',
 'AnatomicalTarget_UpperExtremity',
 'AnatomicalTarget_UpperExtremity_Shoulder',
 'Imaging',
 'Imaging_CT',
 'Imaging_MRI',
 'Imaging_Ultrasound',
 'Manufacturing',
 'Manufacturing_AdditiveManufacturing',
 'PersonalizedProduct_Guide/Jig',
 'PersonalizedProduct_Implant',
 'SpecificationofUse',
 'SpecificationofUse_Disease',
 'SpecificationofUse_JointReplacement',
 'SurgicalMethod']

In [9]:
training_labels = training_set[subset]

In [10]:
testing_labels = testing_set[subset]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [13]:
set(training_set.columns) - set(all_tiers)

{'abstract',
 'all_tiers',
 'all_tiers_100',
 'application_kind',
 'application_number',
 'application_number_formatted',
 'assignees',
 'citations',
 'cited_by',
 'claims',
 'country_code_x',
 'country_code_y',
 'cpc_codes',
 'description',
 'embedding_v1',
 'examiners',
 'family_id',
 'fi_codes',
 'filing_date',
 'fterm_codes',
 'gpa_number',
 'grant_date',
 'inventors',
 'kind',
 'kind_code',
 'padded_serial',
 'pct_number',
 'priority_date',
 'publication_date',
 'publication_number',
 'serial_x',
 'serial_y',
 'similar_npl',
 'similar_patents',
 'tier1_tags',
 'tier1_tier2_tags',
 'tier2_100',
 'tier3_100',
 'title',
 'top_terms',
 'url',
 'uspc_codes'}

In [14]:
training_set.top_terms

0      [prosthesis, member, coupling portion, portion...
1      [femur, tibia, implant, leg, patient, portion,...
2      [lateral, femur, patient, body, medial, slot, ...
3      [glenoid, virtual, alignment pin, patient, sca...
4      [bone, surface, patient, resection, jig, use, ...
                             ...                        
967    [method, bone, medial, patient, template, late...
968    [cutting guide, surgical cutting, customized s...
969    [patient, adapted, surface, surface model, imp...
970    [polymer, monomers, modulus, thiol, multifunct...
971    [implant, patient, peg, portion, bone, porous,...
Name: top_terms, Length: 972, dtype: object

In [15]:
cpc_embeddings = np.fromfile("/home/martin/patentmark/cpc.node2vec.emb.32d.bin", dtype=np.float32).reshape((-1,32))

import joblib
cpc_labelizer = joblib.load('./node2id.joblib')
cpc_lookup = {c: n for n, c in enumerate(cpc_labelizer.classes_)}

@f.collecting
def convert_cpc_codes(codes):
    for code in codes:
        if code in cpc_lookup:
            yield cpc_lookup[code]
    
def embed_cpc_codes(codes):
    embedding = np.zeros(32)
    converted = convert_cpc_codes(codes)
    
    if not converted:
        return embedding
    
    for code_id in converted:
        embedding = embedding + cpc_embeddings[code_id]
        
    return embedding / len(converted)

training_set['embedded_cpc'] = training_set.cpc_codes.apply(embed_cpc_codes)
training_set.embedded_cpc

testing_set['embedded_cpc'] = testing_set.cpc_codes.apply(embed_cpc_codes)
testing_set.embedded_cpc

0      [0.09129103335241477, -0.8074875394503276, -0....
1      [-0.0626441298850945, -0.8264780470303127, -0....
2      [-0.2087969978650411, -0.8326806823412577, -0....
3      [0.020394775830209256, -0.8215901732444764, -0...
4      [-0.26043402403593063, -0.6891247034072876, -0...
                             ...                        
238    [-0.23802674313386282, -0.628900408744812, -0....
239    [-0.3754243354002635, -0.6894144614537557, -0....
240    [-0.12913421913981438, -0.6960149183869362, -0...
241    [-0.3880331997688, -0.702021429171929, -0.2717...
242    [-0.2977850042283535, -0.7015813589096069, -0....
Name: embedded_cpc, Length: 243, dtype: object

In [16]:
np.array(testing_set.embedded_cpc.values.tolist()).shape

(243, 32)

In [17]:
from sklearn.ensemble import *

In [18]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [19]:
from sklearn.dummy import DummyClassifier

In [20]:
from sklearn.linear_model import LogisticRegression

In [53]:
# from transformers import FeatureExtractionPipeline, AutoModel, AutoTokenizer
# transformer_model = AutoModel.from_pretrained("bertForPatents/", gradient_checkpointing=True)
# transformer_tokenizer = AutoTokenizer.from_pretrained("bertForPatents/")
# transformer = FeatureExtractionPipeline(transformer_model, tokenizer=transformer_tokenizer, device=0)

import sentence_transformers as st
from sentence_transformers import SentenceTransformer

# model_name = "/home/martin/patentmark/patentmark-charting-app/vectors/sPatent-v2/"
# text_embedder = SentenceTransformer(model_name)
model_name = "bertForPatents/"
word_embedding_model = st.models.Transformer(model_name)
pooling_model = st.models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=True)
text_embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [54]:
class TransformerFeatures(sklearn.base.BaseEstimator):
    def fit(self, X, y):
        return self
        
    def transform(self, X):
        return text_embedder.encode(X)
        

In [55]:
class VectorPassthrough(sklearn.base.BaseEstimator):
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array(X.values.tolist())
        

In [56]:
from sklearn.decomposition import TruncatedSVD

In [57]:

# column_prep = ColumnTransformer([
# #      ('top_terms',
# #       CountVectorizer(analyzer=iden, binary=True, min_df=2),
# #      'top_terms'
# #      ),
#      ('cited_by',
#       Pipeline(steps=[
#           ('cited', CountVectorizer(analyzer=iden, binary=True, min_df=2)),
#           ('cited_svd', TruncatedSVD(n_components=64, random_state=42))]),
#      'cited_by'
#      ),
#     ('inventors',
#      Pipeline(steps=[
#          ('inventors', CountVectorizer(analyzer=iden, binary=True, min_df=2)),
#          ('inventors_svd', TruncatedSVD(n_components=64, random_state=42))]),
#      'inventors'
#      ),
# #      ('citations',
# #        CountVectorizer(analyzer=iden, binary=True, min_df=2),
# #       'citations'
# #       ),
# # #         ('similar_npl',
# # #       CountVectorizer(analyzer=lambda x:x, min_df=2),
# # #      'similar_npl'
# # #      ),
# #         ('similar_patents',
# #       CountVectorizer(analyzer=iden, binary=True, min_df=2),
# #      'similar_patents'
# #      )
# #   ,
# #      ('cpc',
# #       CountVectorizer(analyzer=cpc_split, binary=True, min_df=2),
# #      'cpc_codes'
# #      ),
#     ('embedded_cpc', 
#      VectorPassthrough(),
#      'embedded_cpc'
#     ),
#     ('embedding_v1',
#      VectorPassthrough(),
#      'embedding_v1'
#     ),
#     ('transformer_abstract',
#     TransformerFeatures(),
#      'abstract'
#     ),
#     ('transformer_claims',
#      TransformerFeatures(),
#      'claims'
#     )], verbose=False, n_jobs=1)

# from sklearn.preprocessing import Normalizer
# pipeline = Pipeline(steps=[('columns', column_prep), 
#                            ('norm', Normalizer())])
    

In [58]:
tfidf_default_settings = {
    'lowercase': True, 
    'strip_accents': 'ascii',
    'stop_words' : stopwords,
    'min_df': 5,
    #'max_df': 0.5#,
    #'ngram_range': (1,3)
}

column_prep = ColumnTransformer([
#      ('top_terms',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'top_terms'
#      ),
     ('cited_by',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'cited_by'
     ),
    ('inventors',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'inventors'
     ),
#      ('citations',
#        CountVectorizer(analyzer=iden, binary=True, min_df=2),
#       'citations'
#       ),
# #         ('similar_npl',
# #       CountVectorizer(analyzer=lambda x:x, min_df=2),
# #      'similar_npl'
# #      ),
#         ('similar_patents',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'similar_patents'
#      )
#   ,
#      ('cpc',
#       CountVectorizer(analyzer=cpc_split, binary=True, min_df=2),
#      'cpc_codes'
#      ),
    ('embedded_cpc', 
     VectorPassthrough(),
     'embedded_cpc'
    ),
    ('embedding_v1',
     VectorPassthrough(),
     'embedding_v1'
    ),
    ('transformer_abstract',
    TransformerFeatures(),
     'abstract'
    ),
    ('transformer_claims',
     TransformerFeatures(),
     'claims'
    )], verbose=False, n_jobs=1)

from sklearn.preprocessing import Normalizer
pipeline = Pipeline(steps=[('columns', column_prep), ('norm', Normalizer())])
    
#     ('abstract_tfidf', 
#     TfidfVectorizer(**tfidf_default_settings),
#    'abstract'),
#     ('claims_tfidf',
#      TfidfVectorizer(**tfidf_default_settings),
#      'claims'
#     ),
#     ('description_tfidf',
#      TfidfVectorizer(**tfidf_default_settings),
#      'description'
#     )
      

In [59]:
from sklearn.decomposition import TruncatedSVD

In [60]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB

In [61]:
from sklearn.metrics import *

In [62]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
import util
X_train = pipeline.fit_transform(training_set)
X_test = pipeline.transform(testing_set)

In [63]:
y_train = mlb.fit_transform(training_set[subset].apply(util.array_labels, axis=1))
y_test = mlb.transform(testing_set[subset].apply(util.array_labels, axis=1))

In [64]:
def scorer(net, ds, y=None):
    y_true = [y for _, y in ds]
    y_pred = (net.predict(ds) >= 0.5)
    #score = sklearn.metrics.label_ranking_loss(y_true, y_pred)
    score = sklearn.metrics.f1_score(y_true, y_pred, average="weighted")
    #score = sklearn.metrics.hamming_loss(y_true, y_pred)
    return score

In [73]:
from torch import nn

import torch.nn.functional as F
def linear_combination(x, y, epsilon): 
    return epsilon*x + (1-epsilon)*y

def reduce_loss(loss, reduction='mean'):
    return loss.mean() if reduction=='mean' else loss.sum() if reduction=='sum' else loss

class LabelSmoothingCrossEntropy(nn.Module):
    def __init__(self, pos_weights=pos_weights, epsilon:float=0.01, reduction='mean'):
        super().__init__()
        self.epsilon = epsilon
        self.reduction = reduction
        self.pos_weights = pos_weights
    
    def forward(self, preds, target):
        n = preds.size()[-1]
        loss = reduce_loss(-preds.sum(dim=-1), self.reduction)
        nll = F.binary_cross_entropy_with_logits(preds, target, pos_weight=torch.tensor(self.pos_weights).to("cuda:0"), reduction=self.reduction)
        return linear_combination(loss/n, nll, self.epsilon)

In [86]:
from torch import nn
import torch.nn.functional as F
import torch
from skorch import NeuralNet, dataset
from skorch.callbacks import ProgressBar, Checkpoint, EarlyStopping, EpochScoring, LRScheduler, LoadInitState
from skorch.helper import predefined_split

valid_ds = dataset.Dataset(X_test.astype(np.float32), y_test.astype(np.float32))

class Net(nn.Module):
    def __init__(self, input_features, output_features, bottleneck=512):
        super().__init__()
        self.layer1 = nn.Linear(input_features, bottleneck)
        self.dropout = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.5)
        self.layer2 = nn.Linear(bottleneck, bottleneck)
        self.output = nn.Linear(bottleneck, output_features)
        
    def forward(self, x):
        
        x = self.layer1(x)
        x = F.elu(x)
        #x = self.bn(x)
        x = self.dropout(x)
        
        x = self.layer2(x)
        x = F.elu(x)
        x = self.dropout2(x)
        #x = F.elu(x)
        
        x = self.output(x)
        
        return x

checkpoint = Checkpoint(dirname='exp4')
earlystopping = EarlyStopping(patience=20)
#load_state = LoadInitState(checkpoint)

net = NeuralNet(Net, 
                criterion=LabelSmoothingCrossEntropy, 
                lr=0.00005,
                predict_nonlinearity=torch.sigmoid,
                batch_size=32, 
                max_epochs=10000, 
                module__input_features=X_train.shape[1],
                module__output_features=len(mlb.classes_),
                optimizer=torch.optim.AdamW,
                #train_split=predefined_split(valid_ds),
                iterator_train__shuffle=True,
                iterator_train__num_workers=4,
                callbacks=[ProgressBar(),
                           EpochScoring(scorer, name="f1", lower_is_better=False),
                           checkpoint,
                           earlystopping,
                           #load_state,
                           #LRScheduler(monitor='val_loss'),
                          ],
                device='cuda'
               )

In [87]:
X_train.shape

(972, 8572)

In [94]:
net.fit(X_train[:600].astype(np.float32), y_train[:600].astype(np.float32))

Re-initializing module because the following parameters were re-set: input_features, output_features.
Re-initializing optimizer.


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

  epoch      f1    train_loss    valid_loss    cp     dur
-------  ------  ------------  ------------  ----  ------
      1  [36m0.4312[0m        [32m1.1886[0m        [35m1.1632[0m     +  0.5607


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      2  [36m0.5034[0m        [32m1.1830[0m        [35m1.1587[0m     +  0.5797


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      3  [36m0.5080[0m        [32m1.1764[0m        [35m1.1534[0m     +  0.5577


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      4  [36m0.5080[0m        [32m1.1682[0m        [35m1.1478[0m     +  0.5596


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      5  0.5040        [32m1.1606[0m        [35m1.1435[0m     +  0.5757


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      6  0.5040        [32m1.1548[0m        [35m1.1415[0m     +  0.5652


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      7  0.5080        [32m1.1519[0m        [35m1.1411[0m     +  0.5571


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      8  0.5080        [32m1.1485[0m        1.1421        0.5535


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

      9  0.5080        [32m1.1481[0m        1.1423        0.5562


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     10  0.5080        1.1485        1.1423        0.5569


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     11  0.5080        1.1481        1.1425        0.5566


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     12  0.5080        [32m1.1469[0m        1.1420        0.5594


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     13  0.5080        1.1474        1.1420        0.5608


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     14  0.5080        1.1485        1.1424        0.5677


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     15  0.5080        [32m1.1461[0m        1.1417        0.5390


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     16  0.5080        1.1488        1.1421        0.5576


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     17  0.5080        1.1480        1.1420        0.5672


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     18  0.5080        1.1474        1.1414        0.5598


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     19  0.5080        1.1469        1.1417        0.5539


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     20  0.5080        1.1476        [35m1.1410[0m     +  0.5689


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     21  0.5080        1.1467        1.1413        0.5503


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     22  0.5080        1.1476        1.1414        0.5729


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     23  0.5080        1.1480        1.1413        0.5535


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     24  0.5080        1.1464        1.1414        0.5676


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     25  0.5080        1.1471        1.1413        0.5754


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     26  0.5080        1.1462        [35m1.1408[0m     +  0.5649


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     27  0.5080        [32m1.1450[0m        1.1411        0.5589


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     28  0.5080        1.1455        [35m1.1407[0m     +  0.5594


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     29  0.5080        [32m1.1448[0m        1.1409        0.5746


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     30  0.5080        [32m1.1444[0m        1.1411        0.5773


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     31  0.5080        1.1451        [35m1.1402[0m     +  0.5584


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     32  0.5080        1.1454        1.1410        0.5396


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     33  0.5080        1.1454        [35m1.1401[0m     +  0.5360


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     34  0.5080        1.1449        [35m1.1399[0m     +  0.5510


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     35  0.5080        1.1470        1.1401        0.5537


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     36  0.5080        [32m1.1439[0m        1.1400        0.5581


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     37  0.5080        [32m1.1438[0m        [35m1.1399[0m     +  0.5766


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     38  0.5080        [32m1.1435[0m        1.1400        0.5804


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     39  0.5080        [32m1.1417[0m        1.1403        0.5743


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     40  0.5080        1.1439        1.1403        0.5395


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     41  0.5080        1.1437        [35m1.1388[0m     +  0.5330


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     42  0.5080        1.1435        [35m1.1383[0m     +  0.5428


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     43  0.5080        1.1427        1.1384        0.5439


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     44  0.5080        1.1425        1.1383        0.5453


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     45  0.5080        [32m1.1417[0m        1.1384        0.5732


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     46  0.5080        1.1422        [35m1.1380[0m     +  0.5545


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     47  0.5080        1.1420        [35m1.1379[0m     +  0.5362


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     48  [36m0.5081[0m        1.1425        [35m1.1375[0m     +  0.5754


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     49  [36m0.5081[0m        1.1422        1.1376        0.5403


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     50  0.5081        [32m1.1406[0m        [35m1.1374[0m     +  0.5329


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     51  0.5081        [32m1.1402[0m        [35m1.1371[0m     +  0.5437


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     52  [36m0.5082[0m        1.1411        [35m1.1365[0m     +  0.5362


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     53  [36m0.5082[0m        1.1408        [35m1.1358[0m     +  0.5489


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     54  0.5082        [32m1.1392[0m        [35m1.1356[0m     +  0.5465


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     55  0.5082        1.1400        1.1360        0.5538


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     56  [36m0.5082[0m        [32m1.1381[0m        [35m1.1353[0m     +  0.5534


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     57  0.5082        1.1386        1.1355        0.5529


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     58  0.5082        1.1384        [35m1.1350[0m     +  0.5645


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     59  [36m0.5084[0m        [32m1.1364[0m        [35m1.1344[0m     +  0.5586


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     60  [36m0.5085[0m        1.1368        [35m1.1342[0m     +  0.5584


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     61  [36m0.5085[0m        [32m1.1360[0m        [35m1.1334[0m     +  0.5601


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     62  [36m0.5089[0m        [32m1.1358[0m        [35m1.1330[0m     +  0.5780


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     63  [36m0.5090[0m        [32m1.1345[0m        [35m1.1324[0m     +  0.5574


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     64  [36m0.5092[0m        [32m1.1321[0m        [35m1.1322[0m     +  0.5480


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     65  0.5090        1.1334        1.1323        0.5481


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     66  [36m0.5095[0m        1.1335        [35m1.1307[0m     +  0.5609


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     67  [36m0.5099[0m        1.1337        [35m1.1302[0m     +  0.5622


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     68  0.5094        1.1333        1.1302        0.5393


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     69  0.5098        [32m1.1301[0m        [35m1.1292[0m     +  0.5586


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     70  0.5098        1.1304        [35m1.1287[0m     +  0.5531


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     71  [36m0.5101[0m        1.1311        [35m1.1279[0m     +  0.5393


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     72  [36m0.5103[0m        1.1304        [35m1.1276[0m     +  0.5492


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     73  [36m0.5115[0m        1.1309        [35m1.1263[0m     +  0.5540


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     74  0.5103        [32m1.1276[0m        1.1267        0.5898


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     75  0.5108        [32m1.1257[0m        [35m1.1259[0m     +  0.5622


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     76  0.5113        1.1272        [35m1.1241[0m     +  0.5730


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     77  [36m0.5130[0m        1.1258        [35m1.1229[0m     +  0.5543


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     78  0.5129        [32m1.1254[0m        1.1233        0.5619


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     79  [36m0.5132[0m        [32m1.1240[0m        [35m1.1229[0m     +  0.5812


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     80  [36m0.5142[0m        1.1240        [35m1.1208[0m     +  0.5827


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     81  [36m0.5147[0m        [32m1.1207[0m        [35m1.1205[0m     +  0.5931


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     82  0.5141        [32m1.1197[0m        [35m1.1204[0m     +  0.5911


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     83  [36m0.5157[0m        [32m1.1192[0m        [35m1.1195[0m     +  0.5673


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     84  [36m0.5167[0m        [32m1.1156[0m        [35m1.1177[0m     +  0.5815


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     85  [36m0.5172[0m        1.1183        [35m1.1174[0m     +  0.5888


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     86  [36m0.5186[0m        1.1170        [35m1.1146[0m     +  0.5654


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     87  0.5184        [32m1.1144[0m        [35m1.1145[0m     +  0.5498


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     88  [36m0.5187[0m        [32m1.1134[0m        [35m1.1140[0m     +  0.5542


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     89  [36m0.5194[0m        1.1134        [35m1.1133[0m     +  0.5706


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     90  [36m0.5205[0m        [32m1.1110[0m        [35m1.1117[0m     +  0.5389


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     91  [36m0.5209[0m        [32m1.1096[0m        [35m1.1105[0m     +  0.5578


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     92  [36m0.5217[0m        [32m1.1054[0m        [35m1.1089[0m     +  0.5474


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     93  [36m0.5233[0m        1.1056        [35m1.1071[0m     +  0.5621


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     94  0.5232        1.1068        [35m1.1068[0m     +  0.5742


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     95  [36m0.5247[0m        [32m1.1043[0m        1.1068        0.5543


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     96  0.5237        [32m1.1004[0m        [35m1.1053[0m     +  0.5481


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     97  0.5244        [32m1.1004[0m        [35m1.1037[0m     +  0.5752


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     98  0.5243        [32m1.1001[0m        [35m1.1017[0m     +  0.5403


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

     99  [36m0.5255[0m        [32m1.0976[0m        [35m1.1001[0m     +  0.5485


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    100  0.5252        [32m1.0959[0m        1.1011        0.5463


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    101  [36m0.5266[0m        1.0977        [35m1.0983[0m     +  0.5457


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    102  [36m0.5281[0m        [32m1.0938[0m        [35m1.0972[0m     +  0.5996


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    103  0.5277        [32m1.0924[0m        1.0991        0.5493


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    104  [36m0.5283[0m        [32m1.0897[0m        [35m1.0963[0m     +  0.5660


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    105  [36m0.5294[0m        [32m1.0860[0m        [35m1.0956[0m     +  0.5583


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    106  [36m0.5302[0m        [32m1.0857[0m        [35m1.0933[0m     +  0.5432


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    107  [36m0.5307[0m        [32m1.0823[0m        [35m1.0917[0m     +  0.5499


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    108  0.5305        1.0847        [35m1.0915[0m     +  0.5484


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    109  [36m0.5322[0m        [32m1.0796[0m        [35m1.0907[0m     +  0.5564


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    110  0.5303        1.0805        [35m1.0905[0m     +  0.5624


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    111  0.5321        [32m1.0785[0m        [35m1.0892[0m     +  0.5867


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    112  [36m0.5327[0m        [32m1.0783[0m        [35m1.0872[0m     +  0.5789


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    113  [36m0.5331[0m        1.0787        [35m1.0872[0m     +  0.5584


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    114  0.5321        [32m1.0728[0m        [35m1.0862[0m     +  0.5858


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    115  0.5293        [32m1.0715[0m        1.0874        0.5630


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    116  0.5323        1.0722        [35m1.0848[0m     +  0.5491


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    117  0.5313        [32m1.0694[0m        [35m1.0844[0m     +  0.5595


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    118  0.5318        [32m1.0686[0m        [35m1.0825[0m     +  0.5604


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    119  0.5323        [32m1.0649[0m        [35m1.0816[0m     +  0.5663


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    120  0.5329        [32m1.0640[0m        1.0831        0.5682


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    121  0.5326        [32m1.0606[0m        [35m1.0798[0m     +  0.5634


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    122  0.5323        [32m1.0597[0m        [35m1.0785[0m     +  0.6010


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    123  0.5290        [32m1.0567[0m        1.0800        0.5584


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    124  [36m0.5333[0m        [32m1.0540[0m        [35m1.0781[0m     +  0.5598


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    125  0.5313        [32m1.0521[0m        [35m1.0778[0m     +  0.5627


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    126  0.5329        1.0567        [35m1.0757[0m     +  0.5477


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    127  0.5320        [32m1.0517[0m        1.0760        0.5416


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    128  0.5306        [32m1.0492[0m        1.0766        0.5499


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    129  [36m0.5347[0m        [32m1.0477[0m        [35m1.0742[0m     +  0.5468


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    130  0.5318        [32m1.0447[0m        1.0767        0.5478


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    131  0.5345        1.0474        [35m1.0721[0m     +  0.5550


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    132  0.5319        [32m1.0407[0m        1.0740        0.5536


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    133  0.5329        1.0424        1.0737        0.5535


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    134  0.5336        1.0465        [35m1.0711[0m     +  0.5546


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    135  0.5324        [32m1.0393[0m        1.0724        0.5612


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    136  0.5336        [32m1.0377[0m        1.0718        0.5678


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    137  0.5312        [32m1.0339[0m        1.0724        0.5470


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    138  0.5340        1.0380        [35m1.0674[0m     +  0.5496


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    139  0.5325        1.0350        1.0732        0.5542


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    140  0.5320        [32m1.0327[0m        1.0720        0.5529


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    141  0.5307        [32m1.0273[0m        1.0679        0.5424


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    142  0.5323        1.0315        [35m1.0665[0m     +  0.5565


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    143  0.5314        [32m1.0247[0m        1.0697        0.5546


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    144  0.5313        1.0258        1.0688        0.5519


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    145  0.5300        [32m1.0223[0m        1.0689        0.5628


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    146  0.5297        [32m1.0197[0m        [35m1.0650[0m     +  0.5592


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    147  0.5303        1.0223        1.0722        0.5488


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    148  0.5303        [32m1.0163[0m        1.0668        0.5560


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    149  0.5309        1.0193        1.0656        0.5508


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    150  0.5334        [32m1.0100[0m        1.0688        0.5663


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    151  0.5308        1.0119        1.0684        0.5468


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    152  0.5309        [32m1.0080[0m        1.0669        0.5526


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    153  0.5284        1.0148        1.0683        0.5611


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    154  0.5290        [32m1.0049[0m        [35m1.0638[0m     +  0.5448


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    155  0.5287        [32m1.0038[0m        1.0668        0.5538


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    156  0.5314        [32m1.0021[0m        1.0666        0.5597


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    157  0.5295        [32m0.9996[0m        1.0657        0.5554


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    158  0.5290        [32m0.9974[0m        1.0684        0.5474


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    159  0.5305        [32m0.9964[0m        1.0639        0.5449


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    160  0.5281        [32m0.9950[0m        1.0668        0.5515


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    161  0.5284        [32m0.9934[0m        1.0652        0.5506


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    162  0.5294        [32m0.9890[0m        1.0663        0.5835


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    163  0.5306        0.9931        1.0672        0.5660


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    164  0.5291        0.9924        1.0683        0.5789


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    165  0.5299        [32m0.9796[0m        1.0642        0.5672


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    166  0.5270        0.9834        1.0654        0.5472


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    167  0.5261        0.9825        1.0707        0.5585


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    168  0.5295        0.9815        1.0646        0.5604


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    169  0.5272        0.9822        1.0686        0.5408


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    170  0.5270        [32m0.9792[0m        1.0679        0.5711


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    171  0.5265        [32m0.9750[0m        1.0702        0.5577


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    172  0.5272        0.9795        1.0685        0.5752


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

    173  0.5273        [32m0.9731[0m        1.0680        0.5858


HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))

Stopping since valid_loss has not improved in the last 20 epochs.


<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=8572, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)

In [95]:
from sklearn import metrics
predictions = (net.predict(X_test.astype(np.float32)) >=0.5).astype(np.long)

print(f"Hamming loss: {metrics.hamming_loss(y_test,predictions)}")
print(metrics.label_ranking_loss(y_test,predictions))
print(metrics.classification_report(y_test, predictions, target_names=mlb.classes_))

print(model_name)
print(pipeline)
print(net)
print(net.get_params())

Hamming loss: 0.4878413767302656
0.6883439099590769
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.36      0.92      0.51        84
           Analysis and Modeling: 3D Modeling       0.31      0.89      0.45        71
                            Anatomical Target       0.69      0.95      0.80       164
           Anatomical Target: Lower Extremity       0.47      0.93      0.62       113
     Anatomical Target: Lower Extremity - Hip       0.21      0.53      0.30        40
    Anatomical Target: Lower Extremity - Knee       0.34      0.93      0.50        82
                     Anatomical Target: Torso       0.15      0.43      0.23        35
             Anatomical Target: Torso - Spine       0.09      0.38      0.14        21
           Anatomical Target: Upper Extremity       0.17      0.48      0.25        31
Anatomical Target: Upper Extremity - Shoulder       0.12      0.39      0.19 

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
N=250
Hamming loss: 0.7184811073699963
0.9958847736625515

In [None]:
N=400
Hamming loss: 0.7184811073699963
0.9958847736625515

In [None]:
N=100
Hamming loss: 0.5875420875420876
0.7901547932844298
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.35      1.00      0.51        84
           Analysis and Modeling: 3D Modeling       0.29      1.00      0.45        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.00      0.00      0.00        40
    Anatomical Target: Lower Extremity - Knee       0.34      1.00      0.50        82
                     Anatomical Target: Torso       0.00      0.00      0.00        35
             Anatomical Target: Torso - Spine       0.00      0.00      0.00        21
           Anatomical Target: Upper Extremity       0.13      1.00      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.09      1.00      0.17        23
                                      Imaging       0.55      1.00      0.71       133
                                  Imaging: CT       0.24      1.00      0.39        59
                                 Imaging: MRI       0.24      1.00      0.39        59
                          Imaging: Ultrasound       0.13      1.00      0.23        32
                                Manufacturing       0.34      1.00      0.51        83
        Manufacturing: Additive Manufacturing       0.16      1.00      0.27        38
           Personalized Product: Guide or Jig       0.49      1.00      0.66       120
                Personalized Product: Implant       0.51      1.00      0.68       124
                         Specification of Use       0.33      1.00      0.49        79
                Specification of Use: Disease       0.12      1.00      0.22        30
      Specification of Use: Joint Replacement       0.18      1.00      0.31        44
                              Surgical Method       0.00      0.00      0.00        40

                                    micro avg       0.31      0.91      0.47      1505
                                    macro avg       0.26      0.82      0.37      1505
                                 weighted avg       0.36      0.91      0.50      1505
                                  samples avg       0.31      0.91      0.45      1505

bertForPatents/
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fbdbdddc430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fbdbdddc430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=8572, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 5e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fbc5f11cdc0>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fbbc112ae50>, <skorch.callbacks.scoring.EpochScoring object at 0x7fbbc111da30>, <skorch.callbacks.training.Checkpoint object at 0x7fbc93feab20>, <skorch.callbacks.training.EarlyStopping object at 0x7fbba17d26a0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fbc8d3b55a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 8572, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fbba164fb50>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fbba13fca30>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fbba13fce50>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fbbc112ae50>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fbbc111da30>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fbc5f07e160>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fbc5f1219d0>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fbc93feab20>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp4', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fbc5f1203a0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fbba17d26a0>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fbba13fc2e0>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}
/home/martin/anaconda3/envs/phenetics2/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/home/martin/anaconda3/envs/phenetics2/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

In [None]:
Hamming loss: 0.5044893378226711
0.6950069237607938
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.36      0.93      0.52        84
           Analysis and Modeling: 3D Modeling       0.31      0.93      0.47        71
                            Anatomical Target       0.68      0.95      0.80       164
           Anatomical Target: Lower Extremity       0.48      0.96      0.64       113
     Anatomical Target: Lower Extremity - Hip       0.20      0.75      0.32        40
    Anatomical Target: Lower Extremity - Knee       0.35      0.91      0.51        82
                     Anatomical Target: Torso       0.14      0.57      0.23        35
             Anatomical Target: Torso - Spine       0.08      0.38      0.13        21
           Anatomical Target: Upper Extremity       0.17      0.58      0.26        31
Anatomical Target: Upper Extremity - Shoulder       0.11      0.43      0.18        23
                                      Imaging       0.57      0.98      0.72       133
                                  Imaging: CT       0.28      0.90      0.43        59
                                 Imaging: MRI       0.28      0.88      0.42        59
                          Imaging: Ultrasound       0.13      0.31      0.18        32
                                Manufacturing       0.35      0.90      0.51        83
        Manufacturing: Additive Manufacturing       0.18      0.71      0.29        38
           Personalized Product: Guide or Jig       0.49      0.90      0.64       120
                Personalized Product: Implant       0.52      0.98      0.68       124
                         Specification of Use       0.33      0.95      0.49        79
                Specification of Use: Disease       0.12      0.30      0.17        30
      Specification of Use: Joint Replacement       0.19      0.95      0.31        44
                              Surgical Method       0.23      0.45      0.31        40

                                    micro avg       0.34      0.86      0.49      1505
                                    macro avg       0.30      0.76      0.42      1505
                                 weighted avg       0.39      0.86      0.53      1505
                                  samples avg       0.36      0.86      0.48      1505

bertForPatents/
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fbdbdddc430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fbdbdddc430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=8572, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (dropout2): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 5e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fbc5f11cdc0>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fbba0ef6a60>, <skorch.callbacks.scoring.EpochScoring object at 0x7fbbc09913d0>, <skorch.callbacks.training.Checkpoint object at 0x7fbbc1354fd0>, <skorch.callbacks.training.EarlyStopping object at 0x7fbbc09910a0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fbc8d3b55a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 8572, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fbba16d6730>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fbbc12b8370>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fbba163df40>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fbba0ef6a60>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fbbc09913d0>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fbc5f07e160>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fbc5f1219d0>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fbbc1354fd0>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp3', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fbc5f1203a0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fbbc09910a0>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fbba163ddf0>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}

In [None]:
Best so far

Hamming loss: 0.5422745978301534
0.7398683245697636
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.35      0.96      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      0.94      0.46        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.64       113
     Anatomical Target: Lower Extremity - Hip       0.18      0.78      0.30        40
    Anatomical Target: Lower Extremity - Knee       0.34      0.96      0.51        82
                     Anatomical Target: Torso       0.14      0.63      0.24        35
             Anatomical Target: Torso - Spine       0.09      0.43      0.15        21
           Anatomical Target: Upper Extremity       0.14      0.58      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.12      0.57      0.20        23
                                      Imaging       0.56      0.99      0.72       133
                                  Imaging: CT       0.26      0.97      0.41        59
                                 Imaging: MRI       0.26      0.95      0.41        59
                          Imaging: Ultrasound       0.16      0.59      0.25        32
                                Manufacturing       0.35      0.93      0.51        83
        Manufacturing: Additive Manufacturing       0.21      0.79      0.33        38
           Personalized Product: Guide or Jig       0.50      0.98      0.66       120
                Personalized Product: Implant       0.52      0.99      0.68       124
                         Specification of Use       0.34      0.97      0.51        79
                Specification of Use: Disease       0.13      0.50      0.20        30
      Specification of Use: Joint Replacement       0.18      0.93      0.30        44
                              Surgical Method       0.23      0.68      0.35        40

                                    micro avg       0.33      0.91      0.49      1505
                                    macro avg       0.30      0.82      0.43      1505
                                 weighted avg       0.39      0.91      0.53      1505
                                  samples avg       0.34      0.92      0.47      1505

distilroberta-base-msmarco-v2
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=6012, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 1e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fc7d033f460>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fc7d0381100>, <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d0304370>, <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, <skorch.callbacks.training.EarlyStopping object at 0x7fc7d0381130>, <skorch.callbacks.training.LoadInitState object at 0x7fc7d03810d0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fc88c1305a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 6012, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fc7c0e793d0>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc7c1e4fa60>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc7c1e4f190>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fc7d0381100>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d0304370>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fc7d172ef70>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fc7d0383550>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp2', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fc7d0381130>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__LoadInitState': <skorch.callbacks.training.LoadInitState object at 0x7fc7d03810d0>, 'callbacks__LoadInitState__checkpoint__monitor': 'valid_loss_best', 'callbacks__LoadInitState__checkpoint__f_params': 'params.pt', 'callbacks__LoadInitState__checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__LoadInitState__checkpoint__f_criterion': 'criterion.pt', 'callbacks__LoadInitState__checkpoint__f_history': 'history.json', 'callbacks__LoadInitState__checkpoint__f_pickle': None, 'callbacks__LoadInitState__checkpoint__fn_prefix': '', 'callbacks__LoadInitState__checkpoint__dirname': 'exp2', 'callbacks__LoadInitState__checkpoint__event_name': 'event_cp', 'callbacks__LoadInitState__checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__LoadInitState__checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fc7c1e4f460>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}
/home/martin/anaconda3/envs/phenetics2/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

In [None]:
Hamming loss: 0.5744481855592967
0.7836308184196901
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.36      0.96      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      0.96      0.46        71
                            Anatomical Target       0.68      0.99      0.80       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.19      0.85      0.31        40
    Anatomical Target: Lower Extremity - Knee       0.33      0.94      0.49        82
                     Anatomical Target: Torso       0.15      0.77      0.25        35
             Anatomical Target: Torso - Spine       0.09      0.67      0.15        21
           Anatomical Target: Upper Extremity       0.15      0.81      0.25        31
Anatomical Target: Upper Extremity - Shoulder       0.11      0.78      0.19        23
                                      Imaging       0.56      0.98      0.71       133
                                  Imaging: CT       0.26      0.98      0.41        59
                                 Imaging: MRI       0.25      0.95      0.39        59
                          Imaging: Ultrasound       0.15      0.47      0.23        32
                                Manufacturing       0.35      0.93      0.50        83
        Manufacturing: Additive Manufacturing       0.17      0.87      0.28        38
           Personalized Product: Guide or Jig       0.51      0.96      0.66       120
                Personalized Product: Implant       0.52      0.98      0.68       124
                         Specification of Use       0.33      0.96      0.49        79
                Specification of Use: Disease       0.13      0.33      0.18        30
      Specification of Use: Joint Replacement       0.19      0.98      0.32        44
                              Surgical Method       0.23      0.53      0.32        40

                                    micro avg       0.32      0.91      0.47      1505
                                    macro avg       0.29      0.85      0.42      1505
                                 weighted avg       0.39      0.91      0.53      1505
                                  samples avg       0.34      0.92      0.46      1505

/home/martin/patentmark/patentmark-charting-app/vectors/patent-electra-v4-patent-mnr-cl/
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=6012, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 5e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fc7d033f460>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fc7d16d18e0>, <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d16d1610>, <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, <skorch.callbacks.training.EarlyStopping object at 0x7fc7d16d17f0>, <skorch.callbacks.training.LoadInitState object at 0x7fc88ca963a0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fc88c1305a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 6012, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fc874d5e2b0>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc79ee405b0>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc874db3d30>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fc7d16d18e0>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d16d1610>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fc85dcba310>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fc7d0383550>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp2', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fc7d16d17f0>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__LoadInitState': <skorch.callbacks.training.LoadInitState object at 0x7fc88ca963a0>, 'callbacks__LoadInitState__checkpoint__monitor': 'valid_loss_best', 'callbacks__LoadInitState__checkpoint__f_params': 'params.pt', 'callbacks__LoadInitState__checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__LoadInitState__checkpoint__f_criterion': 'criterion.pt', 'callbacks__LoadInitState__checkpoint__f_history': 'history.json', 'callbacks__LoadInitState__checkpoint__f_pickle': None, 'callbacks__LoadInitState__checkpoint__fn_prefix': '', 'callbacks__LoadInitState__checkpoint__dirname': 'exp2', 'callbacks__LoadInitState__checkpoint__event_name': 'event_cp', 'callbacks__LoadInitState__checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__LoadInitState__checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fc7d03678b0>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}
/home/martin/anaconda3/envs/phenetics2/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

In [None]:


Hamming loss: 0.522633744855967
0.7151155295024978
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.36      0.94      0.52        84
           Analysis and Modeling: 3D Modeling       0.29      0.89      0.44        71
                            Anatomical Target       0.68      0.96      0.80       164
           Anatomical Target: Lower Extremity       0.48      0.97      0.65       113
     Anatomical Target: Lower Extremity - Hip       0.19      0.82      0.31        40
    Anatomical Target: Lower Extremity - Knee       0.35      0.91      0.50        82
                     Anatomical Target: Torso       0.16      0.69      0.26        35
             Anatomical Target: Torso - Spine       0.08      0.48      0.14        21
           Anatomical Target: Upper Extremity       0.16      0.68      0.25        31
Anatomical Target: Upper Extremity - Shoulder       0.12      0.61      0.20        23
                                      Imaging       0.56      0.98      0.71       133
                                  Imaging: CT       0.28      0.97      0.43        59
                                 Imaging: MRI       0.27      0.93      0.41        59
                          Imaging: Ultrasound       0.11      0.31      0.17        32
                                Manufacturing       0.35      0.92      0.51        83
        Manufacturing: Additive Manufacturing       0.19      0.74      0.31        38
           Personalized Product: Guide or Jig       0.51      0.93      0.66       120
                Personalized Product: Implant       0.52      0.99      0.68       124
                         Specification of Use       0.33      0.95      0.49        79
                Specification of Use: Disease       0.14      0.37      0.20        30
      Specification of Use: Joint Replacement       0.19      0.91      0.31        44
                              Surgical Method       0.27      0.55      0.36        40

                                    micro avg       0.34      0.88      0.49      1505
                                    macro avg       0.30      0.79      0.42      1505
                                 weighted avg       0.39      0.88      0.53      1505
                                  samples avg       0.35      0.89      0.48      1505
bertForPatents/
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fd77b563d30>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fd77b563d30>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=8572, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (bn): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)


In [None]:
Hamming loss: 0.5744481855592967
0.7836308184196901
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.36      0.96      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      0.96      0.46        71
                            Anatomical Target       0.68      0.99      0.80       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.19      0.85      0.31        40
    Anatomical Target: Lower Extremity - Knee       0.33      0.94      0.49        82
                     Anatomical Target: Torso       0.15      0.77      0.25        35
             Anatomical Target: Torso - Spine       0.09      0.67      0.15        21
           Anatomical Target: Upper Extremity       0.15      0.81      0.25        31
Anatomical Target: Upper Extremity - Shoulder       0.11      0.78      0.19        23
                                      Imaging       0.56      0.98      0.71       133
                                  Imaging: CT       0.26      0.98      0.41        59
                                 Imaging: MRI       0.25      0.95      0.39        59
                          Imaging: Ultrasound       0.15      0.47      0.23        32
                                Manufacturing       0.35      0.93      0.50        83
        Manufacturing: Additive Manufacturing       0.17      0.87      0.28        38
           Personalized Product: Guide or Jig       0.51      0.96      0.66       120
                Personalized Product: Implant       0.52      0.98      0.68       124
                         Specification of Use       0.33      0.96      0.49        79
                Specification of Use: Disease       0.13      0.33      0.18        30
      Specification of Use: Joint Replacement       0.19      0.98      0.32        44
                              Surgical Method       0.23      0.53      0.32        40

                                    micro avg       0.32      0.91      0.47      1505
                                    macro avg       0.29      0.85      0.42      1505
                                 weighted avg       0.39      0.91      0.53      1505
                                  samples avg       0.34      0.92      0.46      1505

/home/martin/patentmark/patentmark-charting-app/vectors/patent-electra-v4-patent-mnr-cl/
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=6012, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 5e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fc7d033f460>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fc7d16d18e0>, <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d16d1610>, <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, <skorch.callbacks.training.EarlyStopping object at 0x7fc7d16d17f0>, <skorch.callbacks.training.LoadInitState object at 0x7fc88ca963a0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fc88c1305a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 6012, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fc874d5e2b0>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc79ee405b0>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc874db3d30>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fc7d16d18e0>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d16d1610>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fc85dcba310>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fc7d0383550>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp2', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fc7d16d17f0>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__LoadInitState': <skorch.callbacks.training.LoadInitState object at 0x7fc88ca963a0>, 'callbacks__LoadInitState__checkpoint__monitor': 'valid_loss_best', 'callbacks__LoadInitState__checkpoint__f_params': 'params.pt', 'callbacks__LoadInitState__checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__LoadInitState__checkpoint__f_criterion': 'criterion.pt', 'callbacks__LoadInitState__checkpoint__f_history': 'history.json', 'callbacks__LoadInitState__checkpoint__f_pickle': None, 'callbacks__LoadInitState__checkpoint__fn_prefix': '', 'callbacks__LoadInitState__checkpoint__dirname': 'exp2', 'callbacks__LoadInitState__checkpoint__event_name': 'event_cp', 'callbacks__LoadInitState__checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__LoadInitState__checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc874db38e0>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fc7d03678b0>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}

In [None]:
Hamming loss: 0.5422745978301534
0.7398683245697636
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.35      0.96      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      0.94      0.46        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.64       113
     Anatomical Target: Lower Extremity - Hip       0.18      0.78      0.30        40
    Anatomical Target: Lower Extremity - Knee       0.34      0.96      0.51        82
                     Anatomical Target: Torso       0.14      0.63      0.24        35
             Anatomical Target: Torso - Spine       0.09      0.43      0.15        21
           Anatomical Target: Upper Extremity       0.14      0.58      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.12      0.57      0.20        23
                                      Imaging       0.56      0.99      0.72       133
                                  Imaging: CT       0.26      0.97      0.41        59
                                 Imaging: MRI       0.26      0.95      0.41        59
                          Imaging: Ultrasound       0.16      0.59      0.25        32
                                Manufacturing       0.35      0.93      0.51        83
        Manufacturing: Additive Manufacturing       0.21      0.79      0.33        38
           Personalized Product: Guide or Jig       0.50      0.98      0.66       120
                Personalized Product: Implant       0.52      0.99      0.68       124
                         Specification of Use       0.34      0.97      0.51        79
                Specification of Use: Disease       0.13      0.50      0.20        30
      Specification of Use: Joint Replacement       0.18      0.93      0.30        44
                              Surgical Method       0.23      0.68      0.35        40

                                    micro avg       0.33      0.91      0.49      1505
                                    macro avg       0.30      0.82      0.43      1505
                                 weighted avg       0.39      0.91      0.53      1505
                                  samples avg       0.34      0.92      0.47      1505

distilroberta-base-msmarco-v2
Pipeline(steps=[('columns',
                 ColumnTransformer(n_jobs=1,
                                   transformers=[('cited_by',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'cited_by'),
                                                 ('inventors',
                                                  CountVectorizer(analyzer=<function iden at 0x7fc9bcb59430>,
                                                                  binary=True,
                                                                  min_df=2),
                                                  'inventors'),
                                                 ('embedded_cpc',
                                                  VectorPassthrough(),
                                                  'embedded_cpc'),
                                                 ('embedding_v1',
                                                  VectorPassthrough(),
                                                  'embedding_v1'),
                                                 ('transformer_abstract',
                                                  TransformerFeatures(),
                                                  'abstract'),
                                                 ('transformer_claims',
                                                  TransformerFeatures(),
                                                  'claims')])),
                ('norm', Normalizer())])
<class 'skorch.net.NeuralNet'>[initialized](
  module_=Net(
    (layer1): Linear(in_features=6012, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (layer2): Linear(in_features=512, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=22, bias=True)
  ),
)
{'module': <class '__main__.Net'>, 'criterion': <class '__main__.LabelSmoothingCrossEntropy'>, 'optimizer': <class 'torch.optim.adamw.AdamW'>, 'lr': 1e-05, 'max_epochs': 10000, 'batch_size': 32, 'iterator_train': <class 'torch.utils.data.dataloader.DataLoader'>, 'iterator_valid': <class 'torch.utils.data.dataloader.DataLoader'>, 'dataset': <class 'skorch.dataset.Dataset'>, 'train_split': <skorch.dataset.CVSplit object at 0x7fc7d033f460>, 'callbacks': [<skorch.callbacks.logging.ProgressBar object at 0x7fc7d0381100>, <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d0304370>, <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, <skorch.callbacks.training.EarlyStopping object at 0x7fc7d0381130>, <skorch.callbacks.training.LoadInitState object at 0x7fc7d03810d0>], 'predict_nonlinearity': <built-in method sigmoid of type object at 0x7fc88c1305a0>, 'warm_start': False, 'verbose': 1, 'device': 'cuda', 'module__input_features': 6012, 'module__output_features': 22, 'iterator_train__shuffle': True, 'iterator_train__num_workers': 4, 'callbacks__epoch_timer': <skorch.callbacks.logging.EpochTimer object at 0x7fc7c0e793d0>, 'callbacks__train_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc7c1e4fa60>, 'callbacks__train_loss__name': 'train_loss', 'callbacks__train_loss__lower_is_better': True, 'callbacks__train_loss__on_train': True, 'callbacks__valid_loss': <skorch.callbacks.scoring.PassthroughScoring object at 0x7fc7c1e4f190>, 'callbacks__valid_loss__name': 'valid_loss', 'callbacks__valid_loss__lower_is_better': True, 'callbacks__valid_loss__on_train': False, 'callbacks__ProgressBar': <skorch.callbacks.logging.ProgressBar object at 0x7fc7d0381100>, 'callbacks__ProgressBar__batches_per_epoch': 'auto', 'callbacks__ProgressBar__detect_notebook': True, 'callbacks__ProgressBar__postfix_keys': ['train_loss', 'valid_loss'], 'callbacks__EpochScoring': <skorch.callbacks.scoring.EpochScoring object at 0x7fc7d0304370>, 'callbacks__EpochScoring__scoring': <function scorer at 0x7fc7d172ef70>, 'callbacks__EpochScoring__lower_is_better': False, 'callbacks__EpochScoring__on_train': False, 'callbacks__EpochScoring__name': 'f1', 'callbacks__EpochScoring__target_extractor': <function to_numpy at 0x7fc7d0383550>, 'callbacks__EpochScoring__use_caching': True, 'callbacks__Checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, 'callbacks__Checkpoint__monitor': 'valid_loss_best', 'callbacks__Checkpoint__f_params': 'params.pt', 'callbacks__Checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__Checkpoint__f_criterion': 'criterion.pt', 'callbacks__Checkpoint__f_history': 'history.json', 'callbacks__Checkpoint__f_pickle': None, 'callbacks__Checkpoint__fn_prefix': '', 'callbacks__Checkpoint__dirname': 'exp2', 'callbacks__Checkpoint__event_name': 'event_cp', 'callbacks__Checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__EarlyStopping': <skorch.callbacks.training.EarlyStopping object at 0x7fc7d0381130>, 'callbacks__EarlyStopping__monitor': 'valid_loss', 'callbacks__EarlyStopping__lower_is_better': True, 'callbacks__EarlyStopping__patience': 20, 'callbacks__EarlyStopping__threshold': 0.0001, 'callbacks__EarlyStopping__threshold_mode': 'rel', 'callbacks__EarlyStopping__sink': <built-in function print>, 'callbacks__LoadInitState': <skorch.callbacks.training.LoadInitState object at 0x7fc7d03810d0>, 'callbacks__LoadInitState__checkpoint__monitor': 'valid_loss_best', 'callbacks__LoadInitState__checkpoint__f_params': 'params.pt', 'callbacks__LoadInitState__checkpoint__f_optimizer': 'optimizer.pt', 'callbacks__LoadInitState__checkpoint__f_criterion': 'criterion.pt', 'callbacks__LoadInitState__checkpoint__f_history': 'history.json', 'callbacks__LoadInitState__checkpoint__f_pickle': None, 'callbacks__LoadInitState__checkpoint__fn_prefix': '', 'callbacks__LoadInitState__checkpoint__dirname': 'exp2', 'callbacks__LoadInitState__checkpoint__event_name': 'event_cp', 'callbacks__LoadInitState__checkpoint__sink': <function noop at 0x7fc7d0383ee0>, 'callbacks__LoadInitState__checkpoint': <skorch.callbacks.training.Checkpoint object at 0x7fc7d0381040>, 'callbacks__print_log': <skorch.callbacks.logging.PrintLog object at 0x7fc7c1e4f460>, 'callbacks__print_log__keys_ignored': None, 'callbacks__print_log__sink': <built-in function print>, 'callbacks__print_log__tablefmt': 'simple', 'callbacks__print_log__floatfmt': '.4f', 'callbacks__print_log__stralign': 'right'}

In [None]:
Using BertForPatents, dropout =0.5, bottleneck = 512, pre normalization
Hamming loss: 0.6380471380471381
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.35      1.00      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      1.00      0.46        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.18      0.97      0.30        40
    Anatomical Target: Lower Extremity - Knee       0.34      1.00      0.50        82
                     Anatomical Target: Torso       0.15      1.00      0.27        35
             Anatomical Target: Torso - Spine       0.09      1.00      0.17        21
           Anatomical Target: Upper Extremity       0.13      0.87      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.10      0.87      0.19        23
                                      Imaging       0.55      1.00      0.71       133
                                  Imaging: CT       0.25      1.00      0.40        59
                                 Imaging: MRI       0.25      1.00      0.40        59
                          Imaging: Ultrasound       0.14      0.53      0.22        32
                                Manufacturing       0.34      1.00      0.51        83
        Manufacturing: Additive Manufacturing       0.16      0.95      0.28        38
           Personalized Product: Guide or Jig       0.49      1.00      0.66       120
                Personalized Product: Implant       0.51      1.00      0.68       124
                         Specification of Use       0.33      1.00      0.49        79
                Specification of Use: Disease       0.12      0.50      0.19        30
      Specification of Use: Joint Replacement       0.18      1.00      0.31        44
                              Surgical Method       0.20      0.72      0.32        40

                                    micro avg       0.30      0.97      0.46      1505
                                    macro avg       0.29      0.93      0.42      1505
                                 weighted avg       0.38      0.97      0.53      1505
                                  samples avg       0.31      0.97      0.45      1505

In [None]:
Abstract + Claims + CPC Codes

Hamming loss: 0.7010849233071456
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.35      1.00      0.52        84
           Analysis and Modeling: 3D Modeling       0.30      1.00      0.46        71
                            Anatomical Target       0.67      1.00      0.81       164
           Anatomical Target: Lower Extremity       0.47      1.00      0.63       113
     Anatomical Target: Lower Extremity - Hip       0.17      1.00      0.29        40
    Anatomical Target: Lower Extremity - Knee       0.34      1.00      0.50        82
                     Anatomical Target: Torso       0.15      1.00      0.26        35
             Anatomical Target: Torso - Spine       0.09      1.00      0.16        21
           Anatomical Target: Upper Extremity       0.13      1.00      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.10      1.00      0.18        23
                                      Imaging       0.56      1.00      0.72       133
                                  Imaging: CT       0.25      1.00      0.40        59
                                 Imaging: MRI       0.25      1.00      0.40        59
                          Imaging: Ultrasound       0.14      1.00      0.24        32
                                Manufacturing       0.35      1.00      0.51        83
        Manufacturing: Additive Manufacturing       0.16      1.00      0.28        38
           Personalized Product: Guide or Jig       0.49      1.00      0.66       120
                Personalized Product: Implant       0.52      1.00      0.69       124
                         Specification of Use       0.33      1.00      0.50        79
                Specification of Use: Disease       0.13      1.00      0.22        30
      Specification of Use: Joint Replacement       0.19      1.00      0.31        44
                              Surgical Method       0.16      1.00      0.28        40

                                    micro avg       0.29      1.00      0.45      1505
                                    macro avg       0.29      1.00      0.42      1505
                                 weighted avg       0.38      1.00      0.53      1505
                                  samples avg       0.30      1.00      0.44      1505

In [None]:
Abstract Only
{'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': 'balanced', 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, 'estimator__gamma': 'scale', 'estimator__kernel': 'rbf', 'estimator__max_iter': -1, 'estimator__probability': False, 'estimator__random_state': None, 'estimator__shrinking': True, 'estimator__tol': 0.001, 'estimator__verbose': True, 'estimator': SVC(class_weight='balanced', verbose=True), 'n_jobs': -1}
Hamming loss: 0.39206883651328095
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.40      0.69      0.51        84
           Analysis and Modeling: 3D Modeling       0.33      0.70      0.45        71
                            Anatomical Target       0.73      0.86      0.79       164
           Anatomical Target: Lower Extremity       0.51      0.57      0.54       113
     Anatomical Target: Lower Extremity - Hip       0.18      0.57      0.28        40
    Anatomical Target: Lower Extremity - Knee       0.38      0.26      0.31        82
                     Anatomical Target: Torso       0.16      0.66      0.26        35
             Anatomical Target: Torso - Spine       0.07      0.14      0.09        21
           Anatomical Target: Upper Extremity       0.15      0.52      0.23        31
Anatomical Target: Upper Extremity - Shoulder       0.12      0.52      0.19        23
                                      Imaging       0.64      0.81      0.71       133
                                  Imaging: CT       0.29      0.41      0.34        59
                                 Imaging: MRI       0.31      0.15      0.20        59
                          Imaging: Ultrasound       0.14      0.12      0.13        32
                                Manufacturing       0.37      0.53      0.44        83
        Manufacturing: Additive Manufacturing       0.19      0.55      0.29        38
           Personalized Product: Guide or Jig       0.56      0.70      0.62       120
                Personalized Product: Implant       0.61      0.47      0.53       124
                         Specification of Use       0.36      0.39      0.37        79
                Specification of Use: Disease       0.16      0.13      0.15        30
      Specification of Use: Joint Replacement       0.21      0.68      0.32        44
                              Surgical Method       0.37      0.25      0.30        40

                                    micro avg       0.37      0.56      0.44      1505
                                    macro avg       0.33      0.49      0.36      1505
                                 weighted avg       0.43      0.56      0.47      1505
                                  samples avg       0.39      0.57      0.43      1505

In [None]:
Claims Only
{'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': 'balanced', 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, 'estimator__gamma': 'scale', 'estimator__kernel': 'rbf', 'estimator__max_iter': -1, 'estimator__probability': False, 'estimator__random_state': None, 'estimator__shrinking': True, 'estimator__tol': 0.001, 'estimator__verbose': True, 'estimator': SVC(class_weight='balanced', verbose=True), 'n_jobs': -1}
Hamming loss: 0.4292929292929293
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.34      0.58      0.43        84
           Analysis and Modeling: 3D Modeling       0.27      0.44      0.34        71
                            Anatomical Target       0.72      0.34      0.46       164
           Anatomical Target: Lower Extremity       0.50      0.29      0.37       113
     Anatomical Target: Lower Extremity - Hip       0.17      0.68      0.27        40
    Anatomical Target: Lower Extremity - Knee       0.40      0.23      0.29        82
                     Anatomical Target: Torso       0.14      0.34      0.20        35
             Anatomical Target: Torso - Spine       0.07      0.19      0.11        21
           Anatomical Target: Upper Extremity       0.14      0.39      0.20        31
Anatomical Target: Upper Extremity - Shoulder       0.09      0.30      0.14        23
                                      Imaging       0.60      0.78      0.68       133
                                  Imaging: CT       0.30      0.63      0.41        59
                                 Imaging: MRI       0.31      0.75      0.44        59
                          Imaging: Ultrasound       0.13      0.41      0.19        32
                                Manufacturing       0.34      0.40      0.37        83
        Manufacturing: Additive Manufacturing       0.20      0.45      0.28        38
           Personalized Product: Guide or Jig       0.55      0.59      0.57       120
                Personalized Product: Implant       0.54      0.45      0.49       124
                         Specification of Use       0.37      0.75      0.49        79
                Specification of Use: Disease       0.12      0.40      0.19        30
      Specification of Use: Joint Replacement       0.18      0.39      0.25        44
                              Surgical Method       0.44      0.20      0.28        40

                                    micro avg       0.32      0.48      0.39      1505
                                    macro avg       0.32      0.45      0.34      1505
                                 weighted avg       0.41      0.48      0.41      1505
                                  samples avg       0.35      0.50      0.38      1505
                            

In [None]:
Abstract + Claims
{'estimator__C': 1.0, 'estimator__break_ties': False, 'estimator__cache_size': 200, 'estimator__class_weight': 'balanced', 'estimator__coef0': 0.0, 'estimator__decision_function_shape': 'ovr', 'estimator__degree': 3, 'estimator__gamma': 'scale', 'estimator__kernel': 'rbf', 'estimator__max_iter': -1, 'estimator__probability': False, 'estimator__random_state': None, 'estimator__shrinking': True, 'estimator__tol': 0.001, 'estimator__verbose': True, 'estimator': SVC(class_weight='balanced', verbose=True), 'n_jobs': -1}
Hamming loss: 0.3943135054246165
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.39      0.70      0.50        84
           Analysis and Modeling: 3D Modeling       0.32      0.63      0.43        71
                            Anatomical Target       0.73      0.79      0.76       164
           Anatomical Target: Lower Extremity       0.47      0.35      0.40       113
     Anatomical Target: Lower Extremity - Hip       0.17      0.55      0.26        40
    Anatomical Target: Lower Extremity - Knee       0.40      0.21      0.27        82
                     Anatomical Target: Torso       0.14      0.60      0.22        35
             Anatomical Target: Torso - Spine       0.06      0.14      0.09        21
           Anatomical Target: Upper Extremity       0.16      0.48      0.24        31
Anatomical Target: Upper Extremity - Shoulder       0.14      0.52      0.22        23
                                      Imaging       0.61      0.76      0.68       133
                                  Imaging: CT       0.33      0.58      0.42        59
                                 Imaging: MRI       0.28      0.19      0.22        59
                          Imaging: Ultrasound       0.12      0.12      0.12        32
                                Manufacturing       0.35      0.45      0.39        83
        Manufacturing: Additive Manufacturing       0.22      0.55      0.32        38
           Personalized Product: Guide or Jig       0.55      0.68      0.61       120
                Personalized Product: Implant       0.60      0.47      0.52       124
                         Specification of Use       0.36      0.52      0.42        79
                Specification of Use: Disease       0.15      0.13      0.14        30
      Specification of Use: Joint Replacement       0.18      0.50      0.27        44
                              Surgical Method       0.50      0.23      0.31        40

                                    micro avg       0.36      0.52      0.43      1505
                                    macro avg       0.33      0.46      0.36      1505
                                 weighted avg       0.43      0.52      0.45      1505
                                  samples avg       0.39      0.55      0.42      1505

In [None]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0],
    },
    {
        'classifier': [SVC()],
        'classifier__kernel': ['rbf', 'linear'],
    },
]

clf = GridSearchCV(ClassifierChain(), parameters, scoring='accuracy')
clf.fit(X_train, y_train)

print (clf.best_params_, clf.best_score_)



In [None]:
import lightgbm as lgb
evaluate(BinaryRelevance(lgb.LGBMClassifier()))

In [None]:
evaluate(MLkNN())

In [None]:
evaluate(LabelPowerset(
    classifier=SVC(),
    require_dense=[False, True]
))

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif


In [None]:
pipe = Pipeline(steps=[('transformer', transformer),
                       ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
                       #('svd', TruncatedSVD(random_state=42)),
                       #('dummy', OneVsRestClassifier(DummyClassifier()))
                       #('svc', OneVsRestClassifier(SVC(random_state=42), n_jobs=-1))
                       ('rf',  RandomForestClassifier(n_jobs=-1, random_state=42))
                       #('lr', OneVsRestClassifier(LogisticRegression(n_jobs=-1), n_jobs=-1))
                       #('cat', OneVsRestClassifier(CatBoostClassifier(verbose=True)))
                       #('knn', KNeighborsClassifier(n_jobs=-1))
                      ], 
                verbose=True,
                memory="cachedir/")

In [None]:
training_labels[:,0]

In [None]:
from sklearn.feature_selection import chi2
selected_features = [] 
for label in range(0, len(subset)):
    selector = SelectKBest(chi2, k='all')
    selector.fit(X, training_labels.values[:,label])
    selected_features.append(list(selector.scores_))

In [None]:
selected_features = np.nan_to_num(np.array(selected_features))

In [None]:
selected_features

In [None]:
selected_features.mean(axis=0).mean()

In [None]:
features_set = np.mean(np.nan_to_num(np.array(selected_features)), axis=0) > 0.8

In [None]:
features_set.astype(int).sum()

In [None]:
X.shape

In [None]:
feature_subset = X[:,features_set]

In [None]:
feature_subset.shape

In [None]:
model.fit(X[:,features_set], training_labels)
predictions = model.predict(Xtest[:,features_set])
print(classification_report(testing_labels, predictions, target_names=subset))

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

param_grid = {
    #'rf__bootstrap': [True, False],
    'max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': (1, 12),
 'min_samples_split': (2, 12),
 'n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
model = RandomForestClassifier()
search = BayesSearchCV(model, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(X_train, y_train, callback=on_step)


In [None]:
predictions = search.estimator.predict(X_test)

In [None]:
f1_score(testing_labels, predictions, average="weighted")

In [None]:
hamming_loss(testing_labels, predictions)

In [None]:
plot_confusion_matrix(pipe, testing_set, testing_labels)

In [None]:
training_predictions = search.best_estimator_.predict(X[:,features_set])
print(classification_report(training_labels, training_predictions, target_names=subset))

In [None]:
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid = {
    #'rf__bootstrap': [True, False],
    'rf__max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 #'rf__max_features': ['auto', 'sqrt'],
 'rf__min_samples_leaf': (1, 12),
 'rf__min_samples_split': (2, 12),
 'rf__n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
search = BayesSearchCV(pipe, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(training_set, training_labels, callback=on_step)



In [None]:
print(search.best_estimator_.get_params())

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(Xtest[:,features_set])
print(classification_report(testing_labels, predictions, target_names=subset))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
predictions = pipe.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
predictions = predictions = search.best_estimator_.predict(training_set)
print(classification_report(training_labels, predictions, target_names=subset))


#  Guessing Baseline

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.31      0.32      0.32        84
                    AnatomicalTarget       0.63      0.63      0.63       164
                             Imaging       0.57      0.56      0.56       133
                       Manufacturing       0.38      0.48      0.43        83
                  SpecificationofUse       0.33      0.33      0.33        79
                      SurgicalMethod       0.19      0.20      0.20        40
      AnalysisAndModeling_3DModeling       0.22      0.21      0.22        71
     AnatomicalTarget_LowerExtremity       0.45      0.43      0.44       113
              AnatomicalTarget_Torso       0.19      0.17      0.18        35
     AnatomicalTarget_UpperExtremity       0.26      0.26      0.26        31
                          Imaging_CT       0.14      0.19      0.16        59
                         Imaging_MRI       0.24      0.20      0.22        59
                  Imaging_Ultrasound       0.17      0.19      0.18        32
 Manufacturing_AdditiveManufacturing       0.24      0.24      0.24        38
       PersonalizedProduct_Guide/Jig       0.55      0.46      0.50       120
         PersonalizedProduct_Implant       0.49      0.53      0.51       124
          SpecificationofUse_Disease       0.19      0.20      0.20        30
 SpecificationofUse_JointReplacement       0.14      0.23      0.17        44
 AnatomicalTarget_LowerExtremity_Hip       0.21      0.17      0.19        40
AnatomicalTarget_LowerExtremity_Knee       0.32      0.34      0.33        82

                           micro avg       0.38      0.39      0.38      1461
                           macro avg       0.31      0.32      0.31      1461
                        weighted avg       0.38      0.39      0.39      1461
                         samples avg       0.38      0.40      0.37      1461

# RF

 {'rf__bootstrap': False,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': 150,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_impurity_split': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 200,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.43      0.24      0.31        84
                    AnatomicalTarget       0.70      0.78      0.74       164
                             Imaging       0.60      0.59      0.60       133
                       Manufacturing       0.37      0.25      0.30        83
                  SpecificationofUse       0.42      0.32      0.36        79
                      SurgicalMethod       0.71      0.30      0.42        40
      AnalysisAndModeling_3DModeling       0.38      0.18      0.25        71
     AnatomicalTarget_LowerExtremity       0.53      0.46      0.49       113
              AnatomicalTarget_Torso       0.08      0.03      0.04        35
     AnatomicalTarget_UpperExtremity       0.11      0.03      0.05        31
                          Imaging_CT       0.18      0.10      0.13        59
                         Imaging_MRI       0.28      0.14      0.18        59
                  Imaging_Ultrasound       0.00      0.00      0.00        32
 Manufacturing_AdditiveManufacturing       0.23      0.08      0.12        38
       PersonalizedProduct_Guide/Jig       0.64      0.41      0.50       120
         PersonalizedProduct_Implant       0.59      0.74      0.66       124
          SpecificationofUse_Disease       0.06      0.03      0.04        30
 SpecificationofUse_JointReplacement       0.21      0.16      0.18        44
 AnatomicalTarget_LowerExtremity_Hip       0.26      0.12      0.17        40
AnatomicalTarget_LowerExtremity_Knee       0.47      0.33      0.39        82

                           micro avg       0.50      0.38      0.43      1461
                           macro avg       0.36      0.26      0.30      1461
                        weighted avg       0.45      0.38      0.40      1461
                         samples avg       0.53      0.41      0.43      1461



# KNN

 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

# KNN + SVD

 precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

'svd__algorithm': 'randomized',
 'svd__n_components': 2564,
 'svd__n_iter': 5,
 'svd__random_state': 42,
 'svd__tol': 0.0,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
