In [121]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from util import *

In [122]:
# from sklearn import set_config
# set_config(display='diagram') 

In [123]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")

In [124]:
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [125]:
subset = list(set(tier1)-set(["PersonalizedProduct"]))
subset

['Verification',
 'CognitiveComputingAndLearningSystems',
 'DNAMapping/GeneticEngineering',
 'Imaging',
 'SurgicalTracking',
 'AnalysisAndModeling',
 'SpecificationofUse',
 'SurgicalMethod',
 'SurgicalRobotics',
 'AnatomicalTarget',
 'Manufacturing',
 'DataAcquisition',
 'Validation']

In [126]:
training_labels = training_set[subset]

In [127]:
testing_labels = testing_set[subset]

In [128]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [129]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [130]:
set(training_set.columns) - set(all_tiers)

{'abstract',
 'all_tiers',
 'all_tiers_100',
 'application_kind',
 'application_number',
 'application_number_formatted',
 'assignees',
 'citations',
 'cited_by',
 'claims',
 'country_code_x',
 'country_code_y',
 'cpc_codes',
 'description',
 'embedding_v1',
 'examiners',
 'family_id',
 'fi_codes',
 'filing_date',
 'fterm_codes',
 'gpa_number',
 'grant_date',
 'inventors',
 'kind',
 'kind_code',
 'padded_serial',
 'pct_number',
 'priority_date',
 'publication_date',
 'publication_number',
 'serial_x',
 'serial_y',
 'similar_npl',
 'similar_patents',
 'tier1_tags',
 'tier1_tier2_tags',
 'tier2_100',
 'tier3_100',
 'title',
 'top_terms',
 'url',
 'uspc_codes'}

In [131]:
training_set.top_terms

0      [prosthesis, member, coupling portion, portion...
1      [femur, tibia, implant, leg, patient, portion,...
2      [lateral, femur, patient, body, medial, slot, ...
3      [glenoid, virtual, alignment pin, patient, sca...
4      [bone, surface, patient, resection, jig, use, ...
                             ...                        
967    [method, bone, medial, patient, template, late...
968    [cutting guide, surgical cutting, customized s...
969    [patient, adapted, surface, surface model, imp...
970    [polymer, monomers, modulus, thiol, multifunct...
971    [implant, patient, peg, portion, bone, porous,...
Name: top_terms, Length: 972, dtype: object

In [132]:
cpc_embeddings = np.fromfile("/home/martin/builds/node2vec-c/cpc.node2vec.emb.16d.bin", dtype=np.float32).reshape((-1,16))

import joblib
cpc_labelizer = joblib.load('./node2id.joblib')
cpc_lookup = {c: n for n, c in enumerate(cpc_labelizer.classes_)}

@f.collecting
def convert_cpc_codes(codes):
    for code in codes:
        if code in cpc_lookup:
            yield cpc_lookup[code]
    
def embed_cpc_codes(codes):
    embedding = np.zeros(16)
    converted = convert_cpc_codes(codes)
    
    if not converted:
        return embedding
    
    for code_id in converted:
        embedding = embedding + cpc_embeddings[code_id]
        
    return embedding / len(converted)

training_set['embedded_cpc'] = training_set.cpc_codes.apply(embed_cpc_codes)
training_set.embedded_cpc

testing_set['embedded_cpc'] = testing_set.cpc_codes.apply(embed_cpc_codes)
testing_set.embedded_cpc

0      [0.6617838382720947, 0.5419959723949432, -0.45...
1      [0.6220737014498029, 0.4667179797376905, -0.30...
2      [0.5216278036435446, 0.4763353069623311, -0.40...
3      [0.5649837851524353, 0.4570807576179504, -0.39...
4      [0.49902863427996635, 0.4361288510262966, -0.3...
                             ...                        
238    [0.4623808066050212, 0.40974663694699603, -0.2...
239    [0.4705050190289815, 0.44273049632708233, -0.3...
240    [0.5656175762414932, 0.4283560812473297, -0.35...
241    [0.4972618932907398, 0.46143977687909055, -0.2...
242    [0.49393788874149325, 0.4575995460152626, -0.2...
Name: embedded_cpc, Length: 243, dtype: object

In [133]:
from sklearn.ensemble import *

In [134]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [135]:
from sklearn.dummy import DummyClassifier

In [136]:
from sklearn.linear_model import LogisticRegression

In [137]:
tfidf_default_settings = {
    'lowercase': True, 
    'strip_accents': 'ascii',
    'stop_words' : stopwords,
    'min_df': 5,
    #'max_df': 0.5#,
    #'ngram_range': (1,3)
}

transformer = ColumnTransformer([
     ('top_terms',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'top_terms'
     ),
     ('cited_by',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'cited_by'
     ),
    ('inventors',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'inventors'
     ),
     ('citations',
       CountVectorizer(analyzer=iden, binary=True, min_df=2),
      'citations'
      ),
#         ('similar_npl',
#       CountVectorizer(analyzer=lambda x:x, min_df=2),
#      'similar_npl'
#      ),
        ('similar_patents',
      CountVectorizer(analyzer=iden, binary=True, min_df=2),
     'similar_patents'
     )
  ,
     ('cpc',
      CountVectorizer(analyzer=cpc_split, binary=True, min_df=2),
     'cpc_codes'
     ),
#     ('embedding_v1', 
#      'passthrough',
#      'embedding_v1'
#     ),
    ('abstract_tfidf', 
    TfidfVectorizer(**tfidf_default_settings),
   'abstract'),
    ('claims_tfidf',
     TfidfVectorizer(**tfidf_default_settings),
     'claims'
    ),
#     ('description_tfidf',
#      TfidfVectorizer(**tfidf_default_settings),
#      'description'
#     )
      ], verbose=False, n_jobs=-1)

In [138]:
from sklearn.decomposition import TruncatedSVD

In [139]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB

In [140]:
from sklearn.metrics import *

In [141]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
import util
X_train = transformer.fit_transform(training_set)
y_train = mlb.fit_transform(training_set[all_tiers_100].apply(util.array_labels, axis=1))

In [142]:
X_test = transformer.transform(testing_set)
y_test = mlb.fit_transform(testing_set[all_tiers_100].apply(util.array_labels, axis=1))

In [143]:
from skmultilearn.problem_transform import *
from sklearn.svm import SVC

In [144]:

#clf.fit(X_train, y_train)

In [145]:
np.array(training_set.embedded_cpc.values.tolist()).shape

(972, 16)

In [146]:
#X_train =  np.array(training_set[['embedded_cpc','embedding_v1']].apply(lambda x: x[0].tolist() + x[1], axis=1).values.tolist())
#X_test =  np.array(testing_set[['embedded_cpc','embedding_v1']].apply(lambda x: x[0].tolist() + x[1], axis=1).values.tolist())

X_train = np.array(training_set['embedded_cpc'].values.tolist())
X_test = np.array(testing_set['embedded_cpc'].values.tolist())

In [147]:
from skmultilearn.adapt import *
from sklearn import metrics
def evaluate(clf):
    prediction = clf.fit(X_train, y_train).predict(X_test)
    print(f"{clf.get_params()}")
    print(f"Hamming loss: {metrics.hamming_loss(y_test,prediction)}")
    print(metrics.classification_report(y_test, prediction, target_names=mlb.classes_))

In [148]:
# clf = BinaryRelevance(
#     classifier=RandomForestClassifier(n_jobs=-1),
#     require_dense=[True, True]
# )



# verse 32d cpc = 
# micro avg       0.54      0.37      0.44      1505
# macro avg       0.41      0.24      0.27      1505
# weighted avg       0.48      0.37      0.39      1505
# samples avg       0.55      0.40      0.43      1505

#deepwalk 32d cpc = 
#                                     micro avg       0.54      0.37      0.44      1505
#                                     macro avg       0.44      0.24      0.28      1505
#                                  weighted avg       0.49      0.37      0.39      1505
#                                   samples avg       0.54      0.40      0.42      1505


#node2vec 16d cpc = 
#                                     micro avg       0.55      0.37      0.44      1505
#                                     macro avg       0.43      0.24      0.28      1505
#                                  weighted avg       0.49      0.37      0.39      1505
#                                   samples avg       0.56      0.41      0.43      1505

#******* node2vec 32d cpc =
#                                     micro avg       0.55      0.38      0.45      1505
#                                     macro avg       0.42      0.25      0.28      1505
#                                  weighted avg       0.49      0.38      0.40      1505
#                                   samples avg       0.56      0.41      0.43      1505

#node2vec 64d cpc = 
#                                     micro avg       0.55      0.37      0.44      1505
#                                     macro avg       0.45      0.24      0.28      1505
#                                  weighted avg       0.50      0.37      0.39      1505
#                                   samples avg       0.56      0.40      0.43      1505

evaluate(RandomForestClassifier(n_jobs=-1))




{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Hamming loss: 0.26318742985409654
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.49      0.29      0.36        84
           Analysis and Modeling: 3D Modeling       0.41      0.17      0.24        71
                            Anatomical Target       0.69      0.80      0.74       164
           Anatomical Target: Lower Extremity       0.54      0.53      0.53       113
     Anatomical Target: Lower Extremity - Hip       0.33      0.07      0.12        40
    Anatomical Target: L

  _warn_prf(average, modifier, msg_start, len(result))


In [65]:
evaluate(RandomForestClassifier(n_jobs=-1))

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
Hamming loss: 0.2753460531238309
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.46      0.26      0.33        84
           Analysis and Modeling: 3D Modeling       0.48      0.18      0.27        71
                            Anatomical Target       0.70      0.80      0.75       164
           Anatomical Target: Lower Extremity       0.53      0.41      0.46       113
     Anatomical Target: Lower Extremity - Hip       0.40      0.05      0.09        40
    Anatomical Target: Lo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

parameters = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0],
    },
    {
        'classifier': [SVC()],
        'classifier__kernel': ['rbf', 'linear'],
    },
]

clf = GridSearchCV(ClassifierChain(), parameters, scoring='accuracy')
clf.fit(X_train, y_train)

print (clf.best_params_, clf.best_score_)



{'classifier': MultinomialNB(alpha=0.7), 'classifier__alpha': 0.7} 0.061723499867829754


In [150]:
import lightgbm as lgb
evaluate(BinaryRelevance(lgb.LGBMClassifier()))

{'classifier': LGBMClassifier(), 'classifier__boosting_type': 'gbdt', 'classifier__class_weight': None, 'classifier__colsample_bytree': 1.0, 'classifier__importance_type': 'split', 'classifier__learning_rate': 0.1, 'classifier__max_depth': -1, 'classifier__min_child_samples': 20, 'classifier__min_child_weight': 0.001, 'classifier__min_split_gain': 0.0, 'classifier__n_estimators': 100, 'classifier__n_jobs': -1, 'classifier__num_leaves': 31, 'classifier__objective': None, 'classifier__random_state': None, 'classifier__reg_alpha': 0.0, 'classifier__reg_lambda': 0.0, 'classifier__silent': True, 'classifier__subsample': 1.0, 'classifier__subsample_for_bin': 200000, 'classifier__subsample_freq': 0, 'require_dense': [True, True]}
Hamming loss: 0.2774036662925552
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.42      0.33      0.37        84
           Analysis and Modeling: 3D Modeling       0.38   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [149]:
evaluate(MLkNN())



{'k': 10, 's': 1.0, 'ignore_first_neighbours': 0}
Hamming loss: 0.2826412270856715
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.30      0.12      0.17        84
           Analysis and Modeling: 3D Modeling       0.40      0.08      0.14        71
                            Anatomical Target       0.66      0.79      0.72       164
           Anatomical Target: Lower Extremity       0.45      0.31      0.37       113
     Anatomical Target: Lower Extremity - Hip       0.40      0.05      0.09        40
    Anatomical Target: Lower Extremity - Knee       0.35      0.07      0.12        82
                     Anatomical Target: Torso       0.00      0.00      0.00        35
             Anatomical Target: Torso - Spine       0.00      0.00      0.00        21
           Anatomical Target: Upper Extremity       0.30      0.10      0.15        31
Anatomical Target: Upper Extremity - Shoulder 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [69]:
evaluate(LabelPowerset(
    classifier=SVC(),
    require_dense=[False, True]
))

{'classifier': SVC(), 'classifier__C': 1.0, 'classifier__break_ties': False, 'classifier__cache_size': 200, 'classifier__class_weight': None, 'classifier__coef0': 0.0, 'classifier__decision_function_shape': 'ovr', 'classifier__degree': 3, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'classifier__max_iter': -1, 'classifier__probability': False, 'classifier__random_state': None, 'classifier__shrinking': True, 'classifier__tol': 0.001, 'classifier__verbose': False, 'require_dense': [False, True]}
Hamming loss: 0.3887018331462776
                                               precision    recall  f1-score   support

                        Analysis and Modeling       0.00      0.00      0.00        84
           Analysis and Modeling: 3D Modeling       0.00      0.00      0.00        71
                            Anatomical Target       0.74      0.10      0.18       164
           Anatomical Target: Lower Extremity       0.61      0.12      0.21       113
     Anatomical Ta

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif


In [None]:
pipe = Pipeline(steps=[('transformer', transformer),
                       ('feature_selection', SelectFromModel(LinearSVC(penalty="l1"))),
                       #('svd', TruncatedSVD(random_state=42)),
                       #('dummy', OneVsRestClassifier(DummyClassifier()))
                       #('svc', OneVsRestClassifier(SVC(random_state=42), n_jobs=-1))
                       ('rf',  RandomForestClassifier(n_jobs=-1, random_state=42))
                       #('lr', OneVsRestClassifier(LogisticRegression(n_jobs=-1), n_jobs=-1))
                       #('cat', OneVsRestClassifier(CatBoostClassifier(verbose=True)))
                       #('knn', KNeighborsClassifier(n_jobs=-1))
                      ], 
                verbose=True,
                memory="cachedir/")

In [None]:
training_labels[:,0]

In [None]:
from sklearn.feature_selection import chi2
selected_features = [] 
for label in range(0, len(subset)):
    selector = SelectKBest(chi2, k='all')
    selector.fit(X, training_labels.values[:,label])
    selected_features.append(list(selector.scores_))

In [None]:
selected_features = np.nan_to_num(np.array(selected_features))

In [None]:
selected_features

In [None]:
selected_features.mean(axis=0).mean()

In [None]:
features_set = np.mean(np.nan_to_num(np.array(selected_features)), axis=0) > 0.8

In [None]:
features_set.astype(int).sum()

In [None]:
X.shape

In [None]:
feature_subset = X[:,features_set]

In [None]:
feature_subset.shape

In [None]:
model = RandomForestClassifier()
model.fit(X[:,features_set], training_labels)
predictions = model.predict(Xtest[:,features_set])
print(classification_report(testing_labels, predictions, target_names=subset))

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

param_grid = {
    #'rf__bootstrap': [True, False],
    'max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': (1, 12),
 'min_samples_split': (2, 12),
 'n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
search = BayesSearchCV(model, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(X[:,features_set], training_labels, callback=on_step)


In [None]:
f1_score(testing_labels, predictions, average="weighted")

In [None]:
hamming_loss(testing_labels, predictions)

In [None]:
plot_confusion_matrix(pipe, testing_set, testing_labels)

In [None]:
training_predictions = search.best_estimator_.predict(X[:,features_set])
print(classification_report(training_labels, training_predictions, target_names=subset))

In [None]:
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid = {
    #'rf__bootstrap': [True, False],
    'rf__max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 #'rf__max_features': ['auto', 'sqrt'],
 'rf__min_samples_leaf': (1, 12),
 'rf__min_samples_split': (2, 12),
 'rf__n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
search = BayesSearchCV(pipe, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(training_set, training_labels, callback=on_step)



In [None]:
print(search.best_estimator_.get_params())

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(Xtest[:,features_set])
print(classification_report(testing_labels, predictions, target_names=subset))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
predictions = pipe.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
predictions = predictions = search.best_estimator_.predict(training_set)
print(classification_report(training_labels, predictions, target_names=subset))


#  Guessing Baseline

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.31      0.32      0.32        84
                    AnatomicalTarget       0.63      0.63      0.63       164
                             Imaging       0.57      0.56      0.56       133
                       Manufacturing       0.38      0.48      0.43        83
                  SpecificationofUse       0.33      0.33      0.33        79
                      SurgicalMethod       0.19      0.20      0.20        40
      AnalysisAndModeling_3DModeling       0.22      0.21      0.22        71
     AnatomicalTarget_LowerExtremity       0.45      0.43      0.44       113
              AnatomicalTarget_Torso       0.19      0.17      0.18        35
     AnatomicalTarget_UpperExtremity       0.26      0.26      0.26        31
                          Imaging_CT       0.14      0.19      0.16        59
                         Imaging_MRI       0.24      0.20      0.22        59
                  Imaging_Ultrasound       0.17      0.19      0.18        32
 Manufacturing_AdditiveManufacturing       0.24      0.24      0.24        38
       PersonalizedProduct_Guide/Jig       0.55      0.46      0.50       120
         PersonalizedProduct_Implant       0.49      0.53      0.51       124
          SpecificationofUse_Disease       0.19      0.20      0.20        30
 SpecificationofUse_JointReplacement       0.14      0.23      0.17        44
 AnatomicalTarget_LowerExtremity_Hip       0.21      0.17      0.19        40
AnatomicalTarget_LowerExtremity_Knee       0.32      0.34      0.33        82

                           micro avg       0.38      0.39      0.38      1461
                           macro avg       0.31      0.32      0.31      1461
                        weighted avg       0.38      0.39      0.39      1461
                         samples avg       0.38      0.40      0.37      1461

# RF

 {'rf__bootstrap': False,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': 150,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_impurity_split': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 200,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.43      0.24      0.31        84
                    AnatomicalTarget       0.70      0.78      0.74       164
                             Imaging       0.60      0.59      0.60       133
                       Manufacturing       0.37      0.25      0.30        83
                  SpecificationofUse       0.42      0.32      0.36        79
                      SurgicalMethod       0.71      0.30      0.42        40
      AnalysisAndModeling_3DModeling       0.38      0.18      0.25        71
     AnatomicalTarget_LowerExtremity       0.53      0.46      0.49       113
              AnatomicalTarget_Torso       0.08      0.03      0.04        35
     AnatomicalTarget_UpperExtremity       0.11      0.03      0.05        31
                          Imaging_CT       0.18      0.10      0.13        59
                         Imaging_MRI       0.28      0.14      0.18        59
                  Imaging_Ultrasound       0.00      0.00      0.00        32
 Manufacturing_AdditiveManufacturing       0.23      0.08      0.12        38
       PersonalizedProduct_Guide/Jig       0.64      0.41      0.50       120
         PersonalizedProduct_Implant       0.59      0.74      0.66       124
          SpecificationofUse_Disease       0.06      0.03      0.04        30
 SpecificationofUse_JointReplacement       0.21      0.16      0.18        44
 AnatomicalTarget_LowerExtremity_Hip       0.26      0.12      0.17        40
AnatomicalTarget_LowerExtremity_Knee       0.47      0.33      0.39        82

                           micro avg       0.50      0.38      0.43      1461
                           macro avg       0.36      0.26      0.30      1461
                        weighted avg       0.45      0.38      0.40      1461
                         samples avg       0.53      0.41      0.43      1461



# KNN

 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

# KNN + SVD

 precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

'svd__algorithm': 'randomized',
 'svd__n_components': 2564,
 'svd__n_iter': 5,
 'svd__random_state': 42,
 'svd__tol': 0.0,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
