In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torchtext
from torch import nn
from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator
from torchtext.datasets import IMDB
from transformers import BertTokenizer
from transformers import BertModel
from skorch import NeuralNetClassifier
from skorch.callbacks import Freezer
from skorch.callbacks import ProgressBar
from util import *

In [2]:
from sklearn import set_config
set_config(display='diagram') 

In [3]:
# Training split, stored as JSON Lines (one record per line); pandas infers
# the gzip compression from the ".gz" suffix.
training_set = pd.read_json(
    "training_set.json.gz",
    orient="records",
    lines=True,
)

In [4]:
# Held-out split, stored as JSON Lines (one record per line); pandas infers
# the gzip compression from the ".gz" suffix.
testing_set = pd.read_json(
    "testing_set.json.gz",
    orient="records",
    lines=True,
)

In [5]:
# Target matrix for training: every row, restricted to the label columns.
# `all_tiers_100` is not defined in this notebook (it arrives via
# `from util import *`) -- presumably a list of label column names; confirm.
training_labels = training_set.loc[:, all_tiers_100]

In [6]:
# Target matrix for evaluation: every row, restricted to the label columns.
# `all_tiers_100` is not defined in this notebook (it arrives via
# `from util import *`) -- presumably a list of label column names; confirm.
testing_labels = testing_set.loc[:, all_tiers_100]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [9]:
# Columns of the training frame that are not listed in `all_tiers`.
set(training_set.columns).difference(all_tiers)

{'abstract',
 'all_tiers',
 'all_tiers_100',
 'application_kind',
 'application_number',
 'application_number_formatted',
 'assignees',
 'citations',
 'cited_by',
 'claims',
 'country_code_x',
 'country_code_y',
 'cpc_codes',
 'description',
 'embedding_v1',
 'examiners',
 'family_id',
 'fi_codes',
 'filing_date',
 'fterm_codes',
 'gpa_number',
 'grant_date',
 'inventors',
 'kind',
 'kind_code',
 'padded_serial',
 'pct_number',
 'priority_date',
 'publication_date',
 'publication_number',
 'serial_x',
 'serial_y',
 'similar_npl',
 'similar_patents',
 'tier1_tags',
 'tier1_tier2_tags',
 'tier2_100',
 'tier3_100',
 'title',
 'top_terms',
 'url',
 'uspc_codes'}

In [10]:
# Peek at the `top_terms` column (each row holds a list of terms).
training_set["top_terms"]

0      [prosthesis, member, coupling portion, portion...
1      [femur, tibia, implant, leg, patient, portion,...
2      [lateral, femur, patient, body, medial, slot, ...
3      [glenoid, virtual, alignment pin, patient, sca...
4      [bone, surface, patient, resection, jig, use, ...
                             ...                        
967    [method, bone, medial, patient, template, late...
968    [cutting guide, surgical cutting, customized s...
969    [patient, adapted, surface, surface model, imp...
970    [polymer, monomers, modulus, thiol, multifunct...
971    [implant, patient, peg, portion, bone, porous,...
Name: top_terms, Length: 972, dtype: object

In [11]:
from sklearn.ensemble import *

In [12]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [13]:
from sklearn.dummy import DummyClassifier

In [17]:
# Keyword arguments shared by every TfidfVectorizer below.
# Left disabled, as before: 'max_df': 0.5 and 'ngram_range': (1, 3).
tfidf_default_settings = dict(
    lowercase=True,
    strip_accents='ascii',
    stop_words=stopwords,
    min_df=2,
)

# One vectoriser per input column, all combined by a single ColumnTransformer.
#   * columns counted with the `iden` analyzer: one CountVectorizer each,
#     transformer name == column name;
#   * 'cpc_codes': counted with the `cpc_split` analyzer;
#   * free-text columns: TF-IDF with the shared settings above, transformer
#     name == '<column>_tfidf'.
# Still disabled, as before: 'similar_npl' (count vectoriser) and
# 'embedding_v1' ('passthrough').
transformer = ColumnTransformer(
    transformers=[
        *[
            (column, CountVectorizer(analyzer=iden, min_df=2), column)
            for column in (
                'top_terms',
                'cited_by',
                'inventors',
                'citations',
                'similar_patents',
            )
        ],
        ('cpc', CountVectorizer(analyzer=cpc_split, min_df=2), 'cpc_codes'),
        *[
            (column + '_tfidf', TfidfVectorizer(**tfidf_default_settings), column)
            for column in ('abstract', 'claims', 'description')
        ],
    ],
    verbose=False,
    n_jobs=-1,
)

In [18]:
from sklearn.decomposition import TruncatedSVD

In [19]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB

In [23]:
from sklearn.metrics import *

In [21]:
# Two-stage model: feature extraction followed by a one-vs-rest SVC.
# Alternatives that were tried in place of (or before) the final step and are
# currently disabled:
#   ('svd', TruncatedSVD(random_state=42))
#   ('dummy', OneVsRestClassifier(DummyClassifier()))
#   ('rf',  RandomForestClassifier(n_jobs=-1, random_state=42))
#   ('knn', KNeighborsClassifier(n_jobs=-1))
pipe = Pipeline(
    steps=[
        ('transformer', transformer),
        ('SVC', OneVsRestClassifier(SVC(random_state=42), n_jobs=-1)),
    ],
    memory="cachedir/",  # fitted transformers are cached in this directory
    verbose=True,
)

In [22]:
# Fit on the training split, predict the held-out split, then print the
# per-label precision / recall / F1 table.
predictions = pipe.fit(training_set, training_labels).predict(testing_set)
print(
    classification_report(
        testing_labels,
        predictions,
        target_names=all_tiers_100,
    )
)

[Pipeline] ....... (step 1 of 2) Processing transformer, total=   8.9s


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


[Pipeline] ............... (step 2 of 2) Processing SVC, total=  13.0s


NameError: name 'classification_report' is not defined

In [24]:
# Per-label metrics for the predictions computed in the fit/predict cell.
print(
    classification_report(
        testing_labels,
        predictions,
        target_names=all_tiers_100,
    )
)

                                          precision    recall  f1-score   support

                     AnalysisAndModeling       0.50      0.02      0.05        84
          AnalysisAndModeling_3DModeling       0.00      0.00      0.00        71
                        AnatomicalTarget       0.69      0.93      0.79       164
         AnatomicalTarget_LowerExtremity       0.63      0.19      0.30       113
     AnatomicalTarget_LowerExtremity_Hip       0.00      0.00      0.00        40
    AnatomicalTarget_LowerExtremity_Knee       0.80      0.15      0.25        82
                  AnatomicalTarget_Torso       0.00      0.00      0.00        35
            AnatomicalTarget_Torso_Spine       0.00      0.00      0.00        21
         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        31
AnatomicalTarget_UpperExtremity_Shoulder       0.00      0.00      0.00        23
                                 Imaging       0.57      0.88      0.69       133
               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from skopt import BayesSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Search space for the random-forest step. Keys use scikit-learn's
# "<step name>__<parameter>" convention, so they only resolve while `pipe`
# contains a step named 'rf'. skopt reads each (low, high) tuple as a range.
param_grid = {
    #'rf__bootstrap': [True, False],
    'rf__max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
    #'rf__max_features': ['auto', 'sqrt'],
    'rf__min_samples_leaf': (1, 12),
    'rf__min_samples_split': (2, 12),
    'rf__n_estimators': (5, 1000),
}

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }

# Fail fast when the search space and the pipeline have drifted apart (for
# example the final step was swapped but the keys above were not). Otherwise
# the mismatch only surfaces inside the search workers with a less direct error.
unknown_params = sorted(set(param_grid) - set(pipe.get_params(deep=True)))
if unknown_params:
    raise ValueError(
        "Search space refers to parameters the pipeline does not have: {}. "
        "Pipeline steps are {}; rename the keys or change the pipeline's "
        "final step to match.".format(
            unknown_params, [step_name for step_name, _ in pipe.steps]
        )
    )

search = BayesSearchCV(pipe, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        """Advance the progress bar after each optimisation step.

        `total_iterations` counts sampled parameter settings, and skopt
        invokes the callback once per batch of at most `n_points` settings,
        so the bar moves by the batch size. The previous fixed increment of 9
        overshot the total. The increment is clamped so the last, possibly
        smaller, batch cannot push the bar past 100%.

        Returns False so the search is never stopped early by this callback.
        """
        pbar.update(min(search.n_points, pbar.total - pbar.n))
        return False
    search.fit(training_set, training_labels, callback=on_step)

print(search.best_estimator_.get_params())

In [None]:
from sklearn.metrics import *
# Score the refit best pipeline from the hyper-parameter search on the
# held-out split. (Fitting / predicting with the plain `pipe` is done in
# other cells.)
predictions = search.best_estimator_.predict(testing_set)
# Pass target_names so rows are labelled with the tier names instead of
# integer column indices, matching every other report in this notebook.
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
from sklearn.metrics import *
# Held-out metrics for the best pipeline found by the search (refit on the
# full training split by the search itself, so no extra fit is needed here).
best_pipeline = search.best_estimator_
predictions = best_pipeline.predict(testing_set)
print(
    classification_report(
        testing_labels,
        predictions,
        target_names=all_tiers_100,
    )
)

In [None]:
from sklearn.metrics import *
# Held-out metrics for the plain `pipe`. No fit happens here: this relies on
# `pipe` having been fitted by an earlier cell.
predictions = pipe.predict(testing_set)
print(
    classification_report(
        testing_labels,
        predictions,
        target_names=all_tiers_100,
    )
)

In [None]:
# Metrics on the training split itself, for comparison with the held-out
# numbers. Note that this overwrites `predictions` with training-set output.
predictions = search.best_estimator_.predict(training_set)
print(
    classification_report(
        training_labels,
        predictions,
        target_names=all_tiers_100,
    )
)


#  Guessing Baseline

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.31      0.32      0.32        84
                    AnatomicalTarget       0.63      0.63      0.63       164
                             Imaging       0.57      0.56      0.56       133
                       Manufacturing       0.38      0.48      0.43        83
                  SpecificationofUse       0.33      0.33      0.33        79
                      SurgicalMethod       0.19      0.20      0.20        40
      AnalysisAndModeling_3DModeling       0.22      0.21      0.22        71
     AnatomicalTarget_LowerExtremity       0.45      0.43      0.44       113
              AnatomicalTarget_Torso       0.19      0.17      0.18        35
     AnatomicalTarget_UpperExtremity       0.26      0.26      0.26        31
                          Imaging_CT       0.14      0.19      0.16        59
                         Imaging_MRI       0.24      0.20      0.22        59
                  Imaging_Ultrasound       0.17      0.19      0.18        32
 Manufacturing_AdditiveManufacturing       0.24      0.24      0.24        38
       PersonalizedProduct_Guide/Jig       0.55      0.46      0.50       120
         PersonalizedProduct_Implant       0.49      0.53      0.51       124
          SpecificationofUse_Disease       0.19      0.20      0.20        30
 SpecificationofUse_JointReplacement       0.14      0.23      0.17        44
 AnatomicalTarget_LowerExtremity_Hip       0.21      0.17      0.19        40
AnatomicalTarget_LowerExtremity_Knee       0.32      0.34      0.33        82

                           micro avg       0.38      0.39      0.38      1461
                           macro avg       0.31      0.32      0.31      1461
                        weighted avg       0.38      0.39      0.39      1461
                         samples avg       0.38      0.40      0.37      1461

# RF

 {'rf__bootstrap': False,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': 150,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_impurity_split': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 200,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.43      0.24      0.31        84
                    AnatomicalTarget       0.70      0.78      0.74       164
                             Imaging       0.60      0.59      0.60       133
                       Manufacturing       0.37      0.25      0.30        83
                  SpecificationofUse       0.42      0.32      0.36        79
                      SurgicalMethod       0.71      0.30      0.42        40
      AnalysisAndModeling_3DModeling       0.38      0.18      0.25        71
     AnatomicalTarget_LowerExtremity       0.53      0.46      0.49       113
              AnatomicalTarget_Torso       0.08      0.03      0.04        35
     AnatomicalTarget_UpperExtremity       0.11      0.03      0.05        31
                          Imaging_CT       0.18      0.10      0.13        59
                         Imaging_MRI       0.28      0.14      0.18        59
                  Imaging_Ultrasound       0.00      0.00      0.00        32
 Manufacturing_AdditiveManufacturing       0.23      0.08      0.12        38
       PersonalizedProduct_Guide/Jig       0.64      0.41      0.50       120
         PersonalizedProduct_Implant       0.59      0.74      0.66       124
          SpecificationofUse_Disease       0.06      0.03      0.04        30
 SpecificationofUse_JointReplacement       0.21      0.16      0.18        44
 AnatomicalTarget_LowerExtremity_Hip       0.26      0.12      0.17        40
AnatomicalTarget_LowerExtremity_Knee       0.47      0.33      0.39        82

                           micro avg       0.50      0.38      0.43      1461
                           macro avg       0.36      0.26      0.30      1461
                        weighted avg       0.45      0.38      0.40      1461
                         samples avg       0.53      0.41      0.43      1461



# KNN

 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

# KNN + SVD

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

'svd__algorithm': 'randomized',
 'svd__n_components': 2564,
 'svd__n_iter': 5,
 'svd__random_state': 42,
 'svd__tol': 0.0,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
