In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from util import *

In [2]:
# from sklearn import set_config
# set_config(display='diagram') 

In [3]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")

testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

subset = list(sorted(set(all_tiers_100)-set(["PersonalizedProduct"])))

In [26]:
citation_embeddings = pd.read_parquet("/var/patentmark/citation_based_embeddings.parquet")

training_set = training_set.merge(citation_embeddings, right_index=True,left_on='publication_number', how="inner" )
testing_set = testing_set.merge(citation_embeddings, right_index=True, left_on='publication_number', how='inner')

In [27]:
count = training_set.shape[0]

In [28]:
pos_weights = (1 / (training_set[subset].sum() / count).values)

In [29]:
subset = list(sorted(set(all_tiers_100)-set(["PersonalizedProduct"])))
subset

['AnalysisAndModeling',
 'AnalysisAndModeling_3DModeling',
 'AnatomicalTarget',
 'AnatomicalTarget_LowerExtremity',
 'AnatomicalTarget_LowerExtremity_Hip',
 'AnatomicalTarget_LowerExtremity_Knee',
 'AnatomicalTarget_Torso',
 'AnatomicalTarget_Torso_Spine',
 'AnatomicalTarget_UpperExtremity',
 'AnatomicalTarget_UpperExtremity_Shoulder',
 'Imaging',
 'Imaging_CT',
 'Imaging_MRI',
 'Imaging_Ultrasound',
 'Manufacturing',
 'Manufacturing_AdditiveManufacturing',
 'PersonalizedProduct_Guide/Jig',
 'PersonalizedProduct_Implant',
 'SpecificationofUse',
 'SpecificationofUse_Disease',
 'SpecificationofUse_JointReplacement',
 'SurgicalMethod']

In [30]:
training_labels = training_set[subset]

In [31]:
testing_labels = testing_set[subset]

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [34]:
set(training_set.columns) - set(all_tiers)

{'abstract',
 'all_tiers',
 'all_tiers_100',
 'application_kind',
 'application_number',
 'application_number_formatted',
 'assignees',
 'citation_based_embedding',
 'citations',
 'cited_by',
 'claims',
 'country_code_x',
 'country_code_y',
 'cpc_codes',
 'description',
 'embedded_cpc',
 'embedding_v1',
 'examiners',
 'family_id',
 'fi_codes',
 'filing_date',
 'fterm_codes',
 'gpa_number',
 'grant_date',
 'inventors',
 'kind',
 'kind_code',
 'padded_serial',
 'pct_number',
 'priority_date',
 'publication_date',
 'publication_number',
 'serial_x',
 'serial_y',
 'similar_npl',
 'similar_patents',
 'tier1_tags',
 'tier1_tier2_tags',
 'tier2_100',
 'tier3_100',
 'title',
 'top_terms',
 'url',
 'uspc_codes'}

In [35]:
training_set.top_terms

0      [prosthesis, member, coupling portion, portion...
1      [femur, tibia, implant, leg, patient, portion,...
2      [lateral, femur, patient, body, medial, slot, ...
3      [glenoid, virtual, alignment pin, patient, sca...
4      [bone, surface, patient, resection, jig, use, ...
                             ...                        
965    [patient, guide, device, specific, figs, surgi...
966    [bone, ablating device, data set, control data...
967    [method, bone, medial, patient, template, late...
969    [patient, adapted, surface, surface model, imp...
970    [polymer, monomers, modulus, thiol, multifunct...
Name: top_terms, Length: 927, dtype: object

In [36]:
cpc_embeddings = np.fromfile("/home/martin/patentmark/cpc.node2vec.emb.32d.bin", dtype=np.float32).reshape((-1,32))

import joblib
cpc_labelizer = joblib.load('./node2id.joblib')
cpc_lookup = {c: n for n, c in enumerate(cpc_labelizer.classes_)}

@f.collecting
def convert_cpc_codes(codes):
    for code in codes:
        if code in cpc_lookup:
            yield cpc_lookup[code]
    
def embed_cpc_codes(codes):
    embedding = np.zeros(32)
    converted = convert_cpc_codes(codes)
    
    if not converted:
        return embedding
    
    for code_id in converted:
        embedding = embedding + cpc_embeddings[code_id]
        
    return embedding / len(converted)

training_set['embedded_cpc'] = training_set.cpc_codes.apply(embed_cpc_codes)
training_set.embedded_cpc

testing_set['embedded_cpc'] = testing_set.cpc_codes.apply(embed_cpc_codes)
testing_set.embedded_cpc



0      [0.09129103335241477, -0.8074875394503276, -0....
1      [-0.0626441298850945, -0.8264780470303127, -0....
2      [-0.2087969978650411, -0.8326806823412577, -0....
3      [0.020394775830209256, -0.8215901732444764, -0...
4      [-0.26043402403593063, -0.6891247034072876, -0...
                             ...                        
238    [-0.23802674313386282, -0.628900408744812, -0....
239    [-0.3754243354002635, -0.6894144614537557, -0....
240    [-0.12913421913981438, -0.6960149183869362, -0...
241    [-0.3880331997688, -0.702021429171929, -0.2717...
242    [-0.2977850042283535, -0.7015813589096069, -0....
Name: embedded_cpc, Length: 234, dtype: object

In [37]:
from sklearn.ensemble import *

In [38]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

In [39]:
from sklearn.dummy import DummyClassifier

In [40]:
from sklearn.linear_model import LogisticRegression

In [92]:
# from transformers import FeatureExtractionPipeline, AutoModel, AutoTokenizer
# transformer_model = AutoModel.from_pretrained("bertForPatents/", gradient_checkpointing=True)
# transformer_tokenizer = AutoTokenizer.from_pretrained("bertForPatents/")
# transformer = FeatureExtractionPipeline(transformer_model, tokenizer=transformer_tokenizer, device=0)

import sentence_transformers as st
from sentence_transformers import SentenceTransformer

# model_name = "/home/martin/patentmark/patentmark-charting-app/vectors/sPatent-v2/"
# text_embedder = SentenceTransformer(model_name)
model_name = "bertForPatents/"
word_embedding_model = st.models.Transformer(model_name)
pooling_model = st.models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=False,
                               pooling_mode_cls_token=True,
                               pooling_mode_max_tokens=False)
text_embedder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [93]:
class TransformerFeatures(sklearn.base.BaseEstimator):
    def fit(self, X, y):
        return self
        
    def transform(self, X):
        return text_embedder.encode(X)
        

In [94]:
class VectorPassthrough(sklearn.base.BaseEstimator):
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        return np.array(X.values.tolist())
        

In [95]:
from sklearn.decomposition import TruncatedSVD

In [100]:
tfidf_default_settings = {
    'lowercase': True, 
    'strip_accents': 'ascii',
    'stop_words' : stopwords,
    'min_df': 5,
    #'max_df': 0.5#,
    #'ngram_range': (1,3)
}

column_prep = ColumnTransformer([
#      ('top_terms',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'top_terms'
#      ),
#      ('cited_by',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'cited_by'
#      ),
#     ('inventors',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'inventors'
#      ),
#      ('citations',
#        CountVectorizer(analyzer=iden, binary=True, min_df=2),
#       'citations'
#       ),
# #         ('similar_npl',
# #       CountVectorizer(analyzer=lambda x:x, min_df=2),
# #      'similar_npl'
# #      ),
#         ('similar_patents',
#       CountVectorizer(analyzer=iden, binary=True, min_df=2),
#      'similar_patents'
#      )
#   ,
#      ('cpc',
#       CountVectorizer(analyzer=cpc_split, binary=True, min_df=2),
#      'cpc_codes'
#      ),
    ('citation_based_embedding', VectorPassthrough(), 'citation_based_embedding'),
    ('embedded_cpc', 
     VectorPassthrough(),
     'embedded_cpc'
    ),
    ('embedding_v1',
     VectorPassthrough(),
     'embedding_v1'
    ),
#     ('transformer_abstract',
#     TransformerFeatures(),
#      'abstract'
#     ),
    ('transformer_claims',
     TransformerFeatures(),
     'claims'
    )
], verbose=False, n_jobs=1)

from sklearn.preprocessing import Normalizer
pipeline = Pipeline(steps=[('columns', column_prep), ('norm', Normalizer())])
    
#     ('abstract_tfidf', 
#     TfidfVectorizer(**tfidf_default_settings),
#    'abstract'),
#     ('claims_tfidf',
#      TfidfVectorizer(**tfidf_default_settings),
#      'claims'
#     ),
#     ('description_tfidf',
#      TfidfVectorizer(**tfidf_default_settings),
#      'description'
#     )
      

In [101]:
from sklearn.decomposition import TruncatedSVD

In [102]:
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB

In [103]:
from sklearn.metrics import *

In [104]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
import util
X_train = pipeline.fit_transform(training_set)
X_test = pipeline.transform(testing_set)

KeyError: 312

In [82]:
y_train = mlb.fit_transform(training_set[subset].apply(util.array_labels, axis=1))
y_test = mlb.transform(testing_set[subset].apply(util.array_labels, axis=1))

In [83]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import f_classif
from sklearn.svm import SVC

In [90]:
model = RandomForestClassifier(n_jobs=-1, class_weight='balanced')

In [91]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(classification_report(y_test, predictions, target_names=subset))

                                          precision    recall  f1-score   support

                     AnalysisAndModeling       0.26      0.09      0.13        78
          AnalysisAndModeling_3DModeling       0.15      0.03      0.05        65
                        AnatomicalTarget       0.70      0.87      0.78       158
         AnatomicalTarget_LowerExtremity       0.52      0.41      0.46       108
     AnatomicalTarget_LowerExtremity_Hip       0.00      0.00      0.00        38
    AnatomicalTarget_LowerExtremity_Knee       0.64      0.21      0.31        78
                  AnatomicalTarget_Torso       0.00      0.00      0.00        34
            AnatomicalTarget_Torso_Spine       0.00      0.00      0.00        20
         AnatomicalTarget_UpperExtremity       0.00      0.00      0.00        29
AnatomicalTarget_UpperExtremity_Shoulder       0.00      0.00      0.00        21
                                 Imaging       0.63      0.76      0.69       128
               

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

param_grid = {
    #'rf__bootstrap': [True, False],
    'max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': (1, 12),
 'min_samples_split': (2, 12),
 'n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
model = RandomForestClassifier()
search = BayesSearchCV(model, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(X_train, y_train, callback=on_step)


In [None]:
predictions = search.estimator.predict(X_test)

In [None]:
f1_score(testing_labels, predictions, average="weighted")

In [None]:
hamming_loss(testing_labels, predictions)

In [None]:
plot_confusion_matrix(pipe, testing_set, testing_labels)

In [None]:
training_predictions = search.best_estimator_.predict(X[:,features_set])
print(classification_report(training_labels, training_predictions, target_names=subset))

In [None]:
from skopt import BayesSearchCV
from skopt import *
from skopt.space import Real, Categorical, Integer

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
param_grid = {
    #'rf__bootstrap': [True, False],
    'rf__max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
 #'rf__max_features': ['auto', 'sqrt'],
 'rf__min_samples_leaf': (1, 12),
 'rf__min_samples_split': (2, 12),
 'rf__n_estimators': (5, 1000)          
             }


# param_grid = {
#     'svd__n_components': Integer(64,10000),
#     'svc__estimator__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__gamma': Real(1e-6, 1e+6, prior='log-uniform'),
#     'svc__estimator__degree': Integer(1,8),
#     'svc__estimator__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# param_grid = {
# #  'svd__n_components': np.arange(64, 5000, 100),
#  'knn__leaf_size': np.arange(1, 50, 1),
#  'knn__metric': ['minkowski', 'euclidean'],
#  'knn__n_neighbors': [2,3,4,5,6,7,8,9,10,11,12],
#  'knn__weights': ['distance', 'uniform']
# }
search = BayesSearchCV(pipe, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(training_set, training_labels, callback=on_step)



In [None]:
print(search.best_estimator_.get_params())

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(Xtest[:,features_set])
print(classification_report(testing_labels, predictions, target_names=subset))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
#predictions = pipe.predict(testing_set)
predictions = search.best_estimator_.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
from sklearn.metrics import *
#pipe.fit(training_set, training_labels)
predictions = pipe.predict(testing_set)
print(classification_report(testing_labels, predictions, target_names=all_tiers_100))

In [None]:
predictions = predictions = search.best_estimator_.predict(training_set)
print(classification_report(training_labels, predictions, target_names=subset))


#  Guessing Baseline

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.31      0.32      0.32        84
                    AnatomicalTarget       0.63      0.63      0.63       164
                             Imaging       0.57      0.56      0.56       133
                       Manufacturing       0.38      0.48      0.43        83
                  SpecificationofUse       0.33      0.33      0.33        79
                      SurgicalMethod       0.19      0.20      0.20        40
      AnalysisAndModeling_3DModeling       0.22      0.21      0.22        71
     AnatomicalTarget_LowerExtremity       0.45      0.43      0.44       113
              AnatomicalTarget_Torso       0.19      0.17      0.18        35
     AnatomicalTarget_UpperExtremity       0.26      0.26      0.26        31
                          Imaging_CT       0.14      0.19      0.16        59
                         Imaging_MRI       0.24      0.20      0.22        59
                  Imaging_Ultrasound       0.17      0.19      0.18        32
 Manufacturing_AdditiveManufacturing       0.24      0.24      0.24        38
       PersonalizedProduct_Guide/Jig       0.55      0.46      0.50       120
         PersonalizedProduct_Implant       0.49      0.53      0.51       124
          SpecificationofUse_Disease       0.19      0.20      0.20        30
 SpecificationofUse_JointReplacement       0.14      0.23      0.17        44
 AnatomicalTarget_LowerExtremity_Hip       0.21      0.17      0.19        40
AnatomicalTarget_LowerExtremity_Knee       0.32      0.34      0.33        82

                           micro avg       0.38      0.39      0.38      1461
                           macro avg       0.31      0.32      0.31      1461
                        weighted avg       0.38      0.39      0.39      1461
                         samples avg       0.38      0.40      0.37      1461

# RF

 {'rf__bootstrap': False,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': 150,
 'rf__max_features': 'sqrt',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_impurity_split': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 2,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 200,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': None,
 'rf__verbose': 0,
 'rf__warm_start': False}

                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.43      0.24      0.31        84
                    AnatomicalTarget       0.70      0.78      0.74       164
                             Imaging       0.60      0.59      0.60       133
                       Manufacturing       0.37      0.25      0.30        83
                  SpecificationofUse       0.42      0.32      0.36        79
                      SurgicalMethod       0.71      0.30      0.42        40
      AnalysisAndModeling_3DModeling       0.38      0.18      0.25        71
     AnatomicalTarget_LowerExtremity       0.53      0.46      0.49       113
              AnatomicalTarget_Torso       0.08      0.03      0.04        35
     AnatomicalTarget_UpperExtremity       0.11      0.03      0.05        31
                          Imaging_CT       0.18      0.10      0.13        59
                         Imaging_MRI       0.28      0.14      0.18        59
                  Imaging_Ultrasound       0.00      0.00      0.00        32
 Manufacturing_AdditiveManufacturing       0.23      0.08      0.12        38
       PersonalizedProduct_Guide/Jig       0.64      0.41      0.50       120
         PersonalizedProduct_Implant       0.59      0.74      0.66       124
          SpecificationofUse_Disease       0.06      0.03      0.04        30
 SpecificationofUse_JointReplacement       0.21      0.16      0.18        44
 AnatomicalTarget_LowerExtremity_Hip       0.26      0.12      0.17        40
AnatomicalTarget_LowerExtremity_Knee       0.47      0.33      0.39        82

                           micro avg       0.50      0.38      0.43      1461
                           macro avg       0.36      0.26      0.30      1461
                        weighted avg       0.45      0.38      0.40      1461
                         samples avg       0.53      0.41      0.43      1461



# KNN

 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
                                      precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

# KNN + SVD

 precision    recall  f1-score   support

                 AnalysisAndModeling       0.38      0.43      0.40        84
                    AnatomicalTarget       0.71      0.60      0.65       164
                             Imaging       0.63      0.62      0.62       133
                       Manufacturing       0.34      0.37      0.36        83
                  SpecificationofUse       0.34      0.47      0.39        79
                      SurgicalMethod       0.39      0.30      0.34        40
      AnalysisAndModeling_3DModeling       0.35      0.38      0.36        71
     AnatomicalTarget_LowerExtremity       0.60      0.50      0.54       113
              AnatomicalTarget_Torso       0.24      0.11      0.15        35
     AnatomicalTarget_UpperExtremity       0.16      0.16      0.16        31
                          Imaging_CT       0.24      0.32      0.27        59
                         Imaging_MRI       0.24      0.32      0.28        59
                  Imaging_Ultrasound       0.16      0.28      0.21        32
 Manufacturing_AdditiveManufacturing       0.18      0.13      0.15        38
       PersonalizedProduct_Guide/Jig       0.59      0.39      0.47       120
         PersonalizedProduct_Implant       0.56      0.76      0.64       124
          SpecificationofUse_Disease       0.19      0.33      0.24        30
 SpecificationofUse_JointReplacement       0.12      0.14      0.13        44
 AnatomicalTarget_LowerExtremity_Hip       0.33      0.30      0.32        40
AnatomicalTarget_LowerExtremity_Knee       0.48      0.39      0.43        82

                           micro avg       0.43      0.44      0.43      1461
                           macro avg       0.36      0.37      0.36      1461
                        weighted avg       0.45      0.44      0.44      1461
                         samples avg       0.45      0.46      0.41      1461

'svd__algorithm': 'randomized',
 'svd__n_components': 2564,
 'svd__n_iter': 5,
 'svd__random_state': 42,
 'svd__tol': 0.0,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'euclidean',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 2,
 'knn__p': 2,
 'knn__weights': 'distance'
