In [4]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
import torch
import torchtext
from sklearn.model_selection import *
from torch import nn
import gensim.downloader as api

from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator
from torchtext.datasets import IMDB
from transformers import BertTokenizer
from transformers import BertModel
from skorch import NeuralNetClassifier
from skorch.callbacks import Freezer
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.pipeline import * 
from sklearn.compose import *
from sklearn.linear_model import *
from sklearn.naive_bayes import *
from sklearn.neural_network import *
from sklearn.preprocessing import *
from skorch.callbacks import ProgressBar
from util import *
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import *
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import fse
from fse.models import uSIF

from tqdm.auto import tqdm
from tqdm.utils import CallbackIOWrapper
from skopt.space import Real, Integer
from skopt import BayesSearchCV
from transformers import AutoModel, AutoTokenizer, AutoConfig, TFAutoModel

In [3]:

model = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)


In [5]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [],[],[]
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=128, pad_to_max_length=True, 
                                             return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'), np.asarray(input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


In [2]:
config = AutoConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFAutoModel.from_pretrained(model, config = config)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32') 

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(6, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

for layer in model.layers[:3]:
  layer.trainable = False

In [3]:
glove = api.load("glove-wiki-gigaword-100")

In [4]:
usif = uSIF(glove, workers=32, lang_freq="en")

In [5]:
training_set = pd.read_json("training_set.json.gz", lines=True, orient="records")
testing_set = pd.read_json("testing_set.json.gz", lines=True, orient="records")

In [6]:
subset = list((set(tier1)-set(["PersonalizedProduct"]))&set(all_tiers_100))
subset

['AnatomicalTarget',
 'AnalysisAndModeling',
 'Manufacturing',
 'Imaging',
 'SurgicalMethod',
 'SpecificationofUse']

In [7]:
training_labels = training_set[subset]
testing_labels = testing_set[subset]

In [13]:
tfidf_default_settings = {
    'lowercase': True, 
    'strip_accents': 'ascii',
    #'stop_words' : stopwords,
    'min_df': 5,
    'max_df': 0.5,
    #'ngram_range': (1,3)
}

transformer = ColumnTransformer([
     ('top_terms', CountVectorizer(analyzer=iden, binary=True, min_df=2), 'top_terms'),
     ('cited_by', CountVectorizer(analyzer=iden, binary=True, min_df=2), 'cited_by'),
     ('inventors', CountVectorizer(analyzer=iden, binary=True, min_df=2), 'inventors'),
     ('citations', CountVectorizer(analyzer=citations_split, binary=True, min_df=2), 'citations'),
     ('similar_patents', CountVectorizer(analyzer=iden, binary=True, min_df=2), 'similar_patents'),
     ('cpc', CountVectorizer(analyzer=cpc_split, binary=True, min_df=2), 'cpc_codes'),
     ('embedding_v1', Extract(), 'embedding_v1'),
     ('usif_abstract', Embedder(usif), 'abstract'),
     ('usif_claims', Embedder(usif), 'claims'),
     ('usif_description', Embedder(usif), 'description'),
     ('abstract_tfidf', TfidfVectorizer(**tfidf_default_settings), 'abstract'),
     ('claims_tfidf', TfidfVectorizer(**tfidf_default_settings), 'claims'),
     ('description_tfidf', TfidfVectorizer(**tfidf_default_settings), 'description')
    ], verbose=False, n_jobs=-1)



transformer_grid = {
#     'top_terms__min_df': (1, 5),
#     'top_terms__max_df': Real(0.1, 1.0),
#     'cited_by__min_df': (1, 5),
#     'cited_by__max_df': Real(0.1, 1.0),
#     'inventors__min_df': (1, 20),
#     'inventors__max_df': Real(0.1, 1.0),
#     'citations__min_df': (1, 20),
#     'citations__max_df': Real(0.1, 1.0),
#     'similar_patents__min_df': (1, 20),
#     'similar_patents__max_df': Real(0.1, 1.0),
#     'cpc__min_df': (1, 20),
#     'cpc__max_df': Real(0.1, 1.0),
#     'abstract_tfidf__min_df': (1, 20),
#     'abstract_tfidf__max_df': Real(0.1, 1.0),
#     'claims_tfidf__min_df': (1, 20),
#     'claims_tfidf__max_df': Real(0.1, 1.0),
#     'description_tfidf__min_df': (1, 20),
#     'description_tfidf__max_df': Real(0.1, 1.0),
}



In [14]:
pipe = Pipeline(steps=[('transformer', transformer),
                       ('svd', TruncatedSVD(random_state=42, n_components=1024)),
                       #('dummy', OneVsRestClassifier(DummyClassifier()))
                       #('svc', OneVsRestClassifier(SVC(random_state=42), n_jobs=-1))
                       ('rf',  RandomForestClassifier(n_jobs=-1, random_state=42))
                       #('lr', OneVsRestClassifier(LogisticRegression(n_jobs=-1), n_jobs=-1))
                       #('cat', OneVsRestClassifier(CatBoostClassifier(verbose=True)))
                       #('knn', KNeighborsClassifier(n_jobs=-1))
                      ], 
                verbose=True,
                memory="cachedir/")

model_grid = {
    'svd__n_components': (64, 1024*16),
    'rf__max_depth': (10, 1000), #[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 125, 150, 175, 200, 250, 300, 350, 400, None],
    'rf__min_samples_leaf': (1, 12),
    'rf__min_samples_split': (2, 12),
    'rf__n_estimators': (5, 1000)          
}

In [15]:
# pipe.fit(training_set, training_labels)
# predictions = pipe.predict(testing_set)
# print(classification_report(testing_labels, predictions, target_names=subset))

In [16]:
param_grid = {**model_grid, **({f"transformer__{k}": v for k,v in transformer_grid.items()})}
param_grid

{'svd__n_components': (64, 16384),
 'rf__max_depth': (10, 1000),
 'rf__min_samples_leaf': (1, 12),
 'rf__min_samples_split': (2, 12),
 'rf__n_estimators': (5, 1000)}

In [None]:
search = BayesSearchCV(pipe, param_grid, n_iter=50, n_points=3, pre_dispatch=36, refit=True, cv=3, verbose=10, random_state=42, n_jobs=-1)

with tqdm(total=search.total_iterations) as pbar:
    def on_step(optim_result):
        print(optim_result)
        pbar.update(9)
        return False
    search.fit(training_set, training_labels, callback=on_step)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 31 concurrent workers.


In [None]:
# Untuned RF with uSIF (Glove)

#      SurgicalMethod       1.00      0.03      0.05        40
#    AnatomicalTarget       0.68      0.95      0.79       164
#             Imaging       0.55      0.78      0.64       133
#  SpecificationofUse       0.50      0.04      0.07        79
# AnalysisAndModeling       0.20      0.01      0.02        84
#       Manufacturing       0.57      0.10      0.16        83

#           micro avg       0.61      0.47      0.53       583
#           macro avg       0.58      0.32      0.29       583
#        weighted avg       0.56      0.47      0.41       583
#         samples avg       0.63      0.50      0.52       583