In [2]:
import os
import gc
import glob
import random
import itertools
import re
import unicodedata
from itertools import groupby
from collections import OrderedDict
from operator import itemgetter
from typing import List

import spacy
import nltk
import torch
import datasets, transformers
import pandas as pd
import numpy as np
import seaborn as sns
from yellowbrick.model_selection import FeatureImportances
from nltk.corpus import stopwords, wordnet
from joblib import dump
import scipy as sp
from scipy import stats
from joblib import parallel_backend
from sklearn import linear_model, decomposition
from sklearn import metrics
from joblib import Parallel, delayed
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoTokenizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, FunctionTransformer, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, train_test_split, KFold
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin, clone, RegressorMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from spacy.tokenizer import Tokenizer
from spacy.lang import char_classes
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.language import Language
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
# from scispacy.abbreviation import AbbreviationDetector
from spacy.pipeline import EntityRecognizer

try:
    from sklearn.utils._testing import ignore_warnings
except:
    from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

%env TOKENIZERS_PARALLELISM=true

os.environ["WANDB_DISABLED"] = "true"

env: TOKENIZERS_PARALLELISM=true


In [3]:
# Where the competition data is read from and where artefacts are written.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'

# Make sure the output folder exists (no-op when it is already there).
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [6]:
# Load the `en_core_web_lg` spaCy pipeline; `nlp` is reused by every later
# cell that tokenizes, lemmatizes or compares phrases.
nlp = spacy.load('en_core_web_lg')
# nlp.add_pipe("abbreviation_detector")
# Regex pattern derived from the language defaults' `token_match`.
# NOTE(review): `_get_regex_pattern` is a private spaCy helper, and
# `re_token_match` is not referenced again in the visible part of the notebook.
re_token_match = spacy.tokenizer._get_regex_pattern(nlp.Defaults.token_match)

In [8]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

pd.set_option('display.precision', 4)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

# Styling helpers for DataFrame display.
cm = sns.light_palette('green', as_cmap=True)
props_param = "color:white; font-weight:bold; background-color:green;"

CUSTOM_SEED = 42
CUSTOM_BATCH = 24

# Actually apply the seed: without this the constant above has no effect and
# every stochastic step (e.g. `DataFrame.sample`, which falls back to NumPy's
# global generator) changes between runs.
random.seed(CUSTOM_SEED)
np.random.seed(CUSTOM_SEED)
torch.manual_seed(CUSTOM_SEED)

# Stopwords and infixes
ADDITIONAL_STOPWORDS = ['one or more', 'a', 'needn', 'a', 'not', 'able', 'never', 'about', 'needn’t', 'accordance', 'now', 'often', 'above', 'no', 'according', 'of', 'mentioned', 'others', 'after', 'nor', 'all', 'on', 'accordingly', 'otherwise', 'again', 'not', 'also', 'onto', 'across', 'overall', 'against', 'now', 'an', 'or', 'along', 'rather', 'ain', 'o', 'and', 'other', 'already', 'remarkably', 'all', 'of', 'another', 'particularly', 'alternatively', 'significantly', 'am', 'off', 'are', 'preferably', 'always', 'simply', 'an', 'on', 'as', 'preferred', 'among', 'sometimes', 'and', 'once', 'at', 'present', 'and/or', 'specifically', 'any', 'only', 'be', 'provide', 'anything', 'straight', 'are', 'or', 'because', 'provided', 'anywhere', 'forward', 'aren', 'other', 'been', 'provides', 'better', 'substantially', 'aren’t', 'our', 'being', 'relatively', 'disclosure', 'thereafter', 'as', 'ours', 'by', 'respectively', 'due', 'therebetween', 'at', 'ourselves', 'claim', 'said', 'easily', 'therefor', 'be', 'out', 'comprises', 'comprising', 'should', 'easy', 'therefrom', 'because', 'over', 'since', 'e.g', 'therein', 'been', 'own', 'could', 'some', 'either', 'thereinto', 'before', 're', 'described', 'such', 'elsewhere', 'thereon', 'being', 's', 'desired', 'suitable', 'enough', 'therethrough', 'below', 'same', 'do', 'than', 'especially', 'therewith', 'between', 'shan', 'does', 'that', 'essentially', 'together', 'both', 'shan’t', 'each', 'the', 'et', 'al', 'toward', 'but', 'she', 'embodiment', 'their', 'etc', 'towards', 'by', 'she’s', 'fig', 'then', 'eventually', 'typical', 'can', 'should', 'figs', 'there', 'excellent', 'upon', 'couldn', 'should’ve', 'for', 'thereby', 'finally', 'via', 'couldn’t', 'shouldn', 'from', 'therefore', 'furthermore', 'vice', 'versa', 'd', 'shouldn’t', 'further', 'thereof', 'good', 'whatever', 'did', 'so', 'generally', 'thereto', 'hence', 'whereas', 'didn', 'some', 'had', 'these', 'he/she', 'whereat', 'didn’t', 'such', 'has', 'they', 'him/her', 
'wherever', 'do', 't', 'have', 'this', 'his/her', 'whether', 'does', 'than', 'having', 'those', 'ie', 'whose', 'doesn', 'that', 'herein', 'thus', 'ii', 'within', 'doesn’t', 'that’ll', 'however', 'to', 'iii', 'without', 'doing', 'the', 'if', 'use', 'instead', 'yet', 'don', 'their', 'in', 'various', 'later', 'don’t', 'theirs', 'into', 'was', 'like', 'down', 'them', 'invention', 'were', 'little', 'during', 'themselves', 'is', 'what', 'many', 'each', 'there', 'it', 'when', 'may', 'few', 'these', 'its', 'where', 'meanwhile', 'for', 'they', 'means', 'whereby', 'might', 'from', 'this', 'wherein', 'moreover', 'further', 'those', 'which', 'much', 'had', 'through', 'while', 'must', 'hadn', 'to', 'who', 'hadn’t', 'too', 'will', 'has', 'under', 'with', 'hasn', 'until', 'Would', 'hasn’t', 'up', 'have', 've', 'haven', 'very', 'haven’t', 'was', 'having', 'wasn', 'he', 'wasn’t', 'her', 'we', 'here', 'were', 'hers', 'weren', 'herself', 'weren’t', 'him', 'what', 'himself', 'when', 'his', 'where', 'how', 'which', 'i', 'while', 'if', 'who', 'in', 'whom', 'into', 'why', 'is', 'will', 'isn', 'with', 'isn’t', 'won', 'it', 'won’t', 'it’s', 'wouldn', 'its', 'wouldn’t', 'itself', 'y', 'just', 'you', 'll', 'you’d', 'm', 'you’ll', 'ma', 'you’re', 'me', 'you’ve', 'mightn', 'your', 'mightn’t', 'yours', 'more', 'yourself', 'most', 'yourselves', 'mustn', 'mustn’t', 'my', 'myself']

puncts = ['\u200d','?', '....','..','...','', ',', '.', '"', ':', ')', '(', '-', '!', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '*', '+', '\\',
    '•', '~', '£', '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█',
    '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓',
    '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾',
    'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '!','🅰','🅱']

# Stems that occur in four surface forms each: lower-case / capitalised,
# singular / plural (e.g. "fig.", "figs.", "Fig.", "Figs.").
_INFLECTED_ABBREVIATION_STEMS = ("sec", "fig", "eq", "no")

# Abbreviations whose trailing period must stay attached to the token.
ABBREVIATIONS: List[str] = [
    f"{form}."
    for stem in _INFLECTED_ABBREVIATION_STEMS
    for form in (stem, stem + "s", stem.capitalize(), stem.capitalize() + "s")
] + ["al.", "gen.", "sp.", "nov."]

# Combined stop-word list: NLTK's English list followed by ADDITIONAL_STOPWORDS
# (duplicates are kept; it stays a list because later code joins it in order).
# NOTE: this rebinds `stopwords`, shadowing the module imported from nltk.corpus.
stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS

def combined_rule_prefixes() -> List[str]:
    """Assemble the prefix patterns used by the custom tokenizer.

    Kept as a standalone helper so the prefix rules can be exercised on
    their own.

    @return: list of regex fragments for spaCy's prefix compiler
    """
    # Each opening bracket gets a lookahead assertion appended
    # (may not work properly for unbalanced brackets).
    bracket_rewrites = (
        (r"\(", r"\((?![^\(\s]+\)\S+)"),
        (r"\[", r"\[(?![^\[\s]+\]\S+)"),
        (r"\{", r"\{(?![^\{\s]+\}\S+)"),
    )

    punct_pattern = char_classes.PUNCT.replace("|", " ")
    for plain_bracket, guarded_bracket in bracket_rewrites:
        punct_pattern = punct_pattern.replace(plain_bracket, guarded_bracket)

    leading_symbols = ["§", "%", "=", r"\+"]
    return [
        *leading_symbols,
        *char_classes.split_chars(punct_pattern),
        *char_classes.LIST_ELLIPSES,
        *char_classes.LIST_QUOTES,
        *char_classes.LIST_CURRENCY,
        *char_classes.LIST_ICONS,
    ]

def customize_tokenizer(nlp: Language) -> Tokenizer:
    """Build a custom tokenizer on top of spaCy's default tokenizer.

    The returned tokenizer is meant to replace the one in a loaded pipeline:
         nlp = spacy.load("some_spacy_model")
         nlp.tokenizer = customize_tokenizer(nlp)

    It combines custom infix, prefix and suffix rules with the language's
    default tokenizer exceptions, extended by the entries of ABBREVIATIONS so
    that their trailing period is not split off.

    @param nlp: a loaded spaCy model
    @return: a Tokenizer that shares `nlp.vocab`
    """
    # remove the first hyphen to prevent tokenization of the normal hyphen
    hyphens = char_classes.HYPHENS.replace("-|", "", 1)

    # NOTE: the order of the entries below is preserved in the compiled regex.
    infixes = (
        char_classes.LIST_ELLIPSES
        + char_classes.LIST_ICONS
        + [
            r"×",  # added this special x character to tokenize it separately
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}])\.(?=[{au}])".format(
                al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA),
            r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                a=char_classes.ALPHA, h=hyphens
            ),
            # removed / to prevent tokenization of /
            r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA),
        ]
    )

    prefixes = combined_rule_prefixes()

    # add the last apostrophe
    quotes = char_classes.LIST_QUOTES.copy() + ["’"]

    # add lookbehind assertions for brackets (may not work properly for unbalanced brackets)
    suffix_punct = char_classes.PUNCT.replace("|", " ")
    # These lookbehinds are commented out because they are variable width lookbehinds, and as of spacy 2.1,
    # spacy uses the re package instead of the regex package. The re package does not support variable width
    # lookbehinds. Hacking spacy internals to allow us to use the regex package is doable, but would require
    # creating our own instance of the language class, with our own Tokenizer class, with the from_bytes method
    # using the regex package instead of the re package
    # suffix_punct = suffix_punct.replace(r"\)", r"(?<!\S+\([^\)\s]+)\)")
    # suffix_punct = suffix_punct.replace(r"\]", r"(?<!\S+\[[^\]\s]+)\]")
    # suffix_punct = suffix_punct.replace(r"\}", r"(?<!\S+\{[^\}\s]+)\}")

    # NOTE(review): the possessive-suffix list below repeats two entries;
    # harmless for matching, possibly different quote glyphs were intended.
    suffixes = (
        char_classes.split_chars(suffix_punct)
        + char_classes.LIST_ELLIPSES
        + quotes
        + char_classes.LIST_ICONS
        + ["'s", "'S", "’s", "’S", "’s", "’S"]
        + [
            r"(?<=[0-9])\+",
            r"(?<=°[FfCcKk])\.",
            r"(?<=[0-9])(?:{})".format(char_classes.CURRENCY),
            # this is another place where we used a variable width lookbehind
            # so now things like 'H3g' will be tokenized as ['H3', 'g']
            # previously the lookbehind was (^[0-9]+)
            r"(?<=[0-9])(?:{u})".format(u=char_classes.UNITS),
            r"(?<=[0-9{}{}(?:{})])\.".format(
                char_classes.ALPHA_LOWER, r"%²\-\)\]\+", "|".join(quotes)
            ),
            # add |\d to split off the period of a sentence that ends with 1D.
            r"(?<=[{a}|\d][{a}])\.".format(a=char_classes.ALPHA_UPPER),
        ]
    )

    # Compile the three rule sets with spaCy's helpers.
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    # Update exclusions to include these abbreviations so the period is not split off
    exclusions = {
        abbreviation: [{ORTH: abbreviation}] for abbreviation in ABBREVIATIONS
    }
    # Work on a copy so the language defaults themselves are not modified.
    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()
    tokenizer_exceptions.update(exclusions)

    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,  # type: ignore
    )
    return tokenizer

# Swap the pipeline's tokenizer for the customised one.
nlp.tokenizer = customize_tokenizer(nlp)

In [None]:
# CPC section letter -> section title (used to derive `general_context`).
table = dict(
    A='Human Necessities',
    B='Operations and Transport',
    C='Chemistry and Metallurgy',
    D='Textiles',
    E='Fixed Constructions',
    F='Mechanical Engineering',
    G='Physics',
    H='Electricity',
    Y='Emerging Cross-Sectional Technologies',
)

In [None]:
def remove_from_list(x, stuff_to_remove) -> list:
    """Remove every element of `x` that equals an item of `stuff_to_remove`.

    The list is filtered in place (callers holding a reference see the change)
    and is also returned for convenience.

    The previous implementation deleted entries while enumerating the list,
    which shifts the remaining items left and skips the element right after
    each deletion -- adjacent matches survived.

    @param x: list to clean; modified in place
    @param stuff_to_remove: iterable of values to drop
    @return: the same list object, without the unwanted values
    """
    try:
        # Fast path: hashable items allow constant-time membership tests.
        unwanted = frozenset(stuff_to_remove)
        kept = [token for token in x if token not in unwanted]
    except TypeError:
        # Unhashable items or tokens: fall back to plain equality comparison.
        unwanted = list(stuff_to_remove)
        kept = [token for token in x if token not in unwanted]
    x[:] = kept
    return x

def Remove_Duplicates(text_in):
    """Collapse a word that is immediately repeated (ignoring case) into its
    first occurrence; the repeats must be separated by exactly one non-word
    character.
    """
    repeated_word = re.compile(r"\b(\w+)(?:\W\1\b)+", flags=re.IGNORECASE)
    return repeated_word.sub(r"\1", text_in)


def remove_consecutive_nums(text):
    """Strip the digits out of runs of whitespace-separated numbers.

    A run is two or more digit groups separated by a single space or tab
    (e.g. ``"12 34 56"``). Only the digits inside such a run are removed; the
    separators are kept, and numbers elsewhere in the text are left alone.

    The previous implementation converted the matched numbers to ``int``
    (dropping leading zeros) and then removed those digit strings from the
    *whole* text with ``str.replace``, so unrelated numbers and identifiers
    lost characters as well.

    @param text: input string
    @return: the string with digit runs blanked out
    """
    def _blank_digits(match):
        # Keep the separators of the run, drop only its digits.
        return re.sub(r'\d+', '', match.group(0))

    return re.sub(r'\d+(?:[ \t]\d+)+', _blank_digits, text)


def basic_clean(text_list, infixes, stopwords):
    """
    Clean a list of raw strings and merge them into one sentence-cased string.

    Each text is split on hyphens and re-joined with spaces, has the listed
    control/symbol characters replaced by spaces and the listed punctuation
    removed, and is then passed through a chain of regex clean-ups,
    `remove_consecutive_nums`, NFKD normalisation, lower-casing, ASCII
    encoding, a roman-numeral pattern removal and `Remove_Duplicates` before
    being split into tokens. Tokens listed in the module-level `puncts` are
    removed.

    NOTE(review): no lemmatization happens here, and `infixes` is never used
    in the body.

    @param text_list: iterable of raw strings
    @param infixes: unused
    @param stopwords: collection consulted by the final token filter
    @return: a single string of capitalised, period-separated sentences
    """

    text_list_clean = []
    for text in text_list:
        # Stage 1: hyphens -> spaces, listed characters -> space, then strip
        # the listed punctuation characters.
        text = re.sub(r'[\)\(\.\,\;\\\?\&\%\!\+\-]', '', re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', str(' '.join(re.split('\s*-\s*', text)))))
        # Only for texts with more than 1000 double-space separated chunks:
        # squeeze the single spaces out of each chunk and re-join the chunks.
        if len(text.split("  ")) > 1000:
            text = " ".join(["".join(w.split(" ")) if len(w.split(' '))>1 else w for w in text.split("  ")])
        # Stage 2: nested clean-up chain (read inside-out), tokenise on
        # whitespace, drop `puncts` tokens, then filter.
        # NOTE(review): the filter keeps a token when it is not all digits OR
        # when it is in `stopwords`, i.e. stop words are retained -- confirm
        # this is intended. `[A-z]` also spans the ASCII characters between
        # 'Z' and 'a'.
        text_list_clean.append([i for i in remove_from_list(re.sub('\s+', ' ', re.sub('\s\s+', ' ', re.sub('\s+\s+', ' ', Remove_Duplicates(re.sub(r"\b(?=[mdclxvii])m{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})([ii]x|[ii]v|v?[ii]{0,3})\b\.?", '', (unicodedata.normalize('NFKD', re.sub(' +', ' ', re.sub(r"\s+\s+"," ", re.sub(r"\\,",",", re.sub(r" \,",",", re.sub(r"\\.",".", re.sub(r" \.",".", re.sub(r"\(\s+\)","", re.sub(r"\(\)","", re.sub(r" \)","", re.sub(r"\( ","", remove_consecutive_nums(re.sub(r"\s+"," ", re.sub(r"([A-z])\- ([A-z])", r"\1\2", re.sub(r'\s', ' ', text)).replace('\'','').replace('. .', '.').replace('\'',''))))))))))))).lower())
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower())))))).split(), puncts) if not i.isdigit() or i in stopwords])
        del text

    # Join each token list with spaces, join the texts with '. ', re-split on
    # '.', capitalise every piece and append a final period.
    return '. '.join(x.strip().capitalize() for x in '. '.join(' '.join([word for word in sent]) for sent in text_list_clean).split('.')) + '.'


def get_cpc_texts(scheme_dir='cpc-data/CPCSchemeXML202105',
                  title_dir='cpc-data/CPCTitleList202202'):
    """
    Build a mapping from CPC context code (e.g. ``'A01'``) to the text
    ``"<section title>. <class title>"``.

    Adapted from Y Nakama's notebook:
    https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train

    The original used ``str.lstrip(pattern)`` to drop the ``"CODE\\t\\t"``
    prefix. ``lstrip`` strips a *set of characters*, not a prefix, so titles
    starting with one of the code's characters lost their first letters
    (``"A01\\t\\tAGRICULTURE"`` became ``"GRICULTURE"``). The title is now
    captured with a regex group instead.

    @param scheme_dir: directory whose file names contain the context codes
    @param title_dir: directory holding ``cpc-section-<X>_20220201.txt`` files
    @return: dict mapping context code -> combined title text
    @raise IndexError: when a code has no title line in its section file
    """
    def _title_for(code, listing):
        # A title line looks like "<code>\t\t<title>"; capture only the title.
        found = re.search(re.escape(code) + r'\t\t(.+)', listing)
        if found is None:
            raise IndexError(f'No title line found for CPC code {code!r}')
        return found.group(1)

    # Collect every "<letter><digits>" code mentioned in the scheme file names.
    contexts = set()
    for file_name in os.listdir(scheme_dir):
        contexts.update(re.findall(r'[A-Z]\d+', file_name))
    contexts = sorted(contexts)

    results = {}
    for cpc in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'Y']:
        with open(os.path.join(title_dir, f'cpc-section-{cpc}_20220201.txt')) as f:
            s = f.read()
        section_title = _title_for(cpc, s)
        for context in (c for c in contexts if c[0] == cpc):
            results[context] = section_title + ". " + _title_for(context, s)
    return results

In [None]:
# Read both splits from the configured input folder instead of repeating the
# path literal, so changing INPUT_DIR relocates every read.
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [None]:
# NOTE(security): torch.load unpickles the file -- only load trusted inputs.
cpc_texts = torch.load(f"../input/cpc-texts/cpc_texts.pth")

# Regex alternation over every stop word, built once instead of four times.
# NOTE(review): entries are not escaped and have no word boundaries, so this
# counts substring hits (e.g. every letter "a"), as the original did.
STOPWORD_PATTERN = '|'.join(stopwords)

# Fixed, sorted set of indicator columns so train and test always share the
# same one-hot columns even when a section is absent from one split.
SECTION_TITLES = sorted(table.values())


def add_phrase_features(frame, dataset_name):
    """Add the context, length and stop-word features to one split.

    Every feature is derived from `frame` itself. The previous code filled
    the train stop-word counts from the *test* frame, which misaligned the
    values and left most train rows empty.

    @param frame: DataFrame with `context`, `anchor` and `target` columns
    @param dataset_name: label written to the `dataset` column
    @return: new DataFrame with the feature columns appended
    """
    frame['general_context'] = frame['context'].apply(lambda x: table[x[0].upper()])
    section_dummies = pd.get_dummies(frame['general_context']).reindex(
        columns=SECTION_TITLES, fill_value=0)
    frame = pd.concat([frame, section_dummies], axis=1)

    frame['context_text'] = frame['context'].map(cpc_texts)

    frame['section'] = frame['context'].astype(str).str[0]
    frame['classes'] = frame['context'].astype(str).str[1:]

    frame['anchor_len'] = frame['anchor'].str.split().str.len()
    frame['target_len'] = frame['target'].str.split().str.len()
    frame['len_diff'] = np.abs(frame['target_len'] - frame['anchor_len'])

    frame['num_anchor_stops'] = frame['anchor'].str.count(STOPWORD_PATTERN)
    frame['num_target_stops'] = frame['target'].str.count(STOPWORD_PATTERN)

    frame['dataset'] = dataset_name
    return frame


train = add_phrase_features(train, 'train')
test = add_phrase_features(test, 'test')

In [None]:
# Drop rows whose index label is repeated, then renumber each split.
train = train.loc[~train.index.duplicated(keep='first')]
test = test.loc[~test.index.duplicated(keep='first')]
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
# Stack both splits (each keeps its own 0..n-1 index, as before).
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
df_all = pd.concat([train, test])

In [None]:
def _lemmatize_content_words(text):
    """Return the space-joined lemmas of the alphabetic tokens in `text`
    whose lower-cased lemma is not in `stopwords`."""
    kept_lemmas = []
    for token in nlp(text):
        if token.lemma_.lower() not in stopwords and token.is_alpha:
            kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)


# Same treatment for both phrase columns: anchor first, then target.
for phrase_col in ('anchor', 'target'):
    df_all[f'{phrase_col}_parsed'] = df_all[phrase_col].apply(_lemmatize_content_words)

In [None]:
# Parse every phrase once and keep the resulting spaCy documents.
for phrase_col in ('anchor', 'target'):
    df_all[f'{phrase_col}_nlp'] = df_all[phrase_col].apply(lambda phrase: nlp(phrase))

# Count tokens per part-of-speech tag; columns are created in the order
# anchor_<TAG>, target_<TAG> for VERB, NOUN, DET, ADJ.
for pos_tag in ('VERB', 'NOUN', 'DET', 'ADJ'):
    for phrase_col in ('anchor', 'target'):
        df_all[f'{phrase_col}_{pos_tag}'] = df_all[f'{phrase_col}_nlp'].apply(
            lambda doc, wanted=pos_tag: sum(1 for token in doc if token.pos_ == wanted)
        )

# Substring containment flags between the parsed phrase and the other raw phrase.
df_all['anchor_in_target'] = [
    parsed in raw for parsed, raw in zip(df_all['anchor_parsed'], df_all['target'])
]
df_all['target_in_anchor'] = [
    parsed in raw for parsed, raw in zip(df_all['target_parsed'], df_all['anchor'])
]

In [None]:
sims = df_all[["anchor_parsed", "target_parsed"]]

# spaCy similarity between the parsed anchor and target of every row.
# Iterating over the two columns directly replaces `sims.count()[0]` /
# `sims.iloc[i][0]`: integer keys on a label-indexed Series are a deprecated
# positional fallback, and `count()` is a non-null count, not a row count.
similarityValue = [
    nlp(anchor_text).similarity(nlp(target_text))
    for anchor_text, target_text in zip(sims["anchor_parsed"], sims["target_parsed"])
]

df_all['anchor_target_cos_sim'] = similarityValue

# Split back into the two datasets. `.copy()` makes them independent frames so
# that adding columns later does not write into a slice of `df_all`.
train = df_all.loc[df_all['dataset'] == 'train'].copy()
test = df_all.loc[df_all['dataset'] == 'test'].copy()

In [None]:
# Model input: anchor, target and CPC context text joined by the literal
# '[SEP]' marker; built the same way for both splits.
for split_frame in (train, test):
    split_frame['text'] = (
        split_frame['anchor']
        + '[SEP]'
        + split_frame['target']
        + '[SEP]'
        + split_frame['context_text']
    )

# Transformers

In [None]:
class CFG:
    """Inference settings: data location, checkpoint folders and fold count."""

    # Competition data folder.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # One folder per model family; each is expected to hold `fold<k>` sub-folders.
    model_path = [
        '../input/deberta-v3-5folds/',
        '../input/xlm-roberta-large-5folds/',
        '../input/electra-upppm/electra_upppm/',
    ]

    # Number of model families to run (all three listed above).
    model_num = len(model_path)

    # Folds per model family.
    num_fold = 5

In [None]:
def process_test(unit):
    """Tokenize the `text` field of one dataset row.

    Reads the module-level `tokenizer`, which the loop below rebinds for each
    model family before mapping.
    """
    return {**tokenizer(unit['text'])}


for i in range(CFG.model_num):
    model_dir = CFG.model_path[i]
    tokenizer = AutoTokenizer.from_pretrained(f'{model_dir}fold0')
    test_ds = datasets.Dataset.from_pandas(test[['text']])
    test_ds = test_ds.map(process_test)

    predictions_test = []
    for fold in range(CFG.num_fold):
        # Load the head exactly as the train-prediction cell does: a single
        # output per row. The previous `num_labels=5` / multi-label setting
        # did not match that cell and produced five values per row, which
        # cannot be stored in the single prediction column below.
        trainer = Trainer(
            AutoModelForSequenceClassification.from_pretrained(
                f'{model_dir}fold{fold}', num_labels=1),
            tokenizer=tokenizer,
        )

        predictions_test.append(trainer.predict(test_ds).predictions)
        # Release the model before the next fold is loaded.
        del trainer
        gc.collect()

    # Average over folds and flatten (n, 1) -> (n,) so one value lands per row.
    test[f"predictions_{model_dir.split('/')[-2]}"] = np.average(predictions_test, axis=0).ravel()

    del tokenizer, test_ds
    gc.collect()

In [None]:
def process_valid(unit):
    """Tokenize the `text` field of one row with the module-level `tokenizer`
    (rebound by the loop below for each model family)."""
    encoded = tokenizer(unit['text'])
    return dict(encoded)


for i in range(CFG.model_num):
    checkpoint_root = CFG.model_path[i]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_root + 'fold0')
    valid_ds = datasets.Dataset.from_pandas(train[['text']]).map(process_valid)

    # One prediction array per fold for this model family.
    predictions_valid = []
    for fold in range(CFG.num_fold):
        # The model is created inline so that `trainer` holds the only
        # reference and deleting it lets the model be collected.
        trainer = Trainer(
            AutoModelForSequenceClassification.from_pretrained(
                checkpoint_root + f'fold{fold}',
                num_labels=1,
            ),
            tokenizer=tokenizer,
        )
        fold_output = trainer.predict(valid_ds)
        predictions_valid.append(fold_output.predictions)
        del trainer
        gc.collect()

    # Column is named after the last folder of the checkpoint path.
    column_name = f"predictions_{checkpoint_root.split('/')[-2]}"
    train[column_name] = np.average(predictions_valid, axis=0)

    del tokenizer, valid_ds
    gc.collect()

In [None]:
# Identifier, raw-text and helper columns that must not enter the feature matrix.
NON_FEATURE_COLUMNS = [
    'id', 'anchor', 'target', 'context', 'score',
    'general_context', 'context_text', 'section', 'classes', 'dataset',
    'anchor_parsed', 'target_parsed', 'anchor_nlp', 'target_nlp', 'text',
]

# Everything that remains is cast to float32; the target is the `score` column.
feature_frame = train.drop(columns=NON_FEATURE_COLUMNS)
X = feature_frame.astype('float32')
y = train['score']

In [None]:
X

In [None]:
def slice_by_corr(X, r_min=0):
    """Keep the columns whose absolute correlation with an earlier column
    exceeds `r_min`.

    Only the strict upper triangle of the correlation matrix is inspected,
    so each column is compared with the columns to its left.
    NOTE(review): as a consequence the first column has nothing to be
    compared with and is never selected -- confirm this is intended.

    @param X: DataFrame of numeric features
    @param r_min: correlation threshold (exclusive)
    @return: `X` restricted to the selected columns
    """
    # Create correlation matrix
    corr_matrix = X.corr().abs()

    # Select upper triangle of correlation matrix. The builtin `bool` replaces
    # the `np.bool` alias, which was removed from NumPy in 1.24.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    # Find features with correlation greater than r_min
    selected = [column for column in upper.columns
                if (upper[column] > r_min).any()]
    return X[selected]

def variance_inflation_factor(X, exog_idx):
    """Variance inflation factor of one column: ``1 / (1 - R^2)``, where R^2
    comes from regressing that column on all the other columns.

    @param X: 2-D array of features (rows = observations)
    @param exog_idx: positional index of the column to evaluate
    @return: the VIF as a float; ``inf`` when the column is perfectly
        explained by the others
    """
    # Clean NaN/inf once so predictors and target are treated alike
    # (previously only the predictors were cleaned).
    clean_X = np.nan_to_num(X)
    sub_X = np.delete(clean_X, exog_idx, axis=1)
    sub_y = clean_X[:, exog_idx][np.newaxis].T

    sub_clf = LinearRegression(fit_intercept=True).fit(sub_X, sub_y)
    r_squared = r2_score(sub_y, sub_clf.predict(sub_X))

    # A perfect fit would divide by zero; report an infinite VIF instead so
    # the caller's "largest VIF" comparison still works.
    if r_squared >= 1.0:
        return float('inf')
    return 1 / (1 - r_squared)

class ReduceVIF(BaseEstimator, TransformerMixin):
    """Transformer that iteratively drops the column with the largest
    variance inflation factor until every remaining candidate is at or below
    `thresh`.

    Parameters
    ----------
    thresh : float
        Maximum tolerated VIF.
    nthreads : int
        Number of joblib threads used to compute the VIFs.
    r_min : float
        Correlation threshold forwarded to `slice_by_corr` to choose the
        candidate columns.
    obs : int
        Number of rows sampled for the VIF computation (capped at the number
        of available rows).
    random_state : int or None
        Seed for the row sampling; None keeps the previous unseeded behaviour.
    """

    def __init__(self, thresh=10.0, nthreads=4, r_min=0, obs=250,
                 random_state=None):
        self.thresh = thresh
        self.nthreads = nthreads
        self.r_min = r_min
        self.obs = obs
        self.random_state = random_state

    def fit(self, X, y=None):
        """Remember the data; `y` is accepted for pipeline compatibility and
        ignored."""
        self.X = X
        return self

    def transform(self, X):
        """Return ``(reduced_X, dropped_columns)`` for `X`."""
        return ReduceVIF.calculate_vif(X, self.thresh,
                                       self.nthreads,
                                       self.r_min,
                                       self.obs,
                                       self.random_state)

    @staticmethod
    def calculate_vif(X, thresh=10.0, nthreads=16, r_min=0, obs=250,
                      random_state=None):
        """Drop high-VIF columns from `X`.

        @return: tuple ``(X without the dropped columns, list of dropped
            column names)``
        """
        vif_cols = []
        X_vif_candidates = slice_by_corr(X, r_min)
        # Never ask for more rows than exist: `sample` raises otherwise.
        X_vif_candidates = X_vif_candidates.sample(
            n=min(obs, len(X_vif_candidates)), random_state=random_state)

        # A VIF needs at least one other column to regress on, so stop when
        # fewer than two candidates remain (this also covers "no candidates").
        while X_vif_candidates.shape[1] > 1:
            values = np.asarray(X_vif_candidates.values)
            with Parallel(n_jobs=nthreads, backend='threading') as parallel:
                vif = parallel(
                    delayed(variance_inflation_factor)(values, col_idx)
                    for col_idx in range(values.shape[1]))
            max_vif = max(vif)
            if not max_vif > thresh:
                break
            maxloc = vif.index(max_vif)
            worst_column = X_vif_candidates.columns.tolist()[maxloc]
            print(f'Dropping {X_vif_candidates.columns[maxloc]} with vif={max_vif}')
            vif_cols.append(worst_column)
            X_vif_candidates = X_vif_candidates.drop([worst_column], axis=1)

        if len(vif_cols) > 0:
            return X.drop(columns=vif_cols), vif_cols
        else:
            return X, vif_cols

    
def preprocess_x_y(X, nodrop_columns=None,
                   var_thr=0.95, remove_multi=True,
                   standardize=True, standardizer='mm',
                   std_dev=3, vif_thr=10, missingness_thr=0.50,
                   zero_thr=0.99, nthreads=4):
    """Clean a feature matrix through a fixed sequence of filters.

    Steps, in order: zero out near-zero values and drop overly sparse
    columns, drop columns with too many missing values, mean-impute, scale,
    drop low-variance columns and finally drop multicollinear columns via
    `ReduceVIF`. The function returns early (after printing a notice) as soon
    as a step leaves the feature space empty or too small.

    @param X: DataFrame of numeric features
    @param nodrop_columns: column names protected from the sparsity and
        variance filters (None or empty for no protection)
    @param var_thr: p in the variance threshold ``p * (1 - p)``
    @param remove_multi: whether to run the VIF reduction
    @param standardize: whether to scale the features
    @param standardizer: 'ss' for StandardScaler, anything else MinMaxScaler
    @param std_dev: unused in this function
    @param vif_thr: VIF threshold passed to `ReduceVIF`
    @param missingness_thr: maximum tolerated fraction of missing values
    @param zero_thr: maximum tolerated fraction of zeros (0 disables the step)
    @param nthreads: threads used by `ReduceVIF`
    @return: the processed DataFrame (row index is reset by the imputer step)
    """
    from colorama import Fore, Style

    # Avoid a shared mutable default; accept any iterable of names.
    nodrop_columns = list(nodrop_columns) if nodrop_columns is not None else []

    # Replace all near-zero with zeros
    # Drop excessively sparse columns with >zero_thr zeros
    if zero_thr > 0:
        X = X.apply(lambda x: np.where(np.abs(x) < 0.000001, 0, x))
        dense_enough = (X == 0).sum() < (float(zero_thr)) * X.shape[0]
        X_tmp = X.loc[:, dense_enough]

        if len(nodrop_columns) > 0:
            X = pd.concat([X_tmp, X[[i for i in X.columns if i in
                                     nodrop_columns and i not in
                                     X_tmp.columns]]], axis=1)
        else:
            X = X_tmp
        del X_tmp

        if X.empty or len(X.columns) < 5:
            print(f"\n\n{Fore.RED}Empty feature-space (Zero Columns): "
                  f"{X}{Style.RESET_ALL}\n\n")
            return X

    # Remove columns with excessive missing values
    X = X.dropna(thresh=len(X) * (1 - missingness_thr), axis=1)
    if X.empty:
        print(f"\n\n{Fore.RED}Empty feature-space (missingness): "
              f"{X}{Style.RESET_ALL}\n\n")
        return X

    # Apply a simple imputer (note that this assumes extreme cases of
    # missingness have already been addressed).
    imp1 = SimpleImputer()
    X = pd.DataFrame(imp1.fit_transform(X.astype('float32')),
                     columns=X.columns)

    # Standardize X
    if standardize is True:
        if standardizer == 'ss':
            scaler = StandardScaler()
        else:
            scaler = MinMaxScaler()
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Remove low-variance columns
    sel = VarianceThreshold(threshold=(var_thr*(1-var_thr)))
    sel.fit(X)
    keep_positions = [int(pos) for pos in sel.get_support(indices=True)]
    # Protected columns are appended after the selected ones. De-duplicating
    # keeps a protected column that already passed the filter from appearing
    # twice, and building a plain integer list avoids the float index that
    # concatenating with an empty array used to produce.
    keep_positions += [X.columns.get_loc(c) for c in nodrop_columns if c in X]
    keep_positions = list(dict.fromkeys(keep_positions))
    good_var_cols = X.columns[keep_positions]
    low_var_cols = [i for i in X.columns if i not in list(good_var_cols)]
    if len(low_var_cols) > 0:
        print(f"Dropping {low_var_cols} for low variance...")
    X = X[good_var_cols]

    if X.empty:
        print(f"\n\n{Fore.RED}Empty feature-space (low-variance): "
              f"{X}{Style.RESET_ALL}\n\n")
        return X

    # Remove multicollinear columns
    if remove_multi is True:
        try:
            rvif = ReduceVIF(thresh=vif_thr, nthreads=nthreads)
            X = rvif.fit_transform(X)[0]
        except Exception as err:
            # Best effort: keep the unreduced features, but say what actually
            # happened instead of claiming the feature space is empty.
            print(f"\n\n{Fore.RED}Multicollinearity reduction failed "
                  f"({err!r}); returning features without VIF filtering: "
                  f"{X}{Style.RESET_ALL}\n\n")
            return X
        if X.empty or len(X.columns) < 5:
            print(f"\n\n{Fore.RED}Empty feature-space "
                  f"(multicollinearity): "
                  f"{X}{Style.RESET_ALL}\n\n")
            return X

    print(f"\nX: {X}\n")
    print(f"Features: {list(X.columns)}\n")
    return X


class Razors(object):
    """
    Razors is a callable refit option for `GridSearchCV` whose aim is to
    balance model complexity and cross-validated score in the spirit of the
    "one standard error" rule of Breiman et al. (1984), which showed that
    the tuning hyperparameter associated with the best performing model may be
    prone to overfit. To help mitigate this risk, we can instead instruct
    gridsearch to refit the highest performing 'parsimonious' model, as defined
    using simple statistical rules (e.g. standard error (`sigma`),
    percentile (`eta`), or significance level (`alpha`)) to compare
    distributions of model performance across folds. Importantly, this
    strategy assumes that the grid of multiple cross-validated models
    can be principally ordered from simplest to most complex with respect to
    some target hyperparameter of interest. To use the razors suite, supply
    the result of `Razors.simplify(...)` as a callable directly to the
    `refit` argument of `GridSearchCV`.

    Parameters
    ----------
    cv_results_ : dict of numpy(masked) ndarrays
        See attribute cv_results_ of `GridSearchCV`. The dict is only read;
        it is never modified. It must contain per-fold keys of the form
        ``split<k>_test_<scoring>`` (i.e. the multi-metric key layout).
    param : str
        Parameter whose complexity will be optimized. Matched as a suffix of
        the parameter names found in ``cv_results_["params"]``.
    scoring : str
        Refit scoring metric.
    rule : str
        Rule for balancing model complexity with performance.
        Options are 'se', 'percentile', and 'ranksum'. Required (no default
        here; `Razors.simplify` defaults it to 'se').
    sigma : int
        Number of standard errors tolerance in the case that a standard error
        threshold is used to filter outlying scores across folds. Required if
        `rule`=='se'. Default is 1.
    eta : float
        Percentile tolerance in the case that a percentile threshold
        is used to filter outlier scores across folds. Required if
        `rule`=='percentile'. Default is 0.95.
    alpha : float
        An alpha significance level in the case that wilcoxon rank sum
        hypothesis testing is used to filter outlying scores across folds.
        Required if `rule`=='ranksum'. Default is 0.01.

    References
    ----------
    Breiman, Friedman, Olshen, and Stone. (1984) Classification and Regression
    Trees. Wadsworth.

    Notes
    -----
    Here, 'simplest' is defined by the complexity of the model as influenced by
    some user-defined target parameter (e.g. number of components, number of
    estimators, polynomial degree, cost, scale, number hidden units, weight
    decay, number of nearest neighbors, L1/L2 penalty, etc.).

    The callable API accordingly assumes that the `params` attribute of
    `cv_results_` 1) contains the indicated hyperparameter (`param`) of
    interest, and 2) contains a sequence of values (numeric, boolean, or
    categorical) that are ordered from least to most complex. Complexity is
    therefore taken to be the order in which each distinct value of `param`
    first appears in ``cv_results_["params"]``.
    """
    __slots__ = ('cv_results', 'param', 'param_complexity', 'scoring',
                 'rule', 'greater_is_better',
                 '_scoring_funcs', '_scoring_dict',
                 '_n_folds', '_splits', '_score_grid',
                 '_cv_means', '_sigma', '_eta', '_alpha')

    def __init__(
            self,
            cv_results_,
            param,
            scoring,
            rule,
            sigma=1,
            eta=0.95,
            alpha=0.01,
    ):
        import sklearn.metrics

        self.cv_results = cv_results_
        self.param = param
        self.scoring = scoring
        self.rule = rule
        self._scoring_funcs = [
            met
            for met in sklearn.metrics.__all__
            if met.endswith("_score") or met.endswith("_error")
        ]
        # `_score` metrics map to True (greater is better), `_error` metrics
        # map to False.
        self._scoring_dict = {
            met: met.endswith("_score") for met in self._scoring_funcs
        }
        self.greater_is_better = self._check_scorer()

        split_keys = [key for key in self.cv_results
                      if key.startswith('split')]
        # Distinct `split<k>` prefixes == number of CV folds
        self._n_folds = len({key.split('_')[0] for key in split_keys})
        # Extract subgrid corresponding to the scoring metric of interest
        self._splits = [key for key in split_keys
                        if key.endswith(f"test_{self.scoring}")]
        if not self._splits:
            # np.vstack([]) would otherwise fail with an opaque ValueError
            raise ValueError(
                f"No per-fold test scores found for scoring "
                f"'{self.scoring}' in cv_results_. Pass `scoring` to "
                f"GridSearchCV as a list or dict so that keys of the form "
                f"'split<k>_test_{self.scoring}' are produced.")
        # Rows are grid candidates, columns are folds
        self._score_grid = np.vstack([self.cv_results[cv] for cv in
                                      self._splits]).T
        self._cv_means = np.array(np.nanmean(self._score_grid, axis=1))
        self._sigma = sigma
        self._eta = eta
        self._alpha = alpha

    def _check_scorer(self):
        """
        Determine whether larger values of the refit scorer are better.

        Names found in `sklearn.metrics` (either verbatim or with a `_score`
        suffix appended) take their direction from the `_score`/`_error`
        suffix; `neg_*` scorers are treated as greater-is-better.

        Returns
        -------
        bool
            True when a larger score is better.

        Raises
        ------
        NotImplementedError
            If the scoring name cannot be resolved.
        """
        # NOTE(review): a bare `*_error` name resolves to lower-is-better;
        # confirm that such a scorer actually reports un-negated values in
        # cv_results_ before relying on that branch.
        known = self._scoring_dict
        if self.scoring in known:
            self.greater_is_better = known[self.scoring]
        elif f"{self.scoring}_score" in known:
            self.greater_is_better = known[f"{self.scoring}_score"]
        elif self.scoring.startswith("neg_"):
            self.greater_is_better = True
        else:
            raise NotImplementedError(f"Scoring metric {self.scoring} not "
                                      f"recognized.")
        return self.greater_is_better

    def _complexity_order(self, hyperparam):
        """
        Rank every grid candidate by the complexity of its `hyperparam` value.

        Complexity is the order of first appearance of each distinct value in
        ``cv_results["params"]`` (0 == simplest). Candidates that do not
        define the hyperparameter are ranked last (``inf``).
        """
        param_sets = self.cv_results["params"]
        ordered_values = []
        complexity = np.full(len(param_sets), np.inf)
        for idx, param_set in enumerate(param_sets):
            if hyperparam not in param_set:
                continue
            value = param_set[hyperparam]
            if value not in ordered_values:
                ordered_values.append(value)
            complexity[idx] = ordered_values.index(value)
        return complexity

    def _best_low_complexity(self):
        """
        Balance model complexity with cross-validated score.

        Return
        ------
        int
            Index of the grid candidate that has the lowest complexity among
            all candidates whose mean test score falls inside the tolerance
            band of the selected rule; ties in complexity are resolved in
            favour of the best mean test score.

        Raises
        ------
        KeyError
            If `param` does not match any parameter of the grid.
        ValueError
            If the tolerance of the selected rule is null, or no candidate
            falls inside the tolerance band.
        NotImplementedError
            If `rule` is not one of 'se', 'percentile', 'ranksum'.
        """

        # Check parameter(s) whose complexity we seek to restrict
        matches = [key for key in self.cv_results["params"][0].keys()
                   if key.endswith(self.param)]
        if not matches:
            raise KeyError(f"Parameter {self.param} not found in cv grid.")
        hyperparam = matches[0]

        # Select low complexity threshold based on specified evaluation rule
        if self.rule == "se":
            if not self._sigma:
                raise ValueError(
                    "For `se` rule, the tolerance "
                    "(i.e. `_sigma`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_standard_error()
        elif self.rule == "percentile":
            if not self._eta:
                raise ValueError(
                    "For `percentile` rule, the tolerance "
                    "(i.e. `_eta`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_percentile()
        elif self.rule == "ranksum":
            if not self._alpha:
                raise ValueError(
                    "For `ranksum` rule, the alpha-level "
                    "(i.e. `_alpha`) parameter cannot be null."
                )
            l_cutoff, h_cutoff = self.call_rank_sum_test()
        else:
            raise NotImplementedError(f"{self.rule} is not a valid "
                                      f"rule of RazorCV.")

        # Use a local boolean mask: writing to the `.mask` of the
        # `param_*` masked array would corrupt GridSearchCV's cv_results_.
        within_band = ((self._cv_means >= float(l_cutoff)) &
                       (self._cv_means <= float(h_cutoff)))
        survivors = np.flatnonzero(within_band)

        if survivors.size == 0:
            raise ValueError(
                "No valid grid columns remain within the "
                "boundaries of the specified razor "
                f"(low={l_cutoff}, high={h_cutoff}, "
                f"hyperparam={hyperparam}, cv means={self._cv_means})")

        # Keep only the simplest surviving setting of the target parameter...
        complexity = self._complexity_order(hyperparam)
        finalists = survivors[
            complexity[survivors] == complexity[survivors].min()]

        # ...and among those, return the best performer on average.
        finalist_means = self._cv_means[finalists]
        if self.greater_is_better:
            pick = np.nanargmax(finalist_means)
        else:
            pick = np.nanargmin(finalist_means)
        return int(finalists[pick])

    def call_standard_error(self):
        """
        Tolerance band of `sigma` standard errors around the mean score of
        the best performing candidate.

        Returns
        -------
        tuple of float
            ``(l_cutoff, h_cutoff)`` with ``l_cutoff <= h_cutoff``.
        """

        # Estimate the standard error across folds for each row of the grid
        cv_se = np.array(np.nanstd(self._score_grid, axis=1) /
                         np.sqrt(self._n_folds))

        if self.greater_is_better:
            best_score_idx = np.nanargmax(self._cv_means)
        else:
            best_score_idx = np.nanargmin(self._cv_means)

        # The band is symmetric around the best mean regardless of the
        # direction of the metric; only the choice of "best" depends on it.
        tolerance = self._sigma * cv_se[best_score_idx]
        l_cutoff = self._cv_means[best_score_idx] - tolerance
        h_cutoff = self._cv_means[best_score_idx] + tolerance

        return l_cutoff, h_cutoff

    def call_rank_sum_test(self):
        """
        Tolerance band spanned by the mean scores of the best performing
        candidate and of every candidate whose paired per-fold scores are not
        significantly different from it (Wilcoxon signed-rank test,
        p > `alpha`).

        Returns
        -------
        tuple of float
            ``(l_cutoff, h_cutoff)`` with ``l_cutoff <= h_cutoff``. When no
            other candidate is comparable, both equal the best mean score.
        """

        from scipy.stats import wilcoxon

        if self.greater_is_better:
            best_score_idx = int(np.nanargmax(self._cv_means))
        else:
            best_score_idx = int(np.nanargmin(self._cv_means))
        best_scores = self._score_grid[best_score_idx, :]

        # Grid indices (not positions within a subset) of comparable models;
        # the best model is trivially comparable to itself.
        comparable = [best_score_idx]
        for idx in range(self._score_grid.shape[0]):
            if idx == best_score_idx:
                continue
            scores = self._score_grid[idx, :]
            if np.array_equal(scores, best_scores):
                # Identical fold scores: the test is undefined for all-zero
                # differences, and the models are indistinguishable anyway.
                comparable.append(idx)
                continue
            p_value = wilcoxon(best_scores, scores)[1]
            if p_value > self._alpha:
                comparable.append(idx)

        comparable_means = self._cv_means[comparable]
        l_cutoff = np.nanmin(comparable_means)
        h_cutoff = np.nanmax(comparable_means)

        return l_cutoff, h_cutoff

    def call_percentile(self):
        """
        Tolerance band between the `eta` percentile and its complement
        (``1 - eta``) of the per-fold scores of the best performing candidate.

        Returns
        -------
        tuple of float
            ``(l_cutoff, h_cutoff)`` with ``l_cutoff <= h_cutoff``.
        """

        if self.greater_is_better:
            best_score_idx = np.nanargmax(self._cv_means)
        else:
            best_score_idx = np.nanargmin(self._cv_means)

        # Percentile and its inverse across the folds of the best candidate
        bound_a, bound_b = np.nanpercentile(
            self._score_grid[best_score_idx, :],
            [100 * self._eta, 100 - 100 * self._eta])

        # Order the bounds so that eta < 0.5 does not yield an empty band
        l_cutoff = min(bound_a, bound_b)
        h_cutoff = max(bound_a, bound_b)

        return l_cutoff, h_cutoff

    @staticmethod
    def simplify(param, scoring, rule='se', sigma=1, eta=0.68, alpha=0.01):
        """
        Callable to be run as `refit` argument of `GridsearchCV`.

        Parameters
        ----------
        param : str
            Parameter with the largest influence on model complexity.
        scoring : str
            Refit scoring metric.
        rule : str
            Rule for balancing model complexity with performance.
            Options are 'se', 'percentile', and 'ranksum'. Default is 'se'.
        sigma : int
            Number of standard errors tolerance in the case that a standard
            error threshold is used to filter outlying scores across folds.
            Only applicable if `rule`=='se'. Default is 1.
        eta : float
            Acceptable percent tolerance in the case that a percentile
            threshold is used. Only applicable if `rule`=='percentile'.
            Default is 0.68.
        alpha : float
            Alpha-level to use for signed wilcoxon rank sum testing.
            Only applicable if `rule`=='ranksum'. Default is 0.01.

        Returns
        -------
        callable
            A function taking `cv_results_` and returning the index of the
            selected grid candidate.
        """
        from functools import partial

        def razor_pass(
                cv_results_, param, scoring, rule, sigma, alpha, eta
        ):
            rcv = Razors(cv_results_, param, scoring, rule=rule,
                         sigma=sigma, alpha=alpha, eta=eta)
            return rcv._best_low_complexity()

        return partial(
            razor_pass,
            param=param,
            scoring=scoring,
            rule=rule,
            sigma=sigma,
            alpha=alpha,
            eta=eta,
        )

def divide_df(df_all, train_len):
    """
    Split a combined frame back into its train and test parts.

    Rows with index labels up to ``train_len - 1`` form the train part;
    the remaining rows form the test part, returned without the
    'target' column.
    """
    train_part = df_all.loc[:train_len - 1]
    test_part = df_all.loc[train_len:].drop(columns='target')
    return train_part, test_part

def concat_df(train_data, test_data):
    """
    Stack the train and test frames into one frame with a fresh 0..n-1
    index; columns are sorted when the two frames are not aligned.
    """
    stacked = pd.concat([train_data, test_data], sort=True)
    return stacked.reset_index(drop=True)

In [None]:
# Wrap the preprocessing routine in a transformer and clean the feature matrix.
# NOTE(review): `preprocess_x_y` and `X` come from earlier cells; the result is
# used as a DataFrame below (`.columns`) -- confirm that is what it returns.
preprocess = FunctionTransformer(preprocess_x_y)
X_clean = preprocess.fit_transform(X=X)
# Columns that survived preprocessing; reused later to select test-set features.
surviving_features = list(X_clean.columns)

In [None]:
seed = 42

# Hold out a test partition; the split is reproducible through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, random_state=seed
)

# Re-index every partition from zero, then cap its size:
# at most 10000 training rows and 2000 test rows.
X_train = X_train.reset_index(drop=True).head(10000)
y_train = pd.DataFrame(y_train).reset_index(drop=True).head(10000)

X_test = X_test.reset_index(drop=True).head(2000)
y_test = pd.DataFrame(y_test).reset_index(drop=True).head(2000)

In [None]:
# Short model names; each one is used as the pipeline step name and as the
# key into the hyperparameter grid.
models = ['rf']

# One estimator per entry of `models`, in the same order.
estimators = [
    RandomForestRegressor(min_samples_split=3, random_state=42),
]

In [None]:
# Hyperparameter grid, keyed by model name and then by the regressor's
# own parameter name.
params = {
    models[0]: dict(
        max_depth=[3, 4],
        n_estimators=[50, 75, 100],
        min_samples_leaf=[3, 5, 7],
    ),
}

In [None]:
inner_scoring = "neg_mean_absolute_error"
model_factory = {}

for name, estimator in zip(models, estimators):
    print(name)
    model_factory[name] = {}

    # Single-step pipeline: the regressor is fit on a min-max scaled target.
    pipe = Pipeline([
        (name, TransformedTargetRegressor(regressor=estimator,
                                          transformer=MinMaxScaler()))
    ])

    # Prefix every hyperparameter so it is routed through the pipeline step
    # to the wrapped regressor.
    model_params = {}
    for hyperparam, grid_values in params[name].items():
        model_params[f"{name}__regressor__{hyperparam}"] = grid_values

    # Refit callable: choose the candidate via the one-standard-error razor
    # on `n_estimators` instead of the plain best score.
    razor_refit = Razors.simplify(param=f'{name}__regressor__n_estimators',
                                  scoring=inner_scoring, rule="se", sigma=1)
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    pipe_grid_cv = GridSearchCV(pipe, model_params, scoring=[inner_scoring],
                                refit=razor_refit, cv=inner_cv, n_jobs=-1)
    pipe_grid_cv.fit(X_train, y_train.values.ravel())

    # NOTE(review): this cross-validates the whole grid search on the test
    # partition rather than scoring the model fitted above on it.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=seed + 1)
    model_factory[name]['oos_score'] = cross_val_score(
        pipe_grid_cv, X_test, y_test.values.ravel(),
        scoring='r2', cv=outer_cv)
    model_factory[name]['best_params'] = pipe_grid_cv.best_params_
    model_factory[name]['best_estimator'] = pipe_grid_cv.best_estimator_

# Rank models by their mean out-of-sample R^2 and keep the winner.
leaderboard = {}
for mod, results in model_factory.items():
    leaderboard[mod] = np.mean(results['oos_score'])

best_estimator_name = max(leaderboard, key=leaderboard.get)
best_estimator = model_factory[best_estimator_name]['best_estimator']

model_factory

In [None]:
# Show the hyperparameters of the selected model. Index by the winning model's
# name rather than by `name`, which is only the loop variable left over from
# the last iteration of the model-selection loop.
model_factory[best_estimator_name]['best_params']

In [None]:
# 10-fold shuffled splitter for evaluating the selected pipeline.
outer_best = KFold(n_splits=10, shuffle=True, random_state=seed)

# Fit the selected pipeline on the full cleaned data; later cells read its
# fitted state (feature importances, model dump, prediction).
best_estimator.fit(X_clean, y)

# Cross-validated R^2 over the full cleaned data; any failing fold raises.
scores = cross_val_score(best_estimator, X_clean, y, scoring='r2', cv=outer_best, n_jobs=-1, error_score='raise')
scores

In [None]:
# Plot feature importances of the fitted regressor inside the selected
# pipeline. The pipeline's single step is named after the model, so look it up
# through `best_estimator_name` instead of hard-coding 'rf'.
viz = FeatureImportances(best_estimator.named_steps[best_estimator_name].regressor_)
viz.fit(X_clean, y)
viz.show()

In [None]:
# Persist the fitted pipeline to the Kaggle working directory.
model_path = "/kaggle/working/rf_model.joblib"
dump(best_estimator, model_path)

In [None]:
# Load the competition's sample submission; its columns are overwritten below.
submission = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/sample_submission.csv')

In [None]:
# Standardize the surviving test features and predict with the selected pipeline.
# NOTE(review): the scaler is fit on the test data itself, so the test features
# are not transformed with the statistics used for the training features, and
# no imputation is applied here -- confirm this matches what `preprocess_x_y`
# did to the training data.
scaler = StandardScaler()
y_pred = best_estimator.predict(pd.DataFrame(scaler.fit_transform(test[surviving_features]), columns=surviving_features))

In [None]:
# Fill the submission frame with the test ids and the predicted scores.
submission['id'] = test['id']
submission['score'] = y_pred

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)