In [100]:
from typing import List
import numpy as np

In [1]:
from transformers import pipeline

In [3]:
import transformers as tr

In [69]:
model = 'mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es'

In [79]:
featurizer = pipeline( 'feature-extraction', device=0, model='mrm8488/bert-spanish-cased-finetuned-pos' )

In [232]:
del q_norms

In [185]:
def avg_feats( feat_list: List[List[float]] ):
    arr = np.array( feat_list )
    # print( arr.shape )
    agg = arr.mean(axis=0)
    x = agg - agg.mean()
    sigma = np.sqrt( (x ** 2).sum() )
    return x / sigma

def avg_feats_for_each( predict_result: List[List[List[float]]]):
    return np.vstack([avg_feats(feats) for feats in predict_result])

In [None]:
def to_numpy_normalized( feat : List[List[float]] ):
    mat = np.array( feat )
    means = mat.mean(axis=1, keepdims=True)
    print( means.shape )
    mat -= means
    sigmas = np.sqrt( (mat ** 2).sum(axis=1, keepdims=True) ) 
    return mat / sigmas

In [235]:
descs = [
    """Cabrera 85-12, es un proyecto de apartamentos tipo loft, ubicado en el corazón gastronómico y cultural de Bogotá. El edificio se encuentra sobre la calle 85 rodeado de restaurantes, centros comerciales, el centro financiero y vías de acceso. Los apartamentos cuentan con acabados y se entregan semi-dotados. Cuenta con zonas comunes como terraza con TRX, salón de trabajo, lavandería y parqueaderos.""",
    "DEC0 120 es un proyecto ubicado en una de las mejores y más exclusivas zonas de la ciudad, rodeado de parques y calles comerciales, es el lugar perfecto para vivir. Con excelentes vías de acceso y los mejores restaurantes y comercios a solo pasos del proyecto, hacen de este edificio con acabados de lujo, una hermosa terraza y apartamentos de 1 y 2 habitaciones uno de los proyectos mas destacados del sector.".replace(' 2 ', ' dos ')
]

qrys = [ "apartamentos tipo loft",
         "apartamentos de dos habitaciones",
         "apartamentos de 2 habitaciones",
         "apartamentos de tres habitaciones Bogotá",
         "casass de tres habitaciones"]


In [238]:
desc_feats = [ to_numpy_normalized(feat) for feat in featurizer.predict( descs ) ]

q_feats = [ to_numpy_normalized(feat) for feat in featurizer.predict( qrys ) ]

q_mat = avg_feats_for_each( q_feats )

# for feat in desc_feats: 
#    print( feat.shape )
#    print( (feat ** 2).sum(axis=1))

(84, 1)
(84, 1)
(7, 1)
(7, 1)
(7, 1)
(7, 1)
(7, 1)


In [220]:
desc_mat = avg_feats_for_each( desc_feats )
desc_norms = np.sqrt( (desc_mat * desc_mat).sum(axis=1))

In [189]:
desc_norms

array([1., 1.])

In [126]:
desc_mat.shape

(2, 768)

In [159]:
desc_mat.mean(axis=1), q_mat.mean(axis=1)

(array([5.78241159e-18, 0.00000000e+00]),
 array([ 0.00000000e+00, -1.15648232e-17,  4.62592927e-18, -1.50342701e-17]))

In [180]:
np.dot( q_mat, desc_mat.transpose() ) / q_norms.reshape(-1, 1) / desc_norms.reshape(1, -1)

array([[0.73820766, 0.68683912],
       [0.78245837, 0.72678742],
       [0.8830536 , 0.88933139],
       [0.70350262, 0.66868772],
       [0.70216861, 0.65906851]])

In [182]:
# for f in feats[1]: print( len(f))

In [237]:
def q_avg_best_match( q_feat_avg: np.array, desc_feats: np.array ):
    return np.max( np.dot( desc_feats, q_feat_avg ) )

In [240]:
for q_i, q in enumerate(qrys):
    print( q_i, q ) 
    for d_i, desc_feat in enumerate(desc_feats):
        score =  q_avg_best_match( q_mat[q_i, :], desc_feat )
        print( '\t', score )

0 apartamentos tipo loft
	 0.8272203830852722
	 0.8099554336580235
1 apartamentos de dos habitaciones
	 0.7568476878900734
	 0.7667325027540492
2 apartamentos de 2 habitaciones
	 0.772081891032358
	 0.782980245204741
3 apartamentos de tres habitaciones Bogotá
	 0.7691742877510046
	 0.7885440435250596
4 casass de tres habitaciones
	 0.7717626807417166
	 0.7779716321332767


In [66]:
classifier( {'question': 'tienen aptos con balcon?', 
             'context': 'precios desde $400, el area es 20 m2. la haus es una inmobiliaria digital. todo lo que quieras'})



{'score': 0.15911121666431427, 'start': 8, 'end': 13, 'answer': 'desde'}

In [6]:
help( pipeline )

Help on function pipeline in module transformers.pipelines:

pipeline(task: str, model: Optional = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, NoneType] = None, framework: Union[str, NoneType] = None, **kwargs) -> transformers.pipelines.Pipeline
    Utility factory method to build a :class:`~transformers.Pipeline`.
    
    Pipelines are made of:
    
        - A :doc:`tokenizer <tokenizer>` in charge of mapping raw textual input to token.
        - A :doc:`model <model>` to make predictions from the inputs.
        - Some (optional) post processing for enhancing model's output.
    
    Args:
        task (:obj:`str`):
            The task defining which pipeline will be returned. Currently accepted tasks are:
    
            - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`.
            - :obj:`"sentiment-analysis"`:

In [68]:
from transformers import AutoTokenizer

In [72]:
tokenizer = AutoTokenizer.from_pretrained( model ) 

In [76]:
tokenizer.tokenize( "I'm in love")

['i', '[UNK]', 'm', 'in', 'lo', '##ve']

In [52]:
classifier(['I love you', 'we hope you don''t hate it'])

[{'label': 'POSITIVE', 'score': 0.9998656511306763},
 {'label': 'NEGATIVE', 'score': 0.9600155353546143}]

In [50]:
featurizer.model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [49]:
featurizer = pipeline('feature-extraction', device=0)

In [46]:
featurizer.predict( 'yeah right' )[0][0]

[0.2736629545688629,
 0.17628303170204163,
 -0.05449565127491951,
 -0.22952410578727722,
 -0.24320414662361145,
 -0.10173299908638,
 0.3366072475910187,
 -0.0261160247027874,
 0.04735410585999489,
 -1.099527359008789,
 -0.2792090177536011,
 0.11661259829998016,
 -0.13834711909294128,
 0.034527137875556946,
 -0.46139174699783325,
 0.08165648579597473,
 0.0911460667848587,
 0.10929464548826218,
 -0.02681615948677063,
 -0.2182285487651825,
 0.07168041914701462,
 -0.3322373330593109,
 0.5394918322563171,
 -0.20307424664497375,
 0.11105145514011383,
 0.07048407942056656,
 0.26183366775512695,
 0.18038900196552277,
 -0.22586673498153687,
 0.34462854266166687,
 0.16476604342460632,
 0.10727236419916153,
 -0.023726899176836014,
 -0.11883237957954407,
 -0.14308322966098785,
 0.150472491979599,
 -0.012073159217834473,
 -0.23489944636821747,
 -0.07982596009969711,
 -0.11661696434020996,
 -0.4239788353443146,
 0.12952463328838348,
 0.5323837399482727,
 -0.041764579713344574,
 0.14296166598796844,


In [45]:
help(featurizer)

Help on FeatureExtractionPipeline in module transformers.pipelines object:

class FeatureExtractionPipeline(Pipeline)
 |  FeatureExtractionPipeline(model: Union[ForwardRef('PreTrainedModel'), ForwardRef('TFPreTrainedModel')], tokenizer: transformers.tokenization_utils.PreTrainedTokenizer, modelcard: Union[transformers.modelcard.ModelCard, NoneType] = None, framework: Union[str, NoneType] = None, args_parser: transformers.pipelines.ArgumentHandler = None, device: int = -1, task: str = '')
 |  
 |  Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base
 |  transformer, which can be used as features in downstream tasks.
 |  
 |  This feature extraction pipeline can currently be loaded from :func:`~transformers.pipeline` using the task
 |  identifier: :obj:`"feature-extraction"`.
 |  
 |  All models may be used for this pipeline. See a list of all models, including community-contributed models on
 |  `huggingface.co/models <https://huggingfa

In [15]:
import os
from pathlib import Path

In [20]:
dir(classifier)

['__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_args_parser',
 '_forward',
 '_parse_and_tokenize',
 'binary_output',
 'check_model_type',
 'default_input_names',
 'device',
 'device_placement',
 'ensure_tensor_on_device',
 'framework',
 'model',
 'modelcard',
 'predict',
 'return_all_scores',
 'save_pretrained',
 'task',
 'tokenizer',
 'transform']

In [34]:
classifier.predict(["whatever you want", "Yeah right..."] )

[{'label': 'POSITIVE', 'score': 0.995046079158783},
 {'label': 'POSITIVE', 'score': 0.9995409250259399}]