In [None]:
!pip install transformers

In [None]:
!pip install datasets

## DistilBERT Test

In [31]:
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertForSequenceClassification
# Tokenizer is loaded under the "distilbert-base-multilingual-cased" identifier.
distil_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
# Encoder weights are loaded from "Distil_Model".
# NOTE(review): presumably a local checkpoint directory — confirm that its vocabulary
# matches the tokenizer loaded above.
distil_model = DistilBertModel.from_pretrained("Distil_Model")




Some weights of the model checkpoint at Distil_Model were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# Smoke test: encode one sentence as PyTorch tensors and run it through the encoder.
# NOTE(review): this call is not wrapped in torch.no_grad(), unlike the later embedding cells.
inputs = distil_tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = distil_model(**inputs)


In [33]:
import torch
# Mean-pool the token states along dim 1, then drop the singleton dimensions.
vec_out = outputs.last_hidden_state.mean(dim=1).squeeze()

In [34]:
# Sanity check on the size of the pooled sentence vector.
vec_out.shape

torch.Size([768])

In [35]:
from datasets import load_dataset
# Load the title/conference CSV through the generic "csv" builder.
# The path is relative, so the file has to be in the notebook's working directory.
ds = load_dataset("csv", data_files="title_conference.csv", split="train")


Using custom data configuration default-aea0528d0efab2d2
Reusing dataset csv (C:\Users\paperspace\.cache\huggingface\datasets\csv\default-aea0528d0efab2d2\0.0.0\652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


In [36]:
# Hold out 10% of the titles for evaluation. A fixed seed keeps the split (and therefore
# every downstream prediction file) reproducible across kernel restarts.
# NOTE: `ds` is rebound from a single Dataset to a DatasetDict here, so re-running this
# cell without first re-running the load cell is expected to fail.
ds = ds.train_test_split(test_size=0.1, seed=42)

In [37]:
# Show the resulting splits and their row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Title', 'Conference'],
        num_rows: 2256
    })
    test: Dataset({
        features: ['Title', 'Conference'],
        num_rows: 251
    })
})

In [38]:
def _embed_title_distil(example):
    # Tokenise one title, run DistilBERT, and mean-pool the token states (dim=1)
    # into a single vector that is stored as a NumPy array.
    encoded = distil_tokenizer(example["Title"], return_tensors="pt")
    token_states = distil_model(**encoded).last_hidden_state
    pooled = token_states.mean(dim=1).squeeze()
    return {'bert_embeddings': pooled.numpy()}


# Gradient tracking is switched off for the whole pass over the train split.
with torch.no_grad():
    ds_with_bert_embeddings_v2 = ds["train"].map(_embed_title_distil)

  0%|          | 0/2256 [00:00<?, ?ex/s]

In [39]:
# Confirm the new 'bert_embeddings' column was added.
ds_with_bert_embeddings_v2

Dataset({
    features: ['Title', 'Conference', 'bert_embeddings'],
    num_rows: 2256
})

In [40]:
# Build a FAISS index over the embedding column; predict() queries it by this column name.
ds_with_bert_embeddings_v2.add_faiss_index(column='bert_embeddings')

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['Title', 'Conference', 'bert_embeddings'],
    num_rows: 2256
})

In [41]:
def predict(query):
    """Return the ``Conference`` of the indexed training title nearest to ``query``.

    The query is embedded the same way as the indexed titles: DistilBERT token
    states are mean-pooled along dim 1 and squeezed. The resulting vector is
    looked up in the 'bert_embeddings' index of ``ds_with_bert_embeddings_v2``.
    """
    with torch.no_grad():
        encoded_query = distil_tokenizer(query, return_tensors="pt")
        token_states = distil_model(**encoded_query).last_hidden_state
        question_embedding = token_states.mean(dim=1).squeeze().numpy()

    # Three neighbours are requested, but only the first one is used.
    _scores, neighbours = ds_with_bert_embeddings_v2.get_nearest_examples(
        'bert_embeddings', question_embedding, k=3
    )
    return neighbours["Conference"][0]

In [42]:
def _predict_for_row(example):
    # Attach the nearest-neighbour conference label for this test title.
    return {'Predictions': predict(example["Title"])}


# Run the nearest-neighbour prediction over every row of the held-out split.
with torch.no_grad():
    ds_with_test_predictions = ds["test"].map(_predict_for_row)

  0%|          | 0/251 [00:00<?, ?ex/s]

In [43]:
# Write the test split, including its 'Predictions' column, to a CSV in the working directory.
# The integer displayed below the cell is the value returned by to_csv.
ds_with_test_predictions.to_csv("distil_predictions_conference_v4.csv")

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

20832

## mT5 Test

In [110]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [46]:
from transformers import MT5Tokenizer, MT5EncoderModel
# Load the mT5-small tokenizer and the encoder-only model class; the load warning
# below reports the checkpoint's decoder weights as unused.
t5_tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
t5_model = MT5EncoderModel.from_pretrained("google/mt5-small")
# Smoke test on a single sentence; only input_ids are passed to the encoder.
# NOTE: this rebinds `outputs`, replacing the DistilBERT result from the earlier section.
input_ids = t5_tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids 
outputs = t5_model(input_ids=input_ids)

Downloading spiece.model:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/mt5-small were not used when initializing MT5EncoderModel: ['decoder.block.4.layer.1.layer_norm.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.3.layer.0.layer_norm.weight', 'decoder.block.1.layer.1.layer_norm.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', 'decoder.block.5.layer.0.layer_norm.weight', 'decoder.block.5.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.1.layer.1.EncDecAttention.o.weight', 'decoder.block.3.layer.1.EncDecAttention.k.weight', 'decoder.embed_tokens.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.6.layer.2.DenseReluDense.wi_0.weight', 'decoder.block.7.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decoder.b

In [47]:
# Inspect the encoder output shape for the sample sentence.
outputs.last_hidden_state.shape

torch.Size([1, 16, 512])

In [48]:
import torch

In [49]:
def _embed_title_t5(example):
    # Tokenise one title, run the mT5 encoder on its input_ids, and mean-pool the
    # token states (dim=1) into a single vector that is stored as a NumPy array.
    token_ids = t5_tokenizer(example["Title"], return_tensors="pt").input_ids
    token_states = t5_model(input_ids=token_ids).last_hidden_state
    pooled = token_states.mean(dim=1).squeeze()
    return {'t5_embeddings': pooled.numpy()}


# `ds` holds both splits here, so the embedding column is added to train and test alike.
with torch.no_grad():
    ds_with_t5_embeddings = ds.map(_embed_title_t5)

  0%|          | 0/2256 [00:00<?, ?ex/s]

  0%|          | 0/251 [00:00<?, ?ex/s]

In [50]:
# Index only the train split; t5_predict() queries this index by the same column name.
ds_with_t5_embeddings["train"].add_faiss_index(column='t5_embeddings')

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['Title', 'Conference', 't5_embeddings'],
    num_rows: 2256
})

In [51]:
def t5_predict(query):
    """Return ``"<Conference>|<Title>"`` for the indexed training title nearest to ``query``.

    The query is embedded the same way as the indexed titles: mT5 encoder token
    states are mean-pooled along dim 1 and squeezed. The resulting vector is
    looked up in the 't5_embeddings' index of the train split.
    """
    with torch.no_grad():
        token_ids = t5_tokenizer(query, return_tensors="pt").input_ids
        token_states = t5_model(input_ids=token_ids).last_hidden_state
        question_embedding = token_states.mean(dim=1).squeeze().numpy()

    # Three neighbours are requested, but only the first one is used.
    _scores, neighbours = ds_with_t5_embeddings["train"].get_nearest_examples(
        't5_embeddings', question_embedding, k=3
    )
    best_conference = neighbours["Conference"][0]
    best_title = neighbours["Title"][0]
    return best_conference + "|" + best_title

In [52]:
def _t5_predict_for_row(example):
    # Attach the "<Conference>|<Title>" string of the nearest training title.
    return {'Predictions': t5_predict(example["Title"])}


# Run the nearest-neighbour prediction over every row of the held-out split.
with torch.no_grad():
    ds_with_t5_predictions = ds["test"].map(_t5_predict_for_row)

  0%|          | 0/251 [00:00<?, ?ex/s]

In [53]:
# Write the test split, including its 'Predictions' column, to a CSV in the working directory.
# The integer displayed below the cell is the value returned by to_csv.
ds_with_t5_predictions.to_csv('Conference_Predictions_MT5.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

38894

In [16]:
# NOTE(review): in the visible cells `retrieved_examples` is only assigned inside
# predict()/t5_predict(), and no visible dataset has an "answer" column. This looks like
# a leftover from another experiment (execution count 16) and would presumably raise
# NameError on a fresh Restart & Run All — TODO confirm and remove.
retrieved_examples["answer"]

['We will provide tracking number once the order has been shipped',
 'We will provide shipment details once the order has been shipped',
 'The customer can place return request at returns@dncfashionista.com']