In [None]:
# Make the Google Drive folder visible to this Colab runtime so the dataset
# can be read from it further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
! pip install -U sentence-transformers
! pip install fugashi
! pip install ipadic

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 31.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 14.1 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |██

### Loading Libraries

In [None]:
import pandas as pd
import numpy as np

# `tqdm._tqdm_notebook` is a deprecated private alias (tqdm itself warns about
# it); `tqdm.notebook` is the supported public module for the same progress bar.
from tqdm.notebook import tqdm
tqdm.pandas()  # registers the pandas `progress_apply` integration

from sentence_transformers import SentenceTransformer
from transformers import BertJapaneseTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util

import torch

from sklearn.metrics import accuracy_score

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  after removing the cwd from sys.path.


In [None]:
# Seed every random number generator the notebook may draw from.
# NumPy alone is not enough: the training DataLoader shuffles with PyTorch's
# generator, so torch has to be seeded as well for repeatable runs.
import random

SEED = 13
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

### Load Data

In [None]:
# Project folder on the mounted Google Drive.
filepath = "/content/drive/MyDrive/Macnica/Projects/STS/"

# Read the first worksheet of the workbook. Using ExcelFile as a context
# manager closes the underlying file handle once the sheet has been parsed
# (the original left it open for the lifetime of the kernel).
with pd.ExcelFile(filepath + "data/dataset_JA_Clinical_STS.xls") as xls:
    df = xls.parse(0)

In [None]:
# Short column names used by every later cell:
# first sentence, second sentence, similarity label.
COLUMN_NAMES = ['s1', 's2', 'score']
df.columns = COLUMN_NAMES

In [None]:
class SentenceBertJapanese:
    """Sentence encoder: a BERT checkpoint followed by masked mean pooling.

    The tokenizer and model are loaded from ``model_name_or_path`` and the
    model is switched to evaluation mode. ``encode`` turns sentences into one
    fixed-size embedding per sentence.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load tokenizer and model and move the model to ``device``.

        Args:
            model_name_or_path: identifier or path accepted by
                ``from_pretrained``.
            device: anything ``torch.device`` accepts; when ``None`` CUDA is
                used if available, otherwise the CPU.
        """
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        # Use the normalised torch.device so model and inputs agree exactly.
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, ignoring positions masked out as padding.

        Args:
            model_output: model result whose first element holds the token
                embeddings.
            attention_mask: mask with 1 for real tokens and 0 for padding,
                one row per sentence.

        Returns:
            One pooled vector per sentence.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        """Embed ``sentences`` in batches and return a CPU tensor.

        Args:
            sentences: a single string or any iterable of strings (list,
                tuple, pandas Series, ...). A single string is treated as a
                one-element batch.
            batch_size: number of sentences tokenized and encoded together.

        Returns:
            Tensor with one row per input sentence, in input order. For empty
            input an empty ``(0, hidden_size)`` tensor is returned instead of
            raising.
        """
        if isinstance(sentences, str):
            # Slicing a bare string would split it into character chunks.
            sentences = [sentences]
        else:
            # Materialise so the tokenizer always receives a plain list,
            # whatever container the caller passed in.
            sentences = list(sentences)

        if not sentences:
            # torch.stack / torch.cat reject an empty sequence of tensors.
            return torch.empty((0, self.model.config.hidden_size))

        pooled_batches = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(
                batch, padding="longest", truncation=True, return_tensors="pt"
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Move each batch to the CPU straight away to free device memory.
            pooled_batches.append(pooled.to('cpu'))

        return torch.cat(pooled_batches)



In [None]:
from sentence_transformers import SentenceTransformer, util

# Candidate pretrained checkpoints; position k in this list backs `model{k+1}`.
models = [
    'distiluse-base-multilingual-cased-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'sonoisa/sentence-bert-base-ja-mean-tokens',
    'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
]

# The fourth checkpoint goes through the local SentenceBertJapanese wrapper;
# all others are loaded directly with SentenceTransformer.
model1, model2, model3 = (SentenceTransformer(name) for name in models[:3])
model4 = SentenceBertJapanese(models[3])
model5 = SentenceTransformer(models[4])


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertJapaneseTokenizer'.


In [None]:
# Import only the helper that is actually used. A wildcard import would add
# (or silently overwrite) an unknown set of names in the notebook namespace.
from cleaning import cleaning

# Run the project's text cleaning over both sentence columns.
df['s1'] = cleaning(df['s1'])
df['s2'] = cleaning(df['s2'])

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

In [None]:
from sklearn.model_selection import train_test_split

# 60% train; the remaining 40% is halved into test and dev.
# random_state pins both shuffles, so re-running this cell reproduces the same
# partition no matter how much global NumPy randomness was consumed earlier.
df_train, df_holdout = train_test_split(df, test_size=0.4, random_state=SEED)
df_test, df_dev = train_test_split(df_holdout, test_size=0.5, random_state=SEED)

# reset_index/assign return fresh frames, so later column assignments on the
# split frames do not write into a slice of `df`.
df_train = df_train.reset_index(drop=True).assign(split='train')
df_test = df_test.reset_index(drop=True).assign(split='test')
df_dev = df_dev.reset_index(drop=True).assign(split='dev')

# Recombine with a unique index; each part restarts at 0, so a plain concat
# would leave duplicated index labels.
frames = [df_train, df_test, df_dev]
df = pd.concat(frames, ignore_index=True)


In [None]:
# Which of the five loaded encoders to evaluate and fine-tune ('1'..'5').
# `n` stays a string because it is concatenated into column names ('cs' + n).
n = '5'

# Explicit lookup instead of exec() on a formatted string: same binding, but
# an invalid `n` fails with a clear KeyError and no dynamic code is run.
model = {'1': model1, '2': model2, '3': model3, '4': model4, '5': model5}[n]

In [None]:
# Score every split with the currently selected model (change `n` above to
# evaluate a different one). A pair's prediction is the cosine similarity of
# its two sentence embeddings, multiplied by 5 and rounded to an integer.
pred_col = 'cs' + n
split_reports = (
    ("Accuracy of Train cs", df_train),
    ("Accuracy of Test cs", df_test),
    ("Accuracy of Dev cs", df_dev),
)

for _, frame in split_reports:
    # Compute embedding for both lists
    embeddings1 = model.encode(frame['s1'].tolist())
    embeddings2 = model.encode(frame['s2'].tolist())

    # Compute cosine-similarities; entry [i][i] compares row i's s1 with row i's s2
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

    # Assign the whole column in one step. The previous element-wise
    # `frame[col][i] = value` was chained assignment: it raises
    # SettingWithCopyWarning and does not write through under copy-on-write.
    frame[pred_col] = [int(round(score * 5, 0)) for score in cosine_scores.diagonal().tolist()]

for label, frame in split_reports:
    print(label + n + ":", accuracy_score(frame['score'], frame[pred_col]))



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Accuracy of Train cs5: 0.23262153566560653
Accuracy of Test cs5: 0.21389645776566757
Accuracy of Dev cs5: 0.22752043596730245


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Retraining Models

In [None]:
from sentence_transformers import (
    InputExample,
    LoggingHandler,
    SentencesDataset,
    SentenceTransformer,
    losses,
    util,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# One InputExample per sentence pair, with the label divided by 5, routed to
# the list that matches the row's `split` value.
train_samples = []
dev_samples = []
test_samples = []

# Any split value other than 'dev' / 'test' falls through to the train list.
samples_by_split = {'dev': dev_samples, 'test': test_samples}

for first, second, raw_score, split_name in zip(df['s1'], df['s2'], df['score'], df['split']):
    example = InputExample(texts=[first, second], label=float(raw_score) / 5.0)
    samples_by_split.get(split_name, train_samples).append(example)




In [None]:
from torch.utils.data import DataLoader
import logging
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import gzip
import csv

# Checkpoint name of the selected model; only used to label the output folder.
# NOTE(review): model4 is the local SentenceBertJapanese wrapper, which defines
# no fit() method, so this cell only works for the SentenceTransformer models.
model_name = models[int(n)-1]

train_batch_size = 16
num_epochs = 4

# Timestamped output folder so repeated runs never overwrite each other.
model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Dev-set evaluator passed to fit() below.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Warm-up covers the first 10% of all training steps.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model, evaluating on the dev set along the way.
# output_path was previously missing: model_save_path was computed but never
# used, so the fine-tuned model was never written to disk.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/138 [00:00<?, ?it/s]

Iteration:   0%|          | 0/138 [00:00<?, ?it/s]

Iteration:   0%|          | 0/138 [00:00<?, ?it/s]

Iteration:   0%|          | 0/138 [00:00<?, ?it/s]

In [None]:
#Compute embedding for both lists
embeddings1 = model.encode(df_dev['s1'])
embeddings2 = model.encode(df_dev['s2'])

#Compute cosine-similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
df_dev['cs'+n] = ''
for i in range(len(df_dev['s1'])):
    df_dev['cs'+n][i] = cosine_scores[i][i].item()

from sklearn.metrics import accuracy_score
df_dev['cs'+n] = df_dev['cs'+n].apply(lambda x: int(round(x*5, 0)))

print("Accuracy of Dev cs" + n + ":", accuracy_score(df_dev['score'], df_dev['cs'+n]))


Accuracy of Dev cs5: 0.44005449591280654


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
#Compute embedding for both lists
embeddings1 = model.encode(df_test['s1'])
embeddings2 = model.encode(df_test['s2'])

#Compute cosine-similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
df_test['cs'+n] = ''
for i in range(len(df_test['s1'])):
    df_test['cs'+n][i] = cosine_scores[i][i].item()

from sklearn.metrics import accuracy_score
df_test['cs'+n] = df_test['cs'+n].apply(lambda x: int(round(x*5, 0)))

print("Accuracy of Test cs" + n + ":", accuracy_score(df_test['score'], df_test['cs'+n]))


Accuracy of Test cs5: 0.44005449591280654


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
#Compute embedding for both lists
embeddings1 = model.encode(df_train['s1'])
embeddings2 = model.encode(df_train['s2'])

#Compute cosine-similarities
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
df_train['cs'+n] = ''
for i in range(len(df_train['s1'])):
    df_train['cs'+n][i] = cosine_scores[i][i].item()

from sklearn.metrics import accuracy_score
df_train['cs'+n] = df_train['cs'+n].apply(lambda x: int(round(x*5, 0)))

print("Accuracy of Train cs" + n + ":", accuracy_score(df_train['score'], df_train['cs'+n]))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Accuracy of Train cs5: 0.825988187187642


In [None]:
# Accuracy of the stored predictions on all three splits, side by side.
for report_label, split_frame in (
    ("Accuracy of Train cs", df_train),
    ("Accuracy of Test cs", df_test),
    ("Accuracy of Dev cs", df_dev),
):
    print(report_label + n + ":", accuracy_score(split_frame['score'], split_frame['cs'+n]))



Accuracy of Train cs5: 0.825988187187642
Accuracy of Test cs5: 0.44005449591280654
Accuracy of Dev cs5: 0.44005449591280654


In [None]:
# Keep the most recently computed similarity matrix for inspection below.
# With top-to-bottom execution this is the train-split matrix.
# The original read `cosine_score`, a name never defined in this notebook
# (NameError on a fresh kernel); the matrix is stored as `cosine_scores`.
c_tr = cosine_scores

In [None]:
# Five largest similarities in row 0 of the matrix and their column positions.
top_hits = c_tr[0].topk(5)
values, indices = top_hits.values, top_hits.indices

print(values)
print(indices)

tensor([0.8804, 0.3554, 0.3306, 0.3280, 0.3264])
tensor([  3, 105, 482, 256, 261])


In [None]:
# Show the s2 sentences at the positions returned by topk above.
top_positions = indices.tolist()
df_train['s2'].iloc[top_positions]

3      7歳9か月時には上顎非裂側側切歯の萌出が また8歳6か月時には上顎裂側側切歯の萌出がみられた
105           手術 両側前頭開頭 interhemispheric approachを行った
482                               手術時間は2時間分 出血量はgであった
256         bracket装置を装着した上下顎ともレベリングに続いて犬歯の遠心移動前歯部の後退
261                3の治療点に置針し極超短波を併用した初回の治療直後痛みは軽快しfns
Name: s2, dtype: object

In [None]:
# Rows of the combined frame whose s1 equals the first dev sentence.
first_dev_sentence = df_dev['s1'].iloc[0]
df[df['s1'] == first_dev_sentence]

'中切歯萌出前には この歯胚が顎裂側に張り出していることが多い'