# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Install the notebook's extra dependencies into the runtime.
# NOTE(review): none of these are version-pinned; the captured log below shows
# this run resolved sentence-transformers 2.0.0 and transformers 4.10.2.
! pip install -U sentence-transformers
# fugashi / ipadic: presumably required by BertJapaneseTokenizer for Japanese
# word segmentation — confirm against the tokenizer's documentation.
! pip install fugashi
! pip install ipadic

Collecting sentence-transformers
  Downloading sentence-transformers-2.0.0.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 33.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.7 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 5.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.8 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downlo

### Loading Libraries

In [4]:
import pandas as pd

# Use the public notebook front-end; `tqdm._tqdm_notebook` is a private,
# deprecated compatibility shim for the same class.
from tqdm.notebook import tqdm
# Register `progress_apply` / `progress_map` on pandas objects.
tqdm.pandas()

from sentence_transformers import SentenceTransformer
from transformers import BertJapaneseTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util

import torch

from sklearn.metrics import accuracy_score


### Load Data

In [5]:
# Project root on the mounted Google Drive.
filepath = "/content/drive/MyDrive/Macnica/Projects/STS/"

# Read the first sheet of the workbook. The context manager closes the
# underlying file handle as soon as the sheet has been parsed.
with pd.ExcelFile(filepath + "data/dataset_JA_Clinical_STS.xls") as xls:
    df = xls.parse(0)

In [6]:
# Replace the sheet's header names with the short names used by the rest of
# the notebook. The assignment is positional — presumably the sheet holds
# sentence 1, sentence 2 and the similarity score in that order; verify
# against the workbook.
df.columns = ['s1', 's2', 'score']

In [7]:
class SentenceBertJapanese:
    """Sentence encoder that mean-pools the token embeddings of a BERT model.

    The tokenizer is loaded with ``BertJapaneseTokenizer`` and the weights with
    ``BertModel``; the model is switched to eval mode and moved to ``device``.

    Args:
        model_name_or_path: Checkpoint identifier or local path passed to both
            ``from_pretrained`` calls.
        device: Device string for the model. When ``None``, ``"cuda"`` is used
            if available, otherwise ``"cpu"``.
    """

    def __init__(self, model_name_or_path, device=None):
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(model_name_or_path)
        self.model = BertModel.from_pretrained(model_name_or_path)
        self.model.eval()

        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        # Move via the normalised torch.device so model and inputs always agree.
        self.model.to(self.device)

    def _mean_pooling(self, model_output, attention_mask):
        """Average token embeddings, weighting each token by its attention mask.

        Args:
            model_output: Model output whose first element holds the token
                embeddings.
            attention_mask: Mask tensor; positions with 0 do not contribute.

        Returns:
            One pooled vector per input row.
        """
        token_embeddings = model_output[0]
        # Broadcast the mask over the embedding dimension.
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp so a fully masked row cannot divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    @torch.no_grad()
    def encode(self, sentences, batch_size=8):
        """Encode sentences into pooled embeddings, ``batch_size`` at a time.

        Args:
            sentences: Iterable of strings (list, tuple, pandas Series, ...).
                It is materialised into a list so the tokenizer always
                receives a plain list per batch.
            batch_size: Number of sentences per forward pass.

        Returns:
            A CPU tensor with one row per sentence, in input order. Empty input
            yields a tensor of shape ``(0, hidden_size)``.
        """
        sentences = list(sentences)
        if not sentences:
            # torch.stack / torch.cat reject an empty list; return an empty
            # result with the model's embedding width instead of crashing.
            return torch.empty((0, self.model.config.hidden_size))

        pooled_batches = []
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]

            encoded_input = self.tokenizer.batch_encode_plus(
                batch, padding="longest", truncation=True, return_tensors="pt"
            ).to(self.device)
            model_output = self.model(**encoded_input)
            pooled = self._mean_pooling(model_output, encoded_input["attention_mask"])

            # Keep results on the CPU so accumulated batches do not hold
            # device memory.
            pooled_batches.append(pooled.to('cpu'))

        return torch.cat(pooled_batches, dim=0)



In [8]:
from sentence_transformers import SentenceTransformer, util
# Candidate encoders to compare. model1-model3 are loaded by checkpoint name
# through SentenceTransformer (multilingual checkpoints, going by their names).
model1 = SentenceTransformer('distiluse-base-multilingual-cased-v2')
model2 = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
model3 = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
# model4 goes through the SentenceBertJapanese wrapper defined in this notebook
# (BertJapaneseTokenizer + BertModel with mean pooling).
model4 = SentenceBertJapanese("sonoisa/sentence-bert-base-ja-mean-tokens")

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/610 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/730 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertJapaneseTokenizer'.


Downloading:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [10]:
from cleaning import *
# Run both sentence columns through the project's `cleaning` helper,
# replacing each column with its cleaned version.
for sentence_col in ('s1', 's2'):
    df[sentence_col] = cleaning(df[sentence_col])

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

  0%|          | 0/3669 [00:00<?, ?it/s]

In [None]:
# Encode both sentence columns with the selected model.
# Swap `model1` for model2 / model3 / model4 to evaluate a different encoder.
# `.tolist()` hands the encoder plain Python lists instead of a pandas Series,
# and `torch.as_tensor` accepts both return types used in this notebook
# (SentenceTransformer.encode -> numpy array by default,
#  SentenceBertJapanese.encode -> torch tensor).
embeddings1 = torch.as_tensor(model1.encode(df['s1'].tolist()))
embeddings2 = torch.as_tensor(model1.encode(df['s2'].tolist()))

# Row-wise cosine similarity between sentence i of s1 and sentence i of s2.
# Only matching pairs are needed, so this avoids building the full N x N
# similarity matrix just to read its diagonal. `cosine_scores` is therefore a
# 1-D tensor with one score per row of `df`.
cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

# Assign the whole column at once: no chained `df['cs1'][i] = ...` writes and
# the column gets a numeric dtype instead of object.
df['cs1'] = cosine_scores.tolist()

In [None]:
def _similarity_to_label(similarity):
    """Scale a similarity by 5 and round it to the nearest integer."""
    return int(round(similarity * 5, 0))

# Turn the raw similarities into integer predictions, then compare them with
# the reference scores.
df['cs1'] = df['cs1'].apply(_similarity_to_label)

cs1_accuracy = accuracy_score(df['score'], df['cs1'])
print("Accuracy of cs1:", cs1_accuracy)

## Retraining Models

In [17]:
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses

score = df['score'] / 5  # Normalize score to range 0 ... 1

# One InputExample per sentence pair. Passing the whole Series objects would
# create a single example holding entire columns rather than one example per
# row, which is not what the dataset / loss expect.
train_examples = [
    InputExample(texts=[sentence1, sentence2], label=float(pair_score))
    for sentence1, sentence2, pair_score in zip(df['s1'], df['s2'], score)
]

train_dataset = SentencesDataset(train_examples, model1)


In [25]:
# Build one training example per row: the two sentences plus the reference
# score divided by 5.
train_samples = []
for sentence1, sentence2, raw_score in zip(df['s1'], df['s2'], df['score']):
    score = float(raw_score) / 5.0
    train_samples.append(InputExample(texts=[sentence1, sentence2], label=score))



In [None]:
from torch.utils.data import DataLoader
import logging
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import os
import gzip
import csv

train_batch_size = 16
num_epochs = 4

# Shuffled mini-batches of the per-pair training examples.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# Loss is built around model1, the model being fine-tuned below.
train_loss = losses.CosineSimilarityLoss(model=model1)

# Warm up the learning rate over the first 10% of all training steps
# (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model. Evaluation is skipped: no evaluator is defined anywhere in
# this notebook, so passing `evaluator=evaluator` raised a NameError before
# training could start.
model1.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)


In [None]:
#Compute embedding for both lists
# Re-encode both sentence columns with the fine-tuned model1.
# `.tolist()` hands the encoder plain Python lists instead of a pandas Series;
# `torch.as_tensor` accepts either a numpy array or a tensor from `encode`.
embeddings1 = torch.as_tensor(model1.encode(df['s1'].tolist()))
embeddings2 = torch.as_tensor(model1.encode(df['s2'].tolist()))

# Row-wise cosine similarity between sentence i of s1 and sentence i of s2.
# Only matching pairs are needed, so the full N x N matrix is never built;
# `cosine_scores` is a 1-D tensor with one score per row of `df`.
cosine_scores = torch.nn.functional.cosine_similarity(embeddings1, embeddings2, dim=1)

# Single vectorised assignment: avoids chained `df['cs1'][i] = ...` writes and
# gives the column a numeric dtype.
df['cs1'] = cosine_scores.tolist()

In [None]:
from sklearn.metrics import accuracy_score
# Scale each similarity by 5 and round to the nearest integer to get a
# predicted score.
df['cs1'] = df['cs1'].apply(lambda x: int(round(x*5, 0)))

# Score the column that was just computed. The notebook never creates a 'cs4'
# column, so reading df['cs4'] raised a KeyError.
print("Accuracy of cs1:", accuracy_score(df['score'], df['cs1']))
