In [1]:
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim
# Shared sentence-embedding model used by the encode() calls in the cells below.
model = SentenceTransformer("stsb-xlm-r-multilingual")

from glob import glob
# NOTE(review): star imports hide where names come from. clean_text, data_path,
# input_root and Debug_print are used below without a visible definition and
# presumably come from these two modules -- confirm and import them explicitly.
# Statement order is kept as-is because a star import can rebind earlier names.
from bert_utils import *
from config import *

from pandarallel import pandarallel
# Set up pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.

****** SEED fixed : 42 ******


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Read both splits, remember how many rows belong to train, and stack them
# into a single frame so the text cleaning is applied once to everything.
train = pd.read_csv(data_path + "train.csv")
test = pd.read_csv(data_path + "test.csv")
train_shape = len(train)
df = pd.concat([train, test], ignore_index=True)
df["clean_text"] = df["text"].map(clean_text)

In [3]:
# Load the target corpus data --
corpus_paths = glob(f"{input_root}*.feather")
Debug_print(corpus_paths)

corpus_dfs = []
for corpus_path in corpus_paths:
    # Base file name up to its first dot, used as a per-corpus prefix for row ids.
    corpus_name = corpus_path.split("/")[-1].split(".")[0]
    _df = pd.read_feather(corpus_path).reset_index(drop=False, names="id")
    # Turn the former index into "<corpus name>_<index>".
    _df["id"] = corpus_name + "_" + _df["id"].astype(str)
    corpus_dfs.append(_df)
corpus_df = pd.concat(corpus_dfs)

[33m['./input/news4vip.feather', './input/newsplus.feather', './input/livejupiter.feather'][39m


In [4]:
# Encode every cleaned train/test text with the sentence-embedding model.
df_embeddings = model.encode(df["clean_text"].values)

In [5]:
#corpus_embeddings = model.encode(corpus_df["clean_text"].values)

In [6]:
# Reference sentence(s) to compare against, encoded as a tensor.
sentences = ["私はうさぎです"]
targets = model.encode(sentences, convert_to_tensor=True)
targets.shape

torch.Size([1, 768])

In [7]:
# Query sentence to score against `targets`; encoded as a tensor.
sentence = "僕はクマ"
emb = model.encode(sentence, convert_to_tensor=True)

In [8]:
# Cosine similarity between the query embedding and each target embedding.
scores = pytorch_cos_sim(emb, targets)

In [9]:
# Show the similarity score(s).
scores

tensor([[0.4201]], device='cuda:0')

### 並列化下書き --

In [10]:
def task(_n):
    """Deliberately slow toy workload for the parallelism timing cells.

    Adds up the integers 1.._n, sleeping 0.1 s per term, and returns the sum
    (0 when ``_n`` is less than 1).
    """
    total = 0
    for offset in range(_n):
        total += offset + 1
        time.sleep(0.1)  # simulated blocking work
    return total

In [11]:
# Baseline: run the toy task sequentially for n = 1..10 and time it.
ns = list(np.arange(1, 11))

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

sms_single = [task(n) for n in ns]

end = time.perf_counter()
delta = end - start
print(np.round(delta, 3))

5.508


* 並行処理（マルチスレッド）

In [12]:
# Same workload on a pool of 8 threads. The task spends its time in
# time.sleep(), so the threads can overlap the waiting.
# perf_counter() is used because it is a monotonic clock meant for timing.
start = time.perf_counter()

with ThreadPoolExecutor(max_workers=8) as executor:
    # map() yields results in input order; collect them before the pool closes.
    sms_multi = list(executor.map(task, ns))

end = time.perf_counter()
delta = end - start
print(np.round(delta, 3))

1.204


* 並列処理（マルチプロセス）

In [13]:
# Same workload on a pool of 16 worker processes (only 10 tasks are submitted).
# perf_counter() is used because it is a monotonic clock meant for timing.
start = time.perf_counter()

with ProcessPoolExecutor(max_workers=16) as executor:
    # map() yields results in input order; collect them before the pool closes.
    sms_multi = list(executor.map(task, ns))

end = time.perf_counter()
delta = end - start
print(np.round(delta, 3))

1.859


In [14]:
# NOTE(review): recomputes `df_embeddings` from the same column as In [4], now
# passing convert_to_tensor=False explicitly -- presumably redundant; confirm
# and keep only one of the two cells.
df_embeddings = model.encode(df["clean_text"].values, convert_to_tensor=False)

In [19]:
# Per-process cache so the model is loaded once instead of on every call.
_SENTENCE_BERT_MODELS = {}


def _get_sentence_bert_model(model_name="stsb-xlm-r-multilingual"):
    """Return the SentenceTransformer named ``model_name``, loading it on first use only."""
    if model_name not in _SENTENCE_BERT_MODELS:
        _SENTENCE_BERT_MODELS[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_BERT_MODELS[model_name]


def sentence_bert_embedding(text):
    """Encode ``text`` with the "stsb-xlm-r-multilingual" model and return a tensor.

    The model used to be constructed on every call, which repeats the model
    load for each text when this function is mapped over a dataset. It is now
    created lazily and reused for all later calls in the same process.

    Args:
        text: Value passed straight through to ``SentenceTransformer.encode``.

    Returns:
        The result of ``encode(text, convert_to_tensor=True)``.
    """
    return _get_sentence_bert_model().encode(text, convert_to_tensor=True)

In [20]:
# Fan the per-text encoder out over 16 worker processes, one text per task,
# then gather the embeddings into a list in input order.
with ProcessPoolExecutor(max_workers=16) as e:
    ret = e.map(sentence_bert_embedding, df["clean_text"].values)
df_embeddings_multi = list(ret)