# Measuring Document Similarity


In [1]:
from ekorpkit import eKonf

if eKonf.is_colab():
    eKonf.mount_google_drive()
ws = eKonf.set_workspace(
    workspace="/workspace", 
    project="ekorpkit-book/exmaples", 
    task="esg", 
    log_level="INFO",
    verbose=True
)
print("version:", ws.version)
print("project_dir:", ws.project_dir)

INFO:ekorpkit.hyfi.utils.logging:Set environment variable EKORPKIT_DATA_ROOT=/workspace/data
INFO:ekorpkit.hyfi.utils.logging:Set environment variable CACHED_PATH_CACHE_ROOT=/workspace/.cache/cached_path
INFO:ekorpkit.hyfi.utils.logging:Set environment variable WANDB_DIR=/workspace/projects/ekorpkit-book/exmaples/logs
INFO:ekorpkit.hyfi.utils.logging:Set environment variable WANDB_PROJECT=ekorpkit-book-exmaples
INFO:ekorpkit.hyfi.utils.logging:Set environment variable WANDB_NOTEBOOK_NAME=/workspace/projects/ekorpkit-book/exmaples/logs/esg-nb
INFO:ekorpkit.hyfi.utils.logging:Set environment variable WANDB_SILENT=False
INFO:ekorpkit.hyfi.utils.logging:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.hyfi.utils.logging:initialized batcher with <ekorpkit.hyfi.utils.batch.batcher.Batcher object at 0x7f405c2ab520>


version: 0.1.40.post0.dev90
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 1.41 s (started: 2023-02-10 03:22:55 +00:00)


In [2]:
print(eKonf.__ekorpkit_path__)
print(eKonf.__hyfi_path__)

/workspace/projects/ekorpkit/ekorpkit
/workspace/projects/ekorpkit/ekorpkit/hyfi
time: 377 µs (started: 2023-02-10 03:23:01 +00:00)


## Load data to predict


In [3]:
news_data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice"
filename = "esg_news_valid_20221229.parquet"

valid_data = eKonf.load_data(filename, news_data_dir)
cols = ["text", "filename", "chunk_id", "codes"]
valid_data[cols].head()

Unnamed: 0,text,filename,chunk_id,codes
0,◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...,02100101.20200101040200001.txt,0,660
2,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",02100101.20200101040200002.txt,0,66570
3,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",02100101.20200101040200002.txt,0,5930
4,◆ 2020 경제기상도 / 업종별 전망 (디스플레이) ◆ 액정표시장치(LCD) 시...,02100101.20200101040201001.txt,0,34220
5,디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L...,02100101.20200101040201001.txt,1,3550


time: 6.42 s (started: 2023-02-10 02:55:08 +00:00)


In [4]:
data = valid_data.sample(1000)

time: 11.1 ms (started: 2023-02-10 02:55:14 +00:00)


In [5]:
cfg_norm = eKonf.compose("preprocessor/normalizer=formal_ko")
cfg_mcb = eKonf.compose("preprocessor/tokenizer=mecab_econ")
cfg_mcb.normalize = cfg_norm
mecab = eKonf.instantiate(cfg_mcb, verbose=True)


INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.normalizer.Normalizer...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.utils.logging:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: /workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic


time: 1.07 s (started: 2023-02-10 02:55:14 +00:00)


In [6]:
# Tokenize

cfg = eKonf.compose("pipeline/tokenize")
data = eKonf.pipe(data, cfg)
data.head()

INFO:ekorpkit.hyfi.utils.logging:Applying pipe: functools.partial(<function tokenize at 0x7f20e743e940>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.utils.logging:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.utils.logging:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: None
INFO:ekorpkit.hyfi.utils.logging:Using batcher with minibatch size: 21
INFO

Tokenizing column: text:   0%|          | 0/48 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to segment: 0:00:03.707165


Unnamed: 0,filename,chunk_id,text,codes
505565,02100851.20210520123331001.txt,0,에스/NNP 씨/NNB 디/NNG 가/JKS /SP 코스닥/NNG /SP 시장/NN...,42110
428505,02100311.20210218191853001.txt,0,"지난해/NNG /SP 9/SN 월/NNBC 에/JKB 도/JX /SP 2/SN ,/...",12330
511819,02100201.20210401153611001.txt,2,오명훈/NNP /SP CFO/SL 는/JX /SP 2001/SN 년/NNBC /SP...,17670
697962,02100201.20211212103613001.txt,0,[/SSO 머니/NNP 투데이/NNP /SP 변/XSN 휘/MAG /SP 기자/NN...,17670
191882,02100851.20200903183248001.txt,0,[/SSO 사진/NNG =/SY IBK/SL 기업/NNG 은행/NNG ]/SSC /...,24110


time: 4.03 s (started: 2023-02-10 02:55:15 +00:00)


In [7]:
# Extract tokens
# stopwords_file = ws.project_dir / "esg/data/stopwords/stopwords.txt"
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab_econ")
# tkn_cfg.extract.strip_pos = False

cfg = eKonf.compose("pipeline/extract_tokens")
cfg.preprocessor.tokenizer = tkn_cfg
cfg.nouns_only = False
# cfg.stopwords_path = str(stopwords_file)
# eKonf.print(cfg)
data = eKonf.pipe(data, cfg)

tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
eKonf.save_data(data, tokenized_data_file)


INFO:ekorpkit.hyfi.utils.logging:Applying pipe: functools.partial(<function extract_tokens at 0x7f20e743e9d0>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.utils.logging:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.utils.logging:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary

Extracting column: text:   0%|          | 0/48 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to extract tokens: 0:00:00.848701
INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/tokenized_data.parquet


time: 1.55 s (started: 2023-02-10 02:55:19 +00:00)


## Predict similarity

Similarity will be measured among the news articles on the same day. The similarity is measured by the cosine similarity of the document vectors.


In [21]:
from transformers import ElectraModel, ElectraTokenizer

# Initialize the Electra model and tokenizer
model = ElectraModel.from_pretrained('entelecheia/ekonelectra-base-discriminator')
tokenizer = ElectraTokenizer.from_pretrained('entelecheia/ekonelectra-base-discriminator')

Some weights of the model checkpoint at entelecheia/ekonelectra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


time: 2.78 s (started: 2023-01-19 08:56:23 +00:00)


In [22]:
# Tokenize and encode the documents
encoded_docs = [tokenizer.encode(doc, return_tensors='pt', truncation=True) for doc in documents]


time: 31.7 ms (started: 2023-01-19 08:56:26 +00:00)


In [23]:
# Generate the embeddings for the documents
embeddings = [model(doc)[0][:, 0, :].detach().numpy() for doc in encoded_docs]

time: 2.47 s (started: 2023-01-19 08:56:27 +00:00)


In [30]:
import numpy as np

# Get the mean of the embeddings
mean_embeddings = [np.mean(embedding, axis=0) for embedding in embeddings]
# Gey CLS token embedding
cls_embeddings = [embedding[0] for embedding in embeddings]

time: 1.38 ms (started: 2023-01-19 08:59:45 +00:00)


In [31]:
# Compute the similarity matrix using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(cls_embeddings)

print(similarity_matrix)

[[1.0000001  0.968363   0.92001873 0.9597888  0.9621922  0.9525209
  0.96460295 0.9683971  0.9587421  0.9609047 ]
 [0.968363   1.         0.9243116  0.99047786 0.9818795  0.974076
  0.9874664  0.99173224 0.9730664  0.9540314 ]
 [0.92001873 0.9243116  1.0000002  0.90941644 0.9215709  0.91559356
  0.92173624 0.9292955  0.94094986 0.9056404 ]
 [0.9597888  0.99047786 0.90941644 1.0000002  0.9848912  0.97961116
  0.9907066  0.9894204  0.9725035  0.9612313 ]
 [0.9621922  0.9818795  0.9215709  0.9848912  1.         0.9741106
  0.9906047  0.9855628  0.9789446  0.9729687 ]
 [0.9525209  0.974076   0.91559356 0.97961116 0.9741106  1.
  0.9774029  0.9789716  0.97945994 0.96554124]
 [0.96460295 0.9874664  0.92173624 0.9907066  0.9906047  0.9774029
  0.9999999  0.99083006 0.97806597 0.9706383 ]
 [0.9683971  0.99173224 0.9292955  0.9894204  0.9855628  0.9789716
  0.99083006 0.9999997  0.9804057  0.9656478 ]
 [0.9587421  0.9730664  0.94094986 0.9725035  0.9789446  0.97945994
  0.97806597 0.9804057  1.