# Measuring Document Similarity


In [2]:
from ekorpkit import eKonf

# When the notebook runs on Colab, mount Google Drive before configuring the workspace.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# Workspace settings gathered in one place so they are easy to tune.
# NOTE(review): "exmaples" looks like a misspelling of "examples", but it matches the
# directory name shown in the logged paths, so it is intentionally left unchanged.
workspace_options = dict(
    workspace="/workspace",
    project="ekorpkit-book/exmaples",
    task="esg",
    log_level="INFO",
    verbose=True,
)
ws = eKonf.set_workspace(**workspace_options)

# Report the workspace version and the resolved project directory.
for label, value in (("version:", ws.version), ("project_dir:", ws.project_dir)):
    print(label, value)


INFO:ekorpkit.hyfi.env:Set environment variable EKORPKIT_DATA_ROOT=/workspace/data
INFO:ekorpkit.hyfi.env:Set environment variable CACHED_PATH_CACHE_ROOT=/workspace/.cache/cached_path
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_DIR=/workspace/projects/ekorpkit-book/exmaples/logs
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_PROJECT=ekorpkit-book-exmaples
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_NOTEBOOK_NAME=/workspace/projects/ekorpkit-book/exmaples/logs/esg-nb
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_SILENT=False
INFO:ekorpkit.hyfi.utils.env:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.hyfi.hydra:initialized batcher with <ekorpkit.hyfi.utils.batch.batcher.Batcher object at 0x7f95fb3c5df0>


version: 0.1.40.post0.dev100
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 1.26 s (started: 2023-02-13 10:31:35 +00:00)


## Load data to measure similarity


In [12]:
import pandas as pd

# Location and name of the validated ESG news slices.
news_data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice"
filename = "esg_news_valid_20221229.parquet"
valid_data = eKonf.load_data(filename, news_data_dir)

# Build a document id of the form "<filename>_<codes>_<chunk_id>".
# chunk_id is cast to str first because str.join only accepts string parts.
id_cols = ["filename", "codes", "chunk_id"]
valid_data["chunk_id"] = valid_data["chunk_id"].astype(str)
valid_data["doc_id"] = valid_data[id_cols].apply(lambda row: "_".join(row), axis=1)

# The second "."-separated field of the filename starts with a 14-digit
# timestamp (YYYYmmddHHMMSS); parse it into a datetime column.
timestamp_part = valid_data["filename"].str.split(".").str[1].str[:14]
valid_data["date"] = pd.to_datetime(timestamp_part, format="%Y%m%d%H%M%S")

# Persist the prepared frame so later steps can start from the saved file.
source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
eKonf.save_data(valid_data, source_data_file)

INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/source_data.parquet


time: 1min 17s (started: 2023-02-13 09:07:39 +00:00)


In [10]:
# Load the source data from its parquet file, so the following steps start from the file on disk.
source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
data = eKonf.load_data(source_data_file)
# Columns for the optional subsample below; not applied while that line stays commented out.
cols = ["date", "doc_id", "text"]
# Optional: uncomment to work on a 1,000-row random sample restricted to `cols`.
# data = data[cols].sample(1000)
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,filename,chunk_id,text,codes,doc_id,date
0,02100101.20200101040200001.txt,0,◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...,660,02100101.20200101040200001.txt_000660_0,2020-01-01 04:02:00
2,02100101.20200101040200002.txt,0,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",66570,02100101.20200101040200002.txt_066570_0,2020-01-01 04:02:00
3,02100101.20200101040200002.txt,0,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",5930,02100101.20200101040200002.txt_005930_0,2020-01-01 04:02:00
4,02100101.20200101040201001.txt,0,◆ 2020 경제기상도 / 업종별 전망 (디스플레이) ◆ 액정표시장치(LCD) 시...,34220,02100101.20200101040201001.txt_034220_0,2020-01-01 04:02:01
5,02100101.20200101040201001.txt,1,디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L...,3550,02100101.20200101040201001.txt_003550_1,2020-01-01 04:02:01


time: 6.33 s (started: 2023-02-13 10:36:52 +00:00)


In [11]:
# Compose the "formal_ko" normalizer config and the "mecab_econ" tokenizer config,
# attach the normalizer to the tokenizer config, then instantiate the tokenizer.
# NOTE(review): `mecab` is not referenced by the other cells visible in this notebook;
# the pipeline cells below log that they instantiate their own tokenizer. Confirm
# whether this cell is still needed.
cfg_norm = eKonf.compose("preprocessor/normalizer=formal_ko")
cfg_mcb = eKonf.compose("preprocessor/tokenizer=mecab_econ")
cfg_mcb.normalize = cfg_norm
mecab = eKonf.instantiate(cfg_mcb, verbose=True)


INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.normalizer.Normalizer...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: /workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic


time: 953 ms (started: 2023-02-13 10:37:01 +00:00)


In [12]:
# Tokenize
# Runs the "pipeline/tokenize" pipe over `data`; in the preview below the `text`
# column holds token/POS strings instead of the raw text.
# NOTE(review): the log for this cell reports `userdic_path: None`, whereas the
# "mecab_econ" tokenizer used in the neighbouring cells loads the ekon_v1.dic user
# dictionary. Confirm whether this step should also use the mecab_econ tokenizer.
# NOTE: `data` is overwritten in place, so re-running this cell without reloading
# the source data would feed already-tokenized text back into the pipe.

cfg = eKonf.compose("pipeline/tokenize")
data = eKonf.pipe(data, cfg)
data.head()

INFO:ekorpkit.hyfi.pipe:Applying pipe: functools.partial(<function tokenize at 0x7f91dcf11a60>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: None
INFO:ekorpkit.hyfi.pipe:Using batcher with minibatch size: 1000
INFO:ekorpkit.hyfi.utils.batch.batch

Tokenizing column: text:   0%|          | 0/559 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to segment: 0:01:11.623086


Unnamed: 0,filename,chunk_id,text,codes,doc_id,date
0,02100101.20200101040200001.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,660,02100101.20200101040200001.txt_000660_0,2020-01-01 04:02:00
2,02100101.20200101040200002.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,66570,02100101.20200101040200002.txt_066570_0,2020-01-01 04:02:00
3,02100101.20200101040200002.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,5930,02100101.20200101040200002.txt_005930_0,2020-01-01 04:02:00
4,02100101.20200101040201001.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,34220,02100101.20200101040201001.txt_034220_0,2020-01-01 04:02:01
5,02100101.20200101040201001.txt,1,디스플레이/NNG /SP 업계/NNG /SP 등/NNB 에서/JKB 는/JX /SP...,3550,02100101.20200101040201001.txt_003550_1,2020-01-01 04:02:01


time: 1min 11s (started: 2023-02-13 10:37:02 +00:00)


In [13]:
# Extract tokens
# Runs the "pipeline/extract_tokens" pipe over the tokenized `data` and saves the result.
# (Optional custom stopword list, currently disabled.)
# stopwords_file = ws.project_dir / "esg/data/stopwords/stopwords.txt"
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab_econ")
# (Optional tokenizer override, currently disabled.)
# tkn_cfg.extract.strip_pos = False

cfg = eKonf.compose("pipeline/extract_tokens")
# Replace the pipeline's tokenizer config with the mecab_econ one composed above.
cfg.preprocessor.tokenizer = tkn_cfg
# presumably keeps every extracted token rather than nouns only — verify against the ekorpkit config docs
cfg.nouns_only = False
# cfg.stopwords_path = str(stopwords_file)
# eKonf.print(cfg)
# `data` is overwritten with the pipe output.
data = eKonf.pipe(data, cfg)

# Persist the extracted tokens; the similarity step loads this file.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
eKonf.save_data(data, tokenized_data_file)


INFO:ekorpkit.hyfi.pipe:Applying pipe: functools.partial(<function extract_tokens at 0x7f91dcf11af0>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: /workspace/projects/eko

Extracting column: text:   0%|          | 0/559 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to extract tokens: 0:00:13.634850
INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/tokenized_data.parquet


time: 1min 23s (started: 2023-02-13 10:38:15 +00:00)


## Predict similarity

For each date, the first news article of that day is compared with every article published during the preceding seven days. Similarity is measured as the cosine similarity between the TF-IDF vectors of the two documents.


In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenized articles saved by the token-extraction step.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
data = eKonf.load_data(tokenized_data_file)
data = data.reset_index(drop=True)
# Keep only the calendar date so articles are grouped by day, ignoring the time of day.
data['date'] = data['date'].dt.date

# TF-IDF over space-separated tokens; the vectorizer is re-fitted for every window below,
# so term weights are relative to that window's documents only.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "))

# One row per (current document, previous document) pair.
results = []

# Iterate over the unique dates, ignoring time information
unique_dates = data['date'].unique()
for i, date in enumerate(unique_dates):
    # The "current" document is the first article of this date.
    current_rows = data[data['date'] == date]
    current_doc = current_rows['doc_id'].iloc[0]
    current_text = current_rows['text'].iloc[0]

    # Look-back window: the seven days before `date` (start inclusive, `date` itself excluded).
    previous_period_start = date - pd.Timedelta(7, 'd')
    previous_period_end = date
    in_previous_period = (data['date'] >= previous_period_start) & (data['date'] < previous_period_end)
    previous_rows = data[in_previous_period]
    if previous_rows.empty:
        # Nothing to compare against (e.g. the earliest date), so no rows are produced.
        continue
    previous_period = previous_rows['doc_id']
    previous_text = previous_rows['text']

    # Fit on the window plus the current document; the current document is the last row.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    corpus = pd.concat([previous_text, pd.Series([current_text])], ignore_index=True)
    matrix = vectorizer.fit_transform(corpus)

    # Compare only the current document against the window instead of building the
    # full pairwise matrix, of which a single row would be used.
    n_previous = len(previous_rows)
    similarity = cosine_similarity(matrix[n_previous], matrix[:n_previous]).ravel()
    for doc, sim in zip(previous_period, similarity):
        results.append([date, previous_period_start, previous_period_end, current_doc, doc, sim])

# Convert the results list into a data frame
results = pd.DataFrame(results, columns=['date', 'start_date', 'end_date', 'doc_id_1', 'doc_id_2', 'similarity'])

# save the results
similarity_results_file = ws.project_dir / "esg/data/similarity/similarity_results.parquet"
eKonf.save_data(results, similarity_results_file)


INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/similarity_results.parquet


time: 51min 39s (started: 2023-02-13 11:21:27 +00:00)


In [17]:
# Sort the document pairs by similarity, most similar first, and preview the top rows.
# (A stray leading `i` used to precede this comment; it evaluated the loop variable
# leaked from the previous cell and raised NameError whenever `i` was not defined.)
results = results.sort_values(by=['similarity'], ascending=False)
results.head()

Unnamed: 0,date,start_date,end_date,doc_id_1,doc_id_2,similarity
1558596,2022-01-22,2022-01-15,2022-01-22,02100101.20220122102031004.txt_308100_0,02100101.20220121161033003.txt_308100_0,1.0
1469862,2022-01-02,2021-12-26,2022-01-02,02100101.20220102093606001.txt_373220_2,02100101.20211231172619002.txt_373220_2,1.0
2263972,2021-02-03,2021-01-27,2021-02-03,02100101.20210203020154001.txt_122870_0,02100101.20210202164529001.txt_122870_0,1.0
2263973,2021-02-03,2021-01-27,2021-02-03,02100101.20210203020154001.txt_122870_0,02100101.20210202164529001.txt_037270_0,1.0
3200858,2021-07-21,2021-07-14,2021-07-21,02100101.20210721000359002.txt_095700_0,02100101.20210720204138001.txt_095700_0,1.0


time: 1.99 s (started: 2023-02-13 12:13:08 +00:00)
