# Measuring Document Similarity


In [1]:
from ekorpkit import eKonf

# When running on Google Colab, mount Google Drive before touching the workspace.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# NOTE(review): "exmaples" looks like a misspelling of "examples", but the
# project directory printed below uses the same spelling, so it must stay
# as-is for the paths in later cells to resolve.
workspace_options = dict(
    workspace="/workspace",
    project="ekorpkit-book/exmaples",
    task="esg",
    log_level="WARNING",
    verbose=True,
)
ws = eKonf.init_workspace(**workspace_options)

# Report which ekorpkit version and project directory this run is using.
for label, value in (("version:", ws.version), ("project_dir:", ws.project_dir)):
    print(label, value)


version: 0.1.40.post0.dev108
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 800 ms (started: 2023-03-27 10:44:47 +00:00)


## Load data to measure similarity


In [7]:
import pandas as pd

# Build the source dataset for the similarity analysis: load the validated
# news chunks, give every chunk a unique `doc_id`, derive its timestamp from
# the file name, and save the result as parquet.
news_data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice"
filename = "esg_news_valid_20221229.parquet"

valid_data = eKonf.load_data(filename, news_data_dir)

# doc_id = "<filename>_<codes>_<chunk_id>". chunk_id is cast to str first so
# that all three parts can be concatenated as strings.
id_cols = ["filename", "codes", "chunk_id"]
valid_data["chunk_id"] = valid_data["chunk_id"].astype(str)
# Vectorised concatenation instead of a row-wise `apply(..., axis=1)`, which
# calls a Python function once per row and dominates this cell's runtime.
valid_data["doc_id"] = valid_data[id_cols[0]].str.cat(
    [valid_data[col] for col in id_cols[1:]], sep="_"
)

# The second dot-separated field of the file name starts with a 14-digit
# timestamp (YYYYMMDDHHMMSS); any trailing digits are ignored.
timestamp_digits = valid_data["filename"].str.split(".").str[1].str[:14]
valid_data["date"] = pd.to_datetime(timestamp_digits, format="%Y%m%d%H%M%S")

source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
eKonf.save_data(valid_data, source_data_file)

INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/source_data.parquet


time: 1min 17s (started: 2023-03-01 02:30:16 +00:00)


In [2]:
# load source data
# Reads back the parquet file written by the cell that builds `source_data`.
source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
data = eKonf.load_data(source_data_file)
# `cols` is only used by the commented-out sampling line below; enable that
# line to work on a 1000-row sample restricted to these columns.
cols = ["date", "doc_id", "text"]
# data = data[cols].sample(1000)
data.head()

Unnamed: 0,filename,chunk_id,text,codes,doc_id,date
0,02100101.20200101040200001.txt,0,◆ 2020 경제기상도 / 업종별 전망 (반도체) ◆ 지난해 미·중 무역분쟁과 공...,660,02100101.20200101040200001.txt_000660_0,2020-01-01 04:02:00
2,02100101.20200101040200002.txt,0,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",66570,02100101.20200101040200002.txt_066570_0,2020-01-01 04:02:00
3,02100101.20200101040200002.txt,0,"◆ 2020 경제기상도 / 업종별 전망 (가전) ◆ TV, 냉장고, 세탁기 등 전...",5930,02100101.20200101040200002.txt_005930_0,2020-01-01 04:02:00
4,02100101.20200101040201001.txt,0,◆ 2020 경제기상도 / 업종별 전망 (디스플레이) ◆ 액정표시장치(LCD) 시...,34220,02100101.20200101040201001.txt_034220_0,2020-01-01 04:02:01
5,02100101.20200101040201001.txt,1,디스플레이 업계 등에서는 삼성과 LG가 글로벌 디스플레이 시장에서 중국 업체의 L...,3550,02100101.20200101040201001.txt_003550_1,2020-01-01 04:02:01


time: 6.83 s (started: 2023-03-27 10:39:35 +00:00)


In [3]:
# Compose the `formal_ko` normalizer config and the `mecab_econ` tokenizer
# config from the packaged `ekorpkit.conf` module, attach the former to the
# latter, and instantiate the tokenizer.
cfg_norm = eKonf.compose(
    "preprocessor/normalizer=formal_ko",
    config_module="ekorpkit.conf",
)
cfg_mcb = eKonf.compose(
    "preprocessor/tokenizer=mecab_econ",
    config_module="ekorpkit.conf",
)
cfg_mcb.normalize = cfg_norm
# NOTE(review): `mecab` is not referenced again in the cells visible here;
# the pipeline cells compose their own tokenizer config.
mecab = eKonf.instantiate(cfg_mcb, verbose=True)


INFO:hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...


time: 864 ms (started: 2023-03-01 08:34:22 +00:00)


In [5]:
# Tokenize
# Run the "pipeline/tokenize" pipe over `data`.
# NOTE: `data` is rebound to the pipe's output, so this cell is not
# idempotent; re-running it feeds the already-processed frame through the
# pipe again. Reload `data` from the source parquet before re-running.

cfg = eKonf.compose("pipeline/tokenize")
data = eKonf.pipe(data, cfg)
data.head()

INFO:hyfi.pipe:Applying pipe: functools.partial(<function tokenize at 0x7f191c6b3ee0>)
INFO:hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:hyfi.pipe:Using batcher with minibatch size: 1000
INFO:hyfi.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 558923 len(args): 5


Tokenizing column: text:   0%|          | 0/559 [00:00<?, ?it/s]

Unnamed: 0,filename,chunk_id,text,codes,doc_id,date
0,02100101.20200101040200001.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,660,02100101.20200101040200001.txt_000660_0,2020-01-01 04:02:00
2,02100101.20200101040200002.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,66570,02100101.20200101040200002.txt_066570_0,2020-01-01 04:02:00
3,02100101.20200101040200002.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,5930,02100101.20200101040200002.txt_005930_0,2020-01-01 04:02:00
4,02100101.20200101040201001.txt,0,◆/SY /SP 2020/SN /SP 경제/NNG 기상도/NNG /SP //SC /...,34220,02100101.20200101040201001.txt_034220_0,2020-01-01 04:02:01
5,02100101.20200101040201001.txt,1,디스플레이/NNG /SP 업계/NNG /SP 등/NNB 에서/JKB 는/JX /SP...,3550,02100101.20200101040201001.txt_003550_1,2020-01-01 04:02:01


time: 1min 11s (started: 2023-03-01 08:34:55 +00:00)


In [6]:
# Extract tokens
# Compose the `mecab_econ` tokenizer config, plug it into the
# "pipeline/extract_tokens" pipe, run the pipe over `data`, and save the
# result as parquet so the similarity cells can load it directly.
# stopwords_file = ws.project_dir / "esg/data/stopwords/stopwords.txt"
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab_econ")
# tkn_cfg.extract.strip_pos = False

cfg = eKonf.compose("pipeline/extract_tokens")
cfg.preprocessor.tokenizer = tkn_cfg
# presumably keeps tokens of every part of speech rather than nouns only — TODO confirm
cfg.nouns_only = False
# cfg.stopwords_path = str(stopwords_file)
# eKonf.print(cfg)
# NOTE: rebinds `data` to the pipe's output, so this cell expects the frame
# produced by the tokenize cell and should not be run twice in a row.
data = eKonf.pipe(data, cfg)

tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
eKonf.save_data(data, tokenized_data_file)


INFO:hyfi.pipe:Applying pipe: functools.partial(<function extract_tokens at 0x7f191c6b3f70>)
INFO:hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:hyfi.pipe:Using batcher with minibatch size: 1000
INFO:hyfi.utils.batch.batcher: backend: joblib  minibatch_size: 1000  procs: 50  input_split: False  merge_output: True  len(data): 558923 len(args): 5


Extracting column: text:   0%|          | 0/559 [00:00<?, ?it/s]

INFO:hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/tokenized_data.parquet


time: 1min 24s (started: 2023-03-01 08:36:06 +00:00)


## Predict similarity

For each date, the first article of that day is compared with every article published in the window from seven days earlier through that same day. Similarity is the cosine similarity between the TF-IDF vectors of the two documents.


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preview the tokenized corpus for a single ticker ("020560"), with the
# timestamp reduced to its calendar date.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
data = (
    eKonf.load_data(tokenized_data_file)
    .loc[lambda frame: frame["codes"] == "020560"]
    .reset_index(drop=True)
)
# Keep only the date component; the time of day is not needed here.
data["date"] = data["date"].dt.date

data

Unnamed: 0,filename,chunk_id,text,codes,doc_id,date
0,02100101.20200101040206002.txt,0,2020 경제 기상도 업종 별 전망 항공 국내 항공업 은 올해 도 침체기 를 이 ...,020560,02100101.20200101040206002.txt_020560_0,2020-01-01
1,02100101.20200102105208001.txt,0,지난해 말 HDC 현대산업개발 미래 에 셋 대우 컨소시엄 과 인수 합병 M A 계약...,020560,02100101.20200102105208001.txt_020560_0,2020-01-02
2,02100101.20200102105208001.txt,2,그 는 아시아나항공 이 국내 최고 항공사 로 발돋움 할 수 있 는 기반 이 마련 됐...,020560,02100101.20200102105208001.txt_020560_2,2020-01-02
3,02100101.20200102163114001.txt,2,한창수 아시아나항공 사장 은 2 일 열린 시무식 에서 지난해 체결 된 회사 의 인수...,020560,02100101.20200102163114001.txt_020560_2,2020-01-02
4,02100101.20200102163114001.txt,4,무엇 보다 2 조 2000 억 원 에 달하 는 자본 이 아시아 나 항공 에 투입 돼...,020560,02100101.20200102163114001.txt_020560_4,2020-01-02
...,...,...,...,...,...,...
3712,02100851.20211221213406001.txt,3,또 2016 년 4 월 아시아 나 항공 이 보유 중 인 금호터미널 지분 전량 을 금...,020560,02100851.20211221213406001.txt_020560_3,2021-12-21
3713,02100851.20211226171045001.txt,0,운수 권 이 관건 조건부 승인 관측 세종 에 있 는 공정 거래 위원회 건물 사진 연...,020560,02100851.20211226171045001.txt_020560_0,2021-12-26
3714,02100851.20211229085237001.txt,2,양 연구원 은 대한항공 과 아시아나항공 의 기업 결합 에 따른 일부 노선 운수 권 ...,020560,02100851.20211229085237001.txt_020560_2,2021-12-29
3715,02100851.20211229164610001.txt,0,미국 EU 등 7 개국 심사 결과 관건 해외 심사 트렌드 엄격 해져 12 월 26 ...,020560,02100851.20211229164610001.txt_020560_0,2021-12-29


time: 6.98 s (started: 2023-03-27 10:51:34 +00:00)


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenized corpus written by the token-extraction cell.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
data = eKonf.load_data(tokenized_data_file)
data = data.reset_index(drop=True)
# Drop the time of day so that articles can be grouped by calendar date.
data["date"] = data["date"].dt.date

# TF-IDF over tokens obtained by splitting each text on single spaces
# (the texts are already tokenized and space-joined).
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "))

# One row per (reference document, window document) pair.
results = []

unique_dates = data["date"].unique()
for date in unique_dates:
    # The reference document is the first article of the day, in the row
    # order of `data`.
    same_day = data[data["date"] == date]
    current_doc = same_day["doc_id"].iloc[0]
    current_text = same_day["text"].iloc[0]

    # Comparison window: the seven preceding days plus the current date,
    # both ends inclusive.
    previous_period_start = date - pd.Timedelta(7, "d")
    previous_period_end = date
    in_window = (data["date"] >= previous_period_start) & (
        data["date"] <= previous_period_end
    )
    window = data.loc[in_window, ["doc_id", "text"]]

    # Fit TF-IDF on the window texts with the reference text appended as the
    # last row. `pd.concat` replaces `Series.append`, which was removed in
    # pandas 2.0.
    corpus = pd.concat(
        [window["text"], pd.Series([current_text])], ignore_index=True
    )
    matrix = vectorizer.fit_transform(corpus)

    # Only the reference row is needed, so compare the last row against the
    # window rows instead of materialising the full N x N similarity matrix.
    similarity = cosine_similarity(matrix[-1], matrix[:-1])[0]

    for doc, sim in zip(window["doc_id"], similarity):
        # Skip the reference document's own entry in the window.
        if current_doc == doc:
            continue
        results.append(
            [date, previous_period_start, previous_period_end, current_doc, doc, sim]
        )

# Convert the results list into a data frame
results = pd.DataFrame(
    results,
    columns=["date", "start_date", "end_date", "doc_id_1", "doc_id_2", "similarity"],
)
print(f"Number of results: {len(results)}")
# Save the results. For a single-ticker run, use
# "esg/data/similarity/similarity_results-020560.parquet" instead.
similarity_results_file = (
    ws.project_dir / "esg/data/similarity/similarity_results.parquet"
)
eKonf.save_data(results, similarity_results_file)

INFO:hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/similarity_results.parquet


Number of results: 4454647
time: 1h 5min 33s (started: 2023-03-27 10:52:39 +00:00)


In [8]:
# Order the document pairs from most to least similar and preview the top rows.
results = results.sort_values("similarity", ascending=False)
results.head()

Unnamed: 0,date,start_date,end_date,doc_id_1,doc_id_2,similarity
21207,2021-03-27,2021-03-20,2021-03-27,02100201.20210327075129001.txt_020560_4,02100201.20210326180200001.txt_020560_5,1.0
19987,2021-01-05,2020-12-29,2021-01-05,02100101.20210105174916001.txt_020560_4,02100101.20210105175726001.txt_020560_13,1.0
21255,2021-03-28,2021-03-21,2021-03-28,02100201.20210328175011001.txt_020560_1,02100801.20210328185945001.txt_020560_1,1.0
22122,2021-07-06,2021-06-29,2021-07-06,02100101.20210706143057001.txt_020560_0,02100701.20210706135911001.txt_020560_1,1.0
18414,2022-01-02,2021-12-26,2022-01-02,02100701.20220102150018001.txt_020560_0,02100801.20220102145824001.txt_020560_0,1.0


time: 10.6 ms (started: 2023-03-27 10:52:02 +00:00)
