# Measuring Document Similarity


In [1]:
from ekorpkit import eKonf

# Mount Google Drive only when the notebook is running on Colab.
if eKonf.is_colab():
    eKonf.mount_google_drive()

# NOTE(review): "exmaples" looks like a typo of "examples", but it matches the
# project directory reported in the logs, so the path is left unchanged.
workspace_options = dict(
    workspace="/workspace",
    project="ekorpkit-book/exmaples",
    task="esg",
    log_level="INFO",
    verbose=True,
)
ws = eKonf.set_workspace(**workspace_options)

# Report the library version and the resolved project directory.
for label, value in (("version:", ws.version), ("project_dir:", ws.project_dir)):
    print(label, value)


INFO:ekorpkit.hyfi.env:Set environment variable EKORPKIT_DATA_ROOT=/workspace/data
INFO:ekorpkit.hyfi.env:Set environment variable CACHED_PATH_CACHE_ROOT=/workspace/.cache/cached_path
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_DIR=/workspace/projects/ekorpkit-book/exmaples/logs
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_PROJECT=ekorpkit-book-exmaples
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_NOTEBOOK_NAME=/workspace/projects/ekorpkit-book/exmaples/logs/esg-nb
INFO:ekorpkit.hyfi.env:Set environment variable WANDB_SILENT=False
INFO:ekorpkit.hyfi.utils.env:Loaded .env from /workspace/projects/ekorpkit-book/config/.env
INFO:ekorpkit.hyfi.hydra:initialized batcher with <ekorpkit.hyfi.utils.batch.batcher.Batcher object at 0x7fb0c31b2c10>


version: 0.1.40.post0.dev98
project_dir: /workspace/projects/ekorpkit-book/exmaples
time: 1.21 s (started: 2023-02-10 09:29:29 +00:00)


In [2]:
# Compose the `formal_ko` normalizer config; as the last expression of the cell,
# the composed config is shown as the cell output for inspection.
eKonf.compose("preprocessor/normalizer=formal_ko", verbose=True)

INFO:ekorpkit.hyfi.env:config_module: ekorpkit.conf
INFO:ekorpkit.hyfi.env:compose config with overrides: ['+preprocessor/normalizer=formal_ko']


{'ftfy': {'unescape_html': True, 'remove_terminal_escapes': True, 'fix_encoding': True, 'restore_byte_a0': True, 'replace_lossy_sequences': True, 'decode_inconsistent_utf8': True, 'fix_c1_controls': True, 'fix_latin_ligatures': True, 'fix_character_width': True, 'uncurl_quotes': True, 'fix_line_breaks': True, 'fix_surrogates': True, 'remove_control_chars': True, 'normalization': 'NFKC', 'max_decode_length': 1000000}, 'spaces': {'strip': True, 'fix_whitespaces': True, 'collapse_whitespaces': True, 'replace_tabs': True, 'num_spaces_for_tab': 4}, 'special_characters': {'fix_hyphens': True, 'fix_ellipsis': True, 'fix_slashes': True, 'fix_tildes': True, 'fix_emoticons': False, 'single_quotes_only': False, 'regular_parentheses_only': False}, '_target_': 'ekorpkit.preprocessors.normalizer.Normalizer', 'hanja2hangle': True, 'num_repeats': 2}

time: 167 ms (started: 2023-02-10 09:29:36 +00:00)


## Load data to measure similarity


In [3]:
# Load the validation split of the ESG news slices.
news_data_dir = ws.project_dir / "esg/data/econ_news_kr/news_slice"
filename = "esg_news_valid_20221229.parquet"

valid_data = eKonf.load_data(filename, news_data_dir)

# Build a document id of the form "<filename>_<codes>_<chunk_id>".
id_cols = ["filename", "codes", "chunk_id"]
# chunk_id must be a string before it can be concatenated with the other id parts.
valid_data["chunk_id"] = valid_data["chunk_id"].astype(str)
# Vectorized concatenation instead of a row-wise `apply(..., axis=1)`, which calls
# a Python lambda once per row and is slow on a large frame.
valid_data["doc_id"] = valid_data[id_cols[0]].str.cat(valid_data[id_cols[1:]], sep="_")

# Persist the frame (now including doc_id) as the input of the similarity steps.
source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
eKonf.save_data(valid_data, source_data_file)

INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/source_data.parquet


time: 1min 14s (started: 2023-02-10 09:29:37 +00:00)


In [4]:
# Load the source data and draw a reproducible sample of documents.
SAMPLE_SIZE = 1000  # number of documents kept for the similarity demo
SAMPLE_SEED = 42    # fixed seed so that Restart & Run All yields the same sample

source_data_file = ws.project_dir / "esg/data/similarity/source_data.parquet"
data = eKonf.load_data(source_data_file)
cols = ["doc_id", "text"]
# Cap the sample size at the number of rows: sampling without replacement
# raises when `n` is larger than the frame.
data = data[cols].sample(n=min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED)
data.head()

Unnamed: 0,doc_id,text
135590,02100101.20200703114528002.txt_101360_0,"""이엔드디는 그린 신소재 기업으로 정부가 추진하는 그린 뉴딜 정책의 수혜가 기대된..."
30748,02100851.20200211070816001.txt_051910_0,국제 신용평가사 무디스는 LG화학의 장기 신용등급을 한 단계 낮췄다 10일 기...
677787,02100601.20211111172031002.txt_271560_8,오리온은 국내 가격을 동결하고 해외에서는 일부 제품 가격을 올리는 투 트랙 전략을 ...
291166,02100601.20220128080812001.txt_270870_0,상상인증권은 28일 뉴트리가 올해 온라인 매출 비중을 확대하며 수익성을 개선할 것...
69410,02100311.20200415060111001.txt_005930_6,삼성전자는 이 같은 YMTC의 추격에 특유의 ‘초격차’로 대응할 방침이다 삼성전자...


time: 6.63 s (started: 2023-02-10 09:30:52 +00:00)


In [5]:
# Compose the tokenizer config (`mecab_econ`) and the normalizer config (`formal_ko`).
cfg_mcb = eKonf.compose("preprocessor/tokenizer=mecab_econ")
cfg_norm = eKonf.compose("preprocessor/normalizer=formal_ko")

# Attach the normalizer config to the tokenizer config, then instantiate the tokenizer.
cfg_mcb.normalize = cfg_norm
mecab = eKonf.instantiate(cfg_mcb, verbose=True)


INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.normalizer.Normalizer...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: /workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic


time: 877 ms (started: 2023-02-10 09:31:00 +00:00)


In [6]:
# Tokenize the `text` column of the sampled documents with the `pipeline/tokenize` pipe.
# NOTE(review): `data` is overwritten in place, so re-running this cell feeds
# already-tokenized text back into the tokenizer.
# NOTE(review): the logs of this step report `userdic_path: None`, i.e. the pipe's
# default tokenizer appears to run without the user dictionary configured for
# `mecab_econ` elsewhere in this notebook — confirm this is intended.

cfg = eKonf.compose("pipeline/tokenize")
data = eKonf.pipe(data, cfg)
data.head()

INFO:ekorpkit.hyfi.pipe:Applying pipe: functools.partial(<function tokenize at 0x7faa3a177310>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': None, 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: None
INFO:ekorpkit.hyfi.pipe:Using batcher with minibatch size: 21
INFO:ekorpkit.hyfi.utils.batch.batcher

Tokenizing column: text:   0%|          | 0/48 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to segment: 0:00:04.993487


Unnamed: 0,doc_id,text
135590,02100101.20200703114528002.txt_101360_0,"""/SY 이/JKS 엔드/NNG 디/NNG 는/JX /SP 그린/VV+ETM /SP..."
30748,02100851.20200211070816001.txt_051910_0,국제/NNG /SP 신용/NNG 평가/NNG 사/VV+EC /SP 무디스/NNP 는...
677787,02100601.20211111172031002.txt_271560_8,오리온/NNP 은/JX /SP 국내/NNG /SP 가격/NNG 을/JKO /SP 동...
291166,02100601.20220128080812001.txt_270870_0,상상/NNG 인증/NNG 권/XSN 은/JX /SP 28/SN 일/NNBC /SP ...
69410,02100311.20200415060111001.txt_005930_6,삼성전자/NNP 는/JX /SP 이/MM /SP 같/VA 은/ETM /SP YMTC...


time: 5.24 s (started: 2023-02-10 09:31:10 +00:00)


In [7]:
# Extract tokens from the tokenized `text` column and save the result.
# Optional settings that are currently left at their defaults:
#   - tkn_cfg.extract.strip_pos = False
#   - cfg.stopwords_path = str(ws.project_dir / "esg/data/stopwords/stopwords.txt")

# Compose the pipe config and the tokenizer config it should use.
cfg = eKonf.compose("pipeline/extract_tokens")
tkn_cfg = eKonf.compose("preprocessor/tokenizer=mecab_econ")

cfg.preprocessor.tokenizer = tkn_cfg
cfg.nouns_only = False  # presumably keeps non-noun tokens as well — TODO confirm
data = eKonf.pipe(data, cfg)

# Persist the extracted tokens for the similarity computation.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
eKonf.save_data(data, tokenized_data_file)


INFO:ekorpkit.hyfi.pipe:Applying pipe: functools.partial(<function extract_tokens at 0x7faa3a1773a0>)
INFO:ekorpkit.pipelines.pipe:instantiating tokenizer
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.tokenizer.MecabTokenizer ...
INFO:ekorpkit.preprocessors.tokenizer:Initializing mecab with {'userdic_path': '/workspace/projects/ekorpkit/ekorpkit/resources/dictionaries/mecab/ekon_v1.dic', 'backend': 'mecab-python3', 'verbose': True}...
INFO:ekorpkit.preprocessors.tokenizer:instantiating ekorpkit.preprocessors.stopwords.Stopwords...
INFO:ekorpkit.hyfi.hydra:instantiating ekorpkit.preprocessors.stopwords.Stopwords ...
INFO:ekorpkit.preprocessors.tokenizer:MecabTokenizer initialized with:
INFO:ekorpkit.preprocessors.tokenizer:	return_as_list: False
INFO:ekorpkit.tokenizers.mecab:MeCab uses mecab-python3 as backend.
INFO:ekorpkit.tokenizers.mecab:Mecab uses system dictionary: /opt/conda/lib/python3.8/site-packages/mecab_ko_dic/dicdir, user dictionary: /workspace/projects/eko

Extracting column: text:   0%|          | 0/48 [00:00<?, ?it/s]

INFO:ekorpkit.pipelines.pipe: >> elapsed time to extract tokens: 0:00:00.777058
INFO:ekorpkit.hyfi.io.file:Saving dataframe to /workspace/projects/ekorpkit-book/exmaples/esg/data/similarity/tokenized_data.parquet


time: 1.33 s (started: 2023-02-10 09:31:20 +00:00)


## Predict similarity

Similarity is measured between pairs of news articles: first across all sampled articles, and then among the articles published within the same 7-day window. Each pair is scored by the cosine similarity of the two documents' TF-IDF vectors.


In [22]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the tokenized documents and reset the index to 0..n-1.
tokenized_data_file = ws.project_dir / "esg/data/similarity/tokenized_data.parquet"
data = eKonf.load_data(tokenized_data_file)
data = data.reset_index(drop=True)

# Create the TF-IDF matrix; the text is already tokenized, so split on single spaces.
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "))
tfidf_matrix = vectorizer.fit_transform(data['text'])

# Calculate the cosine similarity between every pair of documents.
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Keep only the strict upper triangle (k=1) so that each unordered pair appears
# once and self-similarities on the diagonal are excluded.
upper_triangle_indices = np.triu_indices_from(cosine_similarities, k=1)
row_idx, col_idx = upper_triangle_indices

# Look up document IDs by position on a NumPy array. This replaces one Series
# label lookup per pair element (~n^2 Python-level lookups) with two vectorized
# gathers, and does not depend on the index labels matching row positions.
doc_ids = data['doc_id'].to_numpy()
doc_ids_1 = doc_ids[row_idx]
doc_ids_2 = doc_ids[col_idx]
doc_id_pairs = list(zip(doc_ids_1, doc_ids_2))
similarities = cosine_similarities[upper_triangle_indices]

# Create a result data frame in a long format with document ID pairs and similarity as columns.
result_df = pd.DataFrame({'doc_id_1': doc_ids_1,
                          'doc_id_2': doc_ids_2,
                          'similarity': similarities})

# Display the result data frame.
result_df

Unnamed: 0,doc_id_1,doc_id_2,similarity
0,02100101.20200703114528002.txt_101360_0,02100851.20200211070816001.txt_051910_0,0.109638
1,02100101.20200703114528002.txt_101360_0,02100601.20211111172031002.txt_271560_8,0.120655
2,02100101.20200703114528002.txt_101360_0,02100601.20220128080812001.txt_270870_0,0.114443
3,02100101.20200703114528002.txt_101360_0,02100311.20200415060111001.txt_005930_6,0.092372
4,02100101.20200703114528002.txt_101360_0,02100601.20211014101325001.txt_001680_0,0.108429
...,...,...,...
499495,02100501.20201210175145002.txt_003670_3,02100701.20210527101452001.txt_007070_2,0.017959
499496,02100501.20201210175145002.txt_003670_3,02100701.20210128164141001.txt_066570_0,0.035580
499497,02100601.20210528162407001.txt_008770_1,02100701.20210527101452001.txt_007070_2,0.034574
499498,02100601.20210528162407001.txt_008770_1,02100701.20210128164141001.txt_066570_0,0.067384


time: 4.5 s (started: 2023-02-10 09:48:22 +00:00)


In [None]:
# Measure pairwise similarity among documents that fall into the same sliding
# window of WINDOW_DAYS calendar days.
WINDOW_DAYS = 7

# The tokenized frame was reduced to `doc_id` and `text`, so a `date` column may
# be missing. Use it when present; otherwise derive it from `doc_id`.
# NOTE(review): doc_id is "<filename>_<codes>_<chunk_id>" and the sample filenames
# look like "02100101.20200703114528002.txt"; the 8 digits after the first dot are
# assumed to be the publication date (YYYYMMDD) — confirm against the source data.
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])
else:
    date_str = data['doc_id'].str.extract(r'^[^.]*\.(\d{8})', expand=False)
    data['date'] = pd.to_datetime(date_str, format='%Y%m%d', errors='coerce')

# Day-level timestamps, computed once instead of on every loop iteration.
doc_dates = data['date'].dt.normalize()

# Sorted unique dates: `unique()` keeps order of appearance, which is arbitrary
# for a randomly sampled frame.
unique_dates = pd.DatetimeIndex(doc_dates.dropna().unique()).sort_values()
last_date = unique_dates.max() if len(unique_dates) else None
window = pd.Timedelta(days=WINDOW_DAYS)
one_day = pd.Timedelta(days=1)

# Create a list to store the result data frames for each window.
result_dfs = []

# Each unique date starts a window [start_date, start_date + WINDOW_DAYS days).
for start_date in unique_dates:
    end_date = start_date + window
    # Only use windows that fit entirely inside the observed date range.
    if end_date > last_date + one_day:
        break

    # Get the rows for the current window (rows with a missing date never match).
    in_window = (doc_dates >= start_date) & (doc_dates < end_date)
    interval_data = data.loc[in_window]

    # A pair needs at least two documents.
    if len(interval_data) < 2:
        continue

    # Create the TF-IDF matrix for the current window.
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(" "))
    tfidf_matrix = vectorizer.fit_transform(interval_data['text'])

    # Calculate the cosine similarity between the documents in the current window.
    cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Strict upper triangle: each unordered pair once, no self-pairs.
    upper_triangle_indices = np.triu_indices_from(cosine_similarities, k=1)
    row_idx, col_idx = upper_triangle_indices

    # The triangle indices are positions within the window, not index labels of
    # the filtered frame, so look the IDs up positionally on a NumPy array.
    doc_ids = interval_data['doc_id'].to_numpy()
    similarities = cosine_similarities[upper_triangle_indices]

    # Long-format result with document ID pairs, similarity, and window bounds.
    result_dfs.append(pd.DataFrame({'doc_id_1': doc_ids[row_idx],
                                    'doc_id_2': doc_ids[col_idx],
                                    'similarity': similarities,
                                    'start_date': start_date.date(),
                                    'end_date': end_date.date()}))

# Concatenate the per-window results; fall back to an empty frame with the
# expected columns because pd.concat raises on an empty list.
result_columns = ['doc_id_1', 'doc_id_2', 'similarity', 'start_date', 'end_date']
if result_dfs:
    result_df = pd.concat(result_dfs, ignore_index=True)
else:
    result_df = pd.DataFrame(columns=result_columns)

# Display the result data frame.
result_df
