In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load the news dataset from a CSV located next to the notebook
# (path is relative to the current working directory).
summary = pd.read_csv('./news_summary.csv')
# Preview the first rows to check the columns that were read.
summary.head()

Unnamed: 0,rgs_dt,tck_iem_cd,til_ifo,ctgy_cfc_ifo,mdi_ifo,news_smy_ifo,rld_ose_iem_tck_cd,url_ifo,article,summary
0,20230801,Latest,Steady Start Seen For Indonesia Stock Market,Markets,RTTNews,(RTTNews) - The Indonesia stock market has mov...,_,https://www.nasdaq.com/articles/steady-start-s...,"July 31, 2023 — 09:34 pm EDT Written by ...",The Indonesia stock market has risen by more t...
1,20230801,ARES,Ares Management (ARES) Q2 Earnings: Taking a L...,Technology,Zacks,"For the quarter ended June 2023, Ares Manageme...",ARES,https://www.nasdaq.com/articles/ares-managemen...,"August 01, 2023 — 09:30 am EDT Written by...",Ares Management reported revenue of $792.1 mil...
2,20230801,EW,RBC Capital Reiterates Edwards Lifesciences (E...,Stocks,Fintel,"Fintel reports that on August 1, 2023, RBC Cap...",EW,https://www.nasdaq.com/articles/rbc-capital-re...,"August 01, 2023 — 07:30 pm EDT Written by...",RBC Capital reiterated its Outperform recommen...
3,20230801,LMAT,"Noteworthy Tuesday Option Activity: LMAT, MSTR...","Markets,Stocks,Options",BNK Invest,Looking at options trading activity among comp...,"LMAT,LMAT,MSTR,TPC",https://www.nasdaq.com/articles/noteworthy-tue...,"August 01, 2023 — 03:19 pm EDT Written by...",BNK Invest analyzed options trading activity a...
4,20230801,EW,Notable Two Hundred Day Moving Average Cross - EW,"Markets,Stocks",BNK Invest,"In trading on Tuesday, shares of Edwards Lifes...",EW,https://www.nasdaq.com/articles/notable-two-hu...,"August 01, 2023 — 11:07 am EDT Written by...","On August 1, 2023, at 11:07 am EDT, shares of ..."


## Similarity

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
# Fetch the NLTK resources this notebook relies on: the stop-word lists and
# the "punkt" tokenizer models.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# English stop words, held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sewoong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sewoong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Tokenize a text and remove stop words.
def preprocess_text(text):
    """Normalise raw text into a space-separated string of content words.

    The text is lower-cased, tokenized with ``word_tokenize`` and filtered so
    that only purely alphabetic tokens that are not in ``stop_words`` remain.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example ``None``
            or a float NaN standing in for a missing value) is treated as
            empty text.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when the
        input is not a string or no token survives the filter.
    """
    # A non-string value has no ``.lower()``; treat it as "no text" instead
    # of crashing in the middle of a long pairwise run.
    if not isinstance(text, str):
        return ''

    words = word_tokenize(text.lower())
    # ``isalpha`` keeps only tokens made entirely of letters, so numbers,
    # punctuation and mixed tokens such as "q2" are dropped along with the
    # stop words.
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

def get_similiarity(text1,text2):
    """Return the TF-IDF cosine similarity between two raw texts.

    Both texts are cleaned with ``preprocess_text`` and vectorized by a
    ``TfidfVectorizer`` fitted on just these two documents; the cosine
    similarity of the two resulting vectors is returned.

    Args:
        text1: First raw text.
        text2: Second raw text.

    Returns:
        float: The cosine similarity of the two TF-IDF vectors, or ``0.0``
        when either text has no usable tokens.
    """
    processed_article_1 = preprocess_text(text1)
    processed_article_2 = preprocess_text(text2)

    # If either side is empty after cleaning there is nothing to compare.
    # Returning early also avoids the "empty vocabulary" ValueError that the
    # vectorizer raises when both documents are empty.
    if not processed_article_1 or not processed_article_2:
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([processed_article_1, processed_article_2])
    except ValueError:
        # The vectorizer applies its own token pattern on top of our cleaning;
        # if that leaves no vocabulary at all it raises ValueError. Treat the
        # pair as having no measurable overlap.
        return 0.0

    # Cosine similarity between the first and the second document vector.
    cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    # The result is a 1x1 matrix; unwrap the single value.
    return cosine_similarities[0][0]

In [56]:
# Number of articles (rows) in the dataset.
n = summary.shape[0]
# n x n matrix of pairwise similarities. NaN marks a pair that has not been
# scored yet. ``np.nan`` is used because the ``np.NaN`` alias was removed in
# NumPy 2.0, and ``np.full`` allocates the matrix directly instead of
# building an n*n Python list first.
sim_matrix = np.full((n, n), np.nan)

In [57]:
# Pull the text column into a plain list once. Indexing is then positional,
# which matches ``range(n)`` even if ``summary`` does not carry a default
# 0..n-1 index, and avoids a DataFrame lookup on every inner iteration.
news_texts = summary['news_smy_ifo'].tolist()

for i in tqdm(range(n)):
    txt1 = news_texts[i]
    for j in range(n):
        # NaN means this pair has not been scored yet. Pairs already filled
        # in through the mirrored write below are skipped.
        if np.isnan(sim_matrix[i, j]):
            sim = get_similiarity(txt1, news_texts[j])
            # The similarity is symmetric, so store it in both positions.
            sim_matrix[i, j] = sim
            sim_matrix[j, i] = sim

100%|█████████████████████████████████████████| 998/998 [10:52<00:00,  1.53it/s]


In [60]:
# Next group label to hand out; labels start at 1.
count = 1

# group[i] holds the label of node i; -1 means "not assigned yet".
group = np.array([-1] * n)

# Adjacency lists: graph[i] will collect the nodes linked to node i.
graph = [[] for _ in range(n)]

In [61]:
# Two articles are linked when their similarity exceeds this value.
threshold = 0.4

# Start from empty adjacency lists so that re-running this cell does not
# stack a second copy of every edge on top of the first.
for neighbours in graph:
    neighbours.clear()

for i in tqdm(range(n)):
    # sim_matrix is filled symmetrically, so visiting only the pairs with
    # j > i sees every undirected edge exactly once. Scanning the full row
    # and appending in both directions would record each edge twice.
    for j in range(i + 1, n):
        if sim_matrix[i, j] > threshold:
            graph[i].append(j)
            graph[j].append(i)

100%|███████████████████████████████████████| 998/998 [00:00<00:00, 6291.17it/s]


In [62]:
# Explore every node reachable from `node`.
def dfs(node):
    """Spread the group label of ``node`` to every node reachable from it.

    Reads the module-level ``graph`` (adjacency lists) and updates the
    module-level ``group`` in place: every reachable node whose label is
    still ``-1`` receives ``group[node]``.

    The traversal uses an explicit stack rather than recursion, so a long
    chain of linked nodes cannot exceed Python's recursion limit.

    Args:
        node: Index of the start node; ``group[node]`` must already be set.
    """
    label = group[node]
    pending = [node]

    while pending:
        current = pending.pop()
        for neighbor in graph[current]:
            # Only nodes without a group yet are claimed and expanded;
            # labelling before pushing keeps a node from being queued twice.
            if group[neighbor] == -1:
                group[neighbor] = label
                pending.append(neighbor)

# Walk the nodes from first to last; each node that is still unlabelled
# starts a new group, and dfs() pulls everything connected to it into
# that same group.
for i in tqdm(range(n)):

    # Already claimed by an earlier traversal: nothing to do.
    if group[i] != -1:
        continue

    group[i] = count
    count += 1
    dfs(i)

# Print the number of connected components (count is one past the last label).
print(count - 1)

100%|█████████████████████████████████████| 998/998 [00:00<00:00, 111262.44it/s]

197





In [66]:
# Distinct group labels and how many nodes carry each label.
uniq,cnts = np.unique(group,return_counts=True)

In [72]:
# Order of indices that sorts the counts from largest to smallest.
argsort = cnts.argsort()[::-1]
# Apply the same ordering to labels and counts so they stay aligned.
uniq, cnts = uniq[argsort], cnts[argsort]

In [73]:
# Group labels, ordered by the sort applied to cnts (largest count first).
uniq

array([  3,   6,  15,   2,  36,   4,  17,   5,   9,  43,  18,  60,  11,
        12,  29,  38,  89,   7,  13,  20,   1, 107,  48,  61,  27,  31,
        28,  46,  87,  52,  16,   8,  85,  75,  34, 159,  35, 147,  54,
        74,  72, 143,  98, 138,  94,  56,  30,  37,  88,  33,  86,  78,
        40, 108,  79,  83,  14, 116, 113, 112,  64,  66,  76, 106, 104,
       102,  69,  71,  73,  77,  93,  26, 125,  25, 149,  45, 141,  44,
        21, 145, 146, 150,  19,  23, 157, 162,  24, 166, 172,  47,  81,
        49,  53,  50, 132, 131, 128, 129, 130,  10,  39,  57,  58,  32,
        59,  22,  62,  41,  67,  42,  63,  51,  65,  70,  55,  68,  80,
       197,  82, 167, 175, 174, 173, 171, 170, 169, 168, 165, 177, 164,
       163, 161, 160, 158, 156, 155, 176, 178, 153, 188, 195, 194, 193,
       192, 191, 190, 189, 187, 179, 186, 185, 184, 183, 182, 181, 180,
       154, 152,  84, 101, 115, 114, 111, 110, 109, 105, 103, 100, 118,
       196,  97,  96,  95,  92,  91,  90, 117, 119, 151, 135, 14

In [74]:
# Group sizes in descending order, aligned with uniq.
cnts

array([192, 183,  51,  45,  30,  25,  23,  15,  14,  13,  12,  12,  12,
        12,   8,   8,   7,   7,   7,   6,   6,   6,   5,   5,   5,   5,
         5,   5,   4,   4,   4,   4,   4,   4,   4,   4,   4,   3,   3,
         3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
         3,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
         2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
         2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
         2,   2,   2,   2,   2,   2,   2,   2,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   

In [88]:
# Positions of the nodes labelled 3. In the run shown, 3 is the first label
# in the size-sorted `uniq`, i.e. the largest group.
in_group_3 = group == 3
ids = np.where(in_group_3)[0]

In [91]:
# Member of the selected group with the longest adjacency list
# (the first such member when several tie).
degrees = [len(graph[i]) for i in ids]
ids[np.argmax(degrees)]

449

In [92]:
# Show the article at position 449, the index returned by the previous cell
# in the run shown.
summary.iloc[449]

rgs_dt                                                         20230801
tck_iem_cd                                                         ARCB
til_ifo               Citigroup Maintains ArcBest (ARCB) Buy Recomme...
ctgy_cfc_ifo                                                     Stocks
mdi_ifo                                                          Fintel
news_smy_ifo          Fintel reports that on July 31, 2023, Citigrou...
rld_ose_iem_tck_cd                                                 ARCB
url_ifo               https://www.nasdaq.com/articles/citigroup-main...
article               August 01, 2023 — 09:02 am EDT      Written by...
summary               Fintel reports that on July 31, 2023, Citigrou...
Name: 449, dtype: object