In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time

## Data Load

In [2]:
# Load the news-summary table from the project-relative data folder and
# preview its first rows.
summary = pd.read_csv('./data/news_summary.csv')
summary.head()

Unnamed: 0,rgs_dt,tck_iem_cd,til_ifo,ctgy_cfc_ifo,mdi_ifo,news_smy_ifo,rld_ose_iem_tck_cd,url_ifo,article,summary
0,20230801,Latest,Steady Start Seen For Indonesia Stock Market,Markets,RTTNews,(RTTNews) - The Indonesia stock market has mov...,_,https://www.nasdaq.com/articles/steady-start-s...,"July 31, 2023 — 09:34 pm EDT Written by ...",The Indonesia stock market has risen by more t...
1,20230801,ARES,Ares Management (ARES) Q2 Earnings: Taking a L...,Technology,Zacks,"For the quarter ended June 2023, Ares Manageme...",ARES,https://www.nasdaq.com/articles/ares-managemen...,"August 01, 2023 — 09:30 am EDT Written by...",Ares Management reported revenue of $792.1 mil...
2,20230801,EW,RBC Capital Reiterates Edwards Lifesciences (E...,Stocks,Fintel,"Fintel reports that on August 1, 2023, RBC Cap...",EW,https://www.nasdaq.com/articles/rbc-capital-re...,"August 01, 2023 — 07:30 pm EDT Written by...",RBC Capital reiterated its Outperform recommen...
3,20230801,LMAT,"Noteworthy Tuesday Option Activity: LMAT, MSTR...","Markets,Stocks,Options",BNK Invest,Looking at options trading activity among comp...,"LMAT,LMAT,MSTR,TPC",https://www.nasdaq.com/articles/noteworthy-tue...,"August 01, 2023 — 03:19 pm EDT Written by...",BNK Invest analyzed options trading activity a...
4,20230801,EW,Notable Two Hundred Day Moving Average Cross - EW,"Markets,Stocks",BNK Invest,"In trading on Tuesday, shares of Edwards Lifes...",EW,https://www.nasdaq.com/articles/notable-two-hu...,"August 01, 2023 — 11:07 am EDT Written by...","On August 1, 2023, at 11:07 am EDT, shares of ..."


In [3]:
# Keep a single row per article URL, then renumber the rows 0..n-1 so that
# label-based lookups such as summary.loc[i, ...] line up with positions.
summary = summary.drop_duplicates(subset='url_ifo').reset_index(drop=True)

In [26]:
import pathlib
import pickle

# Collect the pickled lookup tables in a deterministic (name-sorted) order.
pkls = sorted(pathlib.Path('./information').glob('*.pkl'))


def _load_pickle(path):
    """Unpickle `path`, closing the file handle even if loading fails."""
    # SECURITY: pickle.load can execute arbitrary code; only use this on
    # files produced by a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE(review): the files are addressed by their position in the sorted
# listing; presumably the names sort as company description, company
# industry, ETF description, ETF holdings -- verify against ./information.
comp_desc = _load_pickle(pkls[0])
comp_industry = _load_pickle(pkls[1])
etf_desc = _load_pickle(pkls[2])
etf_holdings = _load_pickle(pkls[3])

## Similarity

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
# Fetch the NLTK resources used below (stopword list and the punkt
# tokenizer models), then build the English stopword set once.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sewoong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sewoong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Tokenize a text and remove stopwords.
def preprocess_text(text):
    """Return `text` lower-cased, tokenized and filtered, as one string.

    Tokens are kept only when they are purely alphabetic and are not in the
    module-level `stop_words` set; the survivors are joined with single
    spaces.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop everything that is not purely alphabetic (numbers, punctuation).
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def get_similiarity(text1,text2):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are cleaned with `preprocess_text`, a fresh `TfidfVectorizer`
    is fitted on just this pair, and the cosine similarity of the two
    resulting vectors is returned.

    Returns 0.0 when the vectorizer cannot build a vocabulary because
    neither text contains a usable token after preprocessing.
    """
    processed_article_1 = preprocess_text(text1)
    processed_article_2 = preprocess_text(text2)

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([processed_article_1, processed_article_2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives in either document; treat such a pair as unrelated
        # instead of aborting the whole pairwise computation.
        return 0.0

    # Cosine similarity between the first and the second document vector.
    cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return cosine_similarities[0][0]

In [7]:
# Number of articles, and an n x n similarity matrix pre-filled with NaN.
# NaN marks "not computed yet" for the pairwise loop that fills it in.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
n = summary.shape[0]
sim_matrix = np.full((n, n), np.nan)

In [8]:
start = time.time()

# Pull the text column out once instead of indexing the frame per pair.
texts = summary['news_smy_ifo']

for i in tqdm(range(n)):
    left_text = texts.loc[i]
    row = sim_matrix[i]  # view into sim_matrix: writes land in the matrix
    for j in range(n):
        # Skip cells already filled through the symmetric write below.
        if not np.isnan(row[j]):
            continue
        score = get_similiarity(left_text, texts.loc[j])
        row[j] = score
        sim_matrix[j][i] = score  # mirror the value across the diagonal

end = time.time()
elapsed_minutes = (end - start) / 60
print('유사도 계산 소요시간 :{}m'.format(elapsed_minutes))

  0%|          | 0/619 [00:00<?, ?it/s]

100%|██████████| 619/619 [04:13<00:00,  2.44it/s]

유사도 계산 소요시간 :4.230026988188426m





## Grouping

In [78]:
# Columns of `summary` that make_group copies into the per-group
# representative table.
need_columns = ['summary','rld_ose_iem_tck_cd','mdi_ifo','til_ifo','ctgy_cfc_ifo']

In [79]:
def dfs(node,graph,group):
    """Spread the group label of `node` to every unlabelled node it reaches.

    Parameters
    ----------
    node : int
        Start node; `group[node]` must already hold its label.
    graph : sequence of sequences of int
        Adjacency lists; `graph[k]` lists the neighbours of node `k`.
    group : mutable sequence of int
        Per-node labels, with -1 meaning "no group yet". Updated in place.

    Neighbours that already carry a label are left untouched and are not
    traversed. The traversal uses an explicit stack rather than recursion,
    so long chains or large components cannot hit Python's recursion limit.
    """
    label = group[node]
    pending = [node]
    while pending:
        current = pending.pop()
        for neighbor in graph[current]:
            # Only nodes that do not belong to any group yet are claimed.
            if group[neighbor] == -1:
                # Label on discovery so a node is never queued twice.
                group[neighbor] = label
                pending.append(neighbor)

def make_group(thr, min_group_frac=0.01, max_groups=10):
    """Cluster articles whose pairwise similarity exceeds `thr`.

    Reads the module-level `n`, `sim_matrix`, `summary` and `need_columns`,
    and labels connected components with `dfs`.

    Parameters
    ----------
    thr : float
        Two articles are linked when their similarity is strictly greater
        than this value.
    min_group_frac : float, default 0.01
        A group is kept only if it has more than `int(n * min_group_frac)`
        members.
    max_groups : int, default 10
        If more groups than this survive the size filter, the call fails.

    Returns
    -------
    pandas.DataFrame or int
        One row per kept group, ordered by decreasing group size: the
        `need_columns` of the group's best-connected article plus a `cnts`
        column holding the group size. Returns the int -1 (after printing a
        message) when no group survives or when too many do.
    """
    # Boolean adjacency: linked when similarity > thr, never to itself.
    # OR-ing with the transpose keeps the graph undirected even if the
    # matrix is not perfectly symmetric, and stores each edge only once.
    linked = sim_matrix > thr
    np.fill_diagonal(linked, False)
    linked = linked | linked.T
    graph = [np.flatnonzero(row).tolist() for row in tqdm(linked)]

    # -1 means "not assigned yet"; group labels start at 1.
    group = np.full(n, -1)
    next_label = 1
    for node in tqdm(range(n)):
        if group[node] == -1:
            group[node] = next_label
            next_label += 1
            # Everything reachable from this node joins the same group.
            dfs(node, graph, group)

    # Group labels and sizes, largest group first.
    uniq, cnts = np.unique(group, return_counts=True)
    order = cnts.argsort()[::-1]
    uniq, cnts = uniq[order], cnts[order]

    # Discard groups that are too small to be meaningful.
    big_enough = cnts > int(n * min_group_frac)
    uniq, cnts = uniq[big_enough], cnts[big_enough]

    if len(uniq) == 0:
        print('cant group news')
        return -1

    if len(uniq) > max_groups:
        print('too low threshold')
        return -1

    # Representative of a group = its member with the most links.
    rep_ids = []
    for label in uniq:
        members = np.flatnonzero(group == label)
        degrees = [len(graph[member]) for member in members]
        rep_ids.append(members[np.argmax(degrees)])

    # Select all representatives at once instead of concatenating row by row.
    represents = summary.loc[rep_ids, need_columns].copy()
    represents['cnts'] = cnts
    return represents

In [80]:
# Try thresholds from strict to loose and stop at the first one for which
# make_group returns a table of representatives.
for thr in [0.8,0.7,0.6,0.5,0.4]:
    print(f'testing with threshold {thr}...')
    rep =  make_group(thr)
    # make_group returns the int -1 on failure. `not` is required here:
    # `~isinstance(...)` is a bitwise NOT that yields -1 or -2, both truthy,
    # so the loop would always stop at the first threshold.
    if not isinstance(rep,int):
        print('final threshold :',thr)
        print(f'group 개수 : {len(rep)}')
        break

testing with threshold 0.8...


100%|██████████| 619/619 [00:00<00:00, 16731.16it/s]
100%|██████████| 619/619 [00:00<00:00, 434377.48it/s]

final threshold : 0.8
group 개수 : 5





In [81]:
# Show the result of the threshold search: one representative article per
# group, with the group size in `cnts`.
rep

Unnamed: 0,summary,rld_ose_iem_tck_cd,mdi_ifo,til_ifo,ctgy_cfc_ifo,cnts
196,Gartner reported quarterly earnings of $2.85 p...,"IT,III",Zacks,Gartner (IT) Q2 Earnings and Revenues Top Esti...,"Stocks,Investing",65
72,Li Auto (NASDAQ:LI) is set to report earnings ...,"LI,LI,TSLA,TM,F,BYDDF",InvestorPlace,"LI Stock Investors, Take Note: What to Watch f...",Markets,15
87,TrueCar (TRUE) reported a quarterly loss of $0...,TRUE,Zacks,"TrueCar (TRUE) Reports Q2 Loss, Tops Revenue E...","Stocks,Investing",9
82,"On August 1, 2023, Freeport-McMoRan Inc's (FCX...",FCX,BNK Invest,Freeport-McMoRan Reaches Analyst Target Price,"Markets,Stocks,ETFs",8
22,'./df_mistral.csv',"ZI,ZI",The Motley Fool,ZoomInfo Technologies (ZI) Q2 2023 Earnings Ca...,Markets,7
