In [30]:
import os
import pandas as pd
from preprocess import *
from tqdm import tqdm
from datetime import datetime
import pickle

#### set dir

In [34]:
# data, models 필요!

data_dir = "./data"
model_dir = "./models"

#### preprocess

In [35]:
tickers = []
for file in os.listdir(data_dir):
    ticker, suffix = file.split(".")
    if ticker and suffix == "csv":
        tickers.append(ticker)
tickers[:5]

['017960', '036010', '037440', '101680', '005380']

In [4]:
cleaned = preprocess_ticker('005930')
cleaned.split()[:10]

['삼성물산', '삼성전자', '규모', '반도체', '생산', '시설', '공사', '수주데일리', '남궁', '민관']

In [36]:
# data: Dict[str, List[str]], takes about 18 min in M1
data = {ticker: preprocess_ticker(ticker).split() for ticker in tqdm(tickers)}

100%|█████████████████████████████████████| 2160/2160 [2:12:11<00:00,  3.67s/it]


### train model

In [32]:
from ticker_finder import TickerFinder

In [37]:
tk = TickerFinder('original')

In [38]:
# Load if have trained data

# tk.load(model_dir, version='original')

tk.fit(data) # takes about 3 min in M1

start training tfidf...
tfidf finished! time: 1070.8446609973907
start training fasttext...
fasttext finished! time: 2066.9659559726715
start making keywords dict...
keywords dict finished! time: 5.742472887039185
train finished!


In [39]:
# save after train
version = datetime.now().strftime("%Y-%m-%d")
tk.save(model_dir)

./models/2023-06-21.model saved


#### tfidf result

In [40]:
ticker = '005930'
words = tk.tfidf[tk.bow[ticker]]
words[:5]

[(1, 0.01509312718180835),
 (3, 0.0010783227187121025),
 (13, 0.0031095202807639383),
 (37, 0.0017940720590537127),
 (42, 0.0008214241942268604)]

In [41]:
word, score = max(words, key=lambda x: x[1])
tk.dct[word]

'삼성전자'

#### fasttext result

In [42]:
tk.fasttext.wv.similarity('소주', '맥주')

0.7864743

In [43]:
tk.fasttext.wv.similar_by_word('반도체')

[('상보반도체', 0.8876902461051941),
 ('삼성전자반도체', 0.8461573719978333),
 ('반도체한미', 0.8264661431312561),
 ('인수반도체', 0.8055769801139832),
 ('경신반도체', 0.7774364352226257),
 ('투자반도체', 0.7770246863365173),
 ('청약반도체', 0.7581021189689636),
 ('내역반도체', 0.7277847528457642),
 ('감소반도체', 0.7247389554977417),
 ('유도체', 0.7079378962516785)]

#### result

In [44]:
related_words = tk.keywords['003380']
related_words

{'하림': 0.8850433931216997,
 '제일': 0.23320101777509053,
 '홀딩스': 0.1741736717944637,
 '지주사': 0.15953111093942998,
 '지주': 0.15190311820003663,
 '식품': 0.07486118383307132,
 'NS': 0.07226746733235516,
 '곡물': 0.07001335870177941,
 '닭고기': 0.06544437640853987,
 '사료': 0.06204353885520301,
 '합병': 0.05990998634455109,
 '김홍국': 0.058095847345716246,
 '최상위': 0.057082489902029125,
 '민동기': 0.05641985191675873,
 '스코': 0.048136220263721434,
 '지배': 0.04665470111598514,
 '제일사료': 0.04475991448232351,
 '지주회사': 0.04466725726875858}

In [45]:
related_score = tk.score('반도체', ticker)
related_score

('005930', 1.8400524560805656)

In [46]:
info = pd.read_csv("ticker.csv")

In [52]:
keyword = '보험'
n = 30

tickers = [ticker for ticker, _ in tk.predict(keyword, n)]
info[info['종목코드'].isin(tickers)].reset_index(drop='index')

Unnamed: 0,종목코드,종목명,시장구분,업종명,시가총액
0,370,한화손해보험,KOSPI,보험,496724083325
1,400,롯데손해보험,KOSPI,보험,537502506240
2,540,흥국화재,KOSPI,보험,206540103675
3,810,삼성화재,KOSPI,보험,10801462836000
4,1450,현대해상,KOSPI,보험,2959140000000
5,1520,동양,KOSPI,비금속광물,250618266150
6,3470,유안타증권,KOSPI,증권,564858310080
7,3690,코리안리,KOSPI,보험,1019558078360
8,5830,DB손해보험,KOSPI,보험,5508240000000
9,10240,흥국,KOSDAQ,기계·장비,79727843120
