# Tokens

In [None]:
# Replicate API token (placeholder text: replace it with the token issued by replicate).
# It is exported as REPLICATE_API_TOKEN before the summarisation step.
API = '--replicate에서 발급 받은 API Token--'
# Value sent as the User-Agent header of every HTTP request made by the crawlers
# (placeholder text: replace it with a browser agent string).
AGENT = '--브라우저 agent--'

# import

In [4]:
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import replicate

from tqdm import tqdm
import os
import pathlib
import time
import pickle

import pandas as pd

# Crawl

## Crawl news & company information

전체 데이터 중 8월 1일 데이터 크롤링

In [None]:
# Path of the CSV file that is loaded with pd.read_csv further down
csv_file_path = "NASDAQ_RSS_IFO_202308.csv"

# "url" 열을 기반으로 기사 본문을 웹 크롤링하여 "article" 열 추가하는 함수
def get_articles_for_rows(df, start_idx, end_idx):
    """Download the article text for a positional slice of rows.

    The URL of each article is read from the column at position 7 of ``df``
    for rows ``start_idx`` up to (not including) ``end_idx``.  Every page is
    requested with the ``AGENT`` user agent after a one-second pause, and the
    text of all ``<p>`` tags is concatenated, one paragraph per line.

    Args:
        df: DataFrame whose column at position 7 holds the article URLs.
        start_idx: Positional index of the first row to fetch.
        end_idx: Positional index one past the last row to fetch.

    Returns:
        A list with one string per selected row.  The string is empty when
        the response status is not 200 or when the request raised.
    """
    urls = df.iloc[start_idx:end_idx, 7]
    if len(urls) == 0:
        # Nothing to fetch: do not set up an HTTP session at all.
        return []

    headers = {'User-Agent': AGENT}
    articles = []

    # One session for the whole batch: the retry policy is configured once,
    # connections are reused, and the session is closed when the crawl ends.
    with requests.Session() as session:
        retry_strategy = _build_retry_strategy()
        for prefix in ("http://", "https://"):
            session.mount(prefix, HTTPAdapter(max_retries=retry_strategy))

        for url in urls:
            try:
                # Wait one second between requests
                time.sleep(1)

                response = session.get(url, headers=headers, timeout=30)
                if response.status_code == 200:
                    articles.append(_extract_paragraph_text(response.text))
                else:
                    articles.append("")
            except Exception as e:
                # Best effort: one bad URL must not abort the crawl, but the
                # result list still needs an entry for this row.
                print(f"Error fetching article: {e}")
                articles.append("")

    return articles


def _build_retry_strategy():
    """Build the retry policy: 3 retries with backoff on 5xx for idempotent methods."""
    options = {
        "total": 3,
        "backoff_factor": 1,
        "status_forcelist": [500, 502, 503, 504],
    }
    methods = ["HEAD", "GET", "OPTIONS"]
    try:
        # urllib3 >= 1.26 names the argument `allowed_methods`; the old
        # `method_whitelist` spelling no longer exists in urllib3 2.x.
        return Retry(allowed_methods=methods, **options)
    except TypeError:
        # Older urllib3 only knows the previous keyword name.
        return Retry(method_whitelist=methods, **options)


def _extract_paragraph_text(html_content):
    """Return the text of every ``<p>`` tag in *html_content*, one per line."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return "".join(p_tag.get_text() + '\n' for p_tag in soup.find_all('p'))

# Load the CSV file
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Keep only the rows whose "rgs_dt" equals 20230801, then drop rows that
# repeat an already-seen "url_ifo" (the first occurrence is kept)
df = df.loc[df['rgs_dt'] == 20230801].drop_duplicates(subset='url_ifo', keep='first')

# Crawl the article text for every remaining row and store it as "article"
df['article'] = get_articles_for_rows(df, start_idx=0, end_idx=len(df))

In [None]:
# Primary ticker of every article
tickers = df['tck_iem_cd'].tolist()

# Related tickers are stored as one comma-separated string per row; '_' is
# treated as "no related tickers".  Non-string cells (e.g. NaN) are treated
# the same way instead of raising AttributeError on .split().
rld_tickers = df['rld_ose_iem_tck_cd'].apply(
    lambda x: x.split(',') if isinstance(x, str) and x != '_' else []
)
for related in rld_tickers:
    tickers.extend(related)

# Normalise to stripped lower case, skip blanks and non-string cells, and
# de-duplicate.  Sorting gives a reproducible order across runs, which a plain
# list(set(...)) does not guarantee.
tickers = sorted(
    {tic.strip().lower() for tic in tickers if isinstance(tic, str) and tic.strip()}
)

In [None]:
# Request headers shared by the stockanalysis.com crawls
headers = {'user-agent':AGENT}

# Scraped results keyed by ticker: description text and industry name
company_info, company_industry = {}, {}

# Tickers that failed at each stage of the company-page crawl:
# request/parse failure, missing description, missing industry, 404 page
parse_err, info_err, indust_err, unavailable_ticker = [], [], [], []

In [None]:
# CSS selector fragments for the stockanalysis.com company page.  Raw strings
# keep the backslash escapes (\:, \[, \.) that the CSS engine needs without
# relying on Python's deprecated handling of unknown escape sequences.
_NOT_FOUND_HEADING = r"main > div > div.mb-4.text-2xl.font-bold.sm\:text-3xl"
# The page content is looked up under either of these two containers.
_PAGE_ROOTS = ("main > div:nth-child(3)", r"main > div.mt-4.sm\:mt-5")
_PROFILE_ROWS = (
    r"div.lg\:float-right.lg\:w-\[336px\] > "
    r"div.mt-7.rounded.border.border-gray-200.bg-gray-50.px-3.pt-3.pb-2"
    r".dark\:border-dark-700.dark\:bg-dark-775.xs\:px-4.xs\:pt-4.lg\:mt-1 > "
    r"table > tbody"
)
_LABEL_CELL = r"td.py-1\.5.px-1.font-semibold.lg\:py-2"
_VALUE_LINK = r"td.py-1\.5.px-1.text-right.lg\:py-2 > a"


def _find_industry(soup):
    """Return the industry named in the page's profile table, or None.

    Scans table rows 1-6 for the row whose label cell reads 'Industry' and
    returns the text of the link in that row's value cell.
    """
    for row in range(1, 7):
        for root in _PAGE_ROOTS:
            row_selector = f"{root} > {_PROFILE_ROWS} > tr:nth-child({row})"
            label = soup.select(f"{row_selector} > {_LABEL_CELL}")
            if label:
                break
        else:
            # The row exists under neither container: nothing more to scan.
            return None
        if label[0].text == 'Industry':
            link = soup.select(f"{row_selector} > {_VALUE_LINK}")
            return link[0].text if link else None
    return None


for ticker in tqdm(tickers):
    url = f'https://stockanalysis.com/stocks/{ticker}/company/'

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception:
        # `Exception` rather than a bare except so Ctrl-C still stops the loop.
        parse_err.append(ticker)
        continue

    # A heading containing '404' is taken to mean there is no stock page for
    # this ticker; such tickers are retried as ETFs later.
    heading = soup.select(_NOT_FOUND_HEADING)
    if heading and '404' in heading[0].text:
        unavailable_ticker.append(ticker)
        continue

    # Company description: first child of whichever container is present.
    description = soup.select(f"{_PAGE_ROOTS[0]} > div:nth-child(1)")
    if not description:
        description = soup.select(f"{_PAGE_ROOTS[1]} > div:nth-child(1)")
    if description:
        company_info[ticker] = description[0].text
    else:
        info_err.append(ticker)

    industry_name = _find_industry(soup)
    if industry_name is None:
        indust_err.append(ticker)
    else:
        company_industry[ticker] = industry_name

    time.sleep(2)
    # Stop the crawl once more than one description could not be extracted.
    if len(info_err)>1:
        print('error')
        break

# Reconcile the failure lists: a ticker that produced data, or that is known
# to be unavailable, is not reported as a failure.  Sorted so the lists have
# a reproducible order.
info_err = sorted(set(info_err) - set(company_info) - set(unavailable_ticker))
indust_err = sorted(set(indust_err) - set(company_industry) - set(unavailable_ticker))
unavailable_ticker = sorted(set(unavailable_ticker) - set(company_info) - set(company_industry))
parse_err = sorted(
    set(parse_err) - set(unavailable_ticker) - set(company_info) - set(company_industry)
)

print('파씽 실패 :',len(parse_err))
print('존재하지 않는 티커코드 :',len(unavailable_ticker))
print('기업 설명 실패 :',len(info_err))
print('사업분야 실패 :',len(indust_err))

print('기업 설명 :',len(company_info))
print('기업 사업분야 :',len(company_industry))

In [None]:
# CSS selectors for the stockanalysis.com ETF page: the description paragraph
# and the table in the fourth block of the same column.  Raw strings keep the
# CSS backslash escapes (\: and \.) as written; in a normal string literal
# they are invalid escape sequences and trigger a warning.
desc = r'main > div.mt-6.lg\:grid.lg\:grid-cols-sidebar_wide.lg\:gap-x-10 > div.space-y-6.lg\:order-2.lg\:pt-1 > div.px-0\.5.lg\:px-0 > p'
holdings = r'main > div.mt-6.lg\:grid.lg\:grid-cols-sidebar_wide.lg\:gap-x-10 > div.space-y-6.lg\:order-2.lg\:pt-1 > div:nth-child(4) > table'

In [None]:
# Imported here because only this cell needs it: read_html is given a
# file-like object, since passing literal HTML text is deprecated in pandas.
from io import StringIO

# ETF description text and holdings table, keyed by ticker
etf_info = {}
etf_holdings = {}
# Tickers whose ETF page could not be fetched or had no description
etf_err = []

# NOTE(review): the slice skips the first 136 entries of unavailable_ticker;
# this looks like a resume offset from an earlier interrupted run. Confirm it
# is still wanted before a fresh run.
for ticker in tqdm(unavailable_ticker[136:]):
    url = f'https://stockanalysis.com/etf/{ticker}'

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        # One failing page must not abort the remaining tickers.
        print(ticker, 'request failed:', e)
        etf_err.append(ticker)
        continue

    # A heading containing '404' is taken to mean there is no ETF page either.
    heading = soup.select(r"main > div > div.mb-4.text-2xl.font-bold.sm\:text-3xl")
    if heading and '404' in heading[0].text:
        print(ticker,'pass')
        continue

    about = soup.select(desc)
    table = soup.find_all('table')
    print(ticker,len(about),len(table))

    if not about:
        # No description paragraph: record the ticker instead of raising
        # IndexError, and keep the polite delay before the next request.
        etf_err.append(ticker)
        time.sleep(2)
        continue

    # The holdings table is the first parsed table with a 'Symbol' column.
    # The loop variable is deliberately not called `df`, which is the news
    # DataFrame used by later cells.
    holding_list = None
    if table:
        try:
            page_tables = pd.read_html(StringIO(str(table)))
        except ValueError:
            # read_html raises ValueError when it finds no parsable table.
            page_tables = []
        for page_table in page_tables:
            if 'Symbol' in page_table.columns:
                holding_list = page_table
                break

    etf_info[ticker] = about[0].text
    etf_holdings[ticker] = holding_list

    time.sleep(2)

In [None]:
# Remove every ticker that the ETF crawl stored under etf_info or etf_holdings,
# then display how many tickers are still unresolved
unavailable_ticker = list(set(unavailable_ticker).difference(etf_info, etf_holdings))
len(unavailable_ticker)

만든 dictionary를 pickle로 저장

In [None]:
# with open('./company_describe.pkl','wb') as f:
#     pickle.dump(company_info,f)

# with open('./company_industry.pkl','wb') as f:
#     pickle.dump(company_industry,f)

# with open('./etf_describe.pkl','wb') as f:
#     pickle.dump(etf_info,f)

# with open('./etf_holdings.pkl','wb') as f:
#     pickle.dump(etf_holdings,f)

In [11]:
def dict2df(dic):
    """Convert a ``{key: value}`` mapping into a two-column DataFrame.

    Args:
        dic: Mapping with one scalar value per key.

    Returns:
        DataFrame with one row per entry, in the mapping's order: the key in
        column ``company`` and the value in column ``description``.
    """
    # A Series keeps the keys as its index; reset_index turns that index into
    # a regular first column next to the values.
    table = pd.Series(dic).reset_index()
    table.columns = ['company', 'description']
    return table

In [None]:
# Merge the ETF descriptions into company_info in place; for a ticker present
# in both dicts the ETF description replaces the company one
company_info.update(etf_info)

In [None]:
# Table of descriptions with the columns 'company' and 'description'
df_com = dict2df(company_info)

## Mistral 7B를 이용한 요약

replicate에서 API 발급 필요 - https://replicate.com/ <br>
유료로 전환 가능

In [None]:
# Export the token as REPLICATE_API_TOKEN (presumably where the replicate
# client looks for its credential; confirm against the client docs)
os.environ["REPLICATE_API_TOKEN"] = API

# Empty 'summary_en' column on both tables, filled in by the summarisation step
for frame in (df, df_com):
    frame['summary_en'] = None

In [None]:
def summarize_mistral(article, max_chars=3500):
    """Summarise a text with the Mistral-7B model run through replicate.

    Args:
        article: Text to summarise.  Only its first ``max_chars`` characters
            are placed in the prompt.
        max_chars: Maximum number of characters of ``article`` sent to the
            model.  Defaults to 3500, the limit this function previously
            hard-coded.

    Returns:
        The pieces of the model output joined into one string, or ``""`` when
        ``article`` is not a string or holds only whitespace.  No request is
        made in that case.
    """
    # Nothing to summarise (failed crawl, missing value): skip the API call
    # instead of failing on the slice or spending a request on empty text.
    if not isinstance(article, str) or not article.strip():
        return ""

    output = replicate.run(
        "a16z-infra/mistral-7b-v0.1:3e8a0fb6d7812ce30701ba597e5080689bef8a013e5c6a724fafb108cc2426a0",
        input={"prompt": f"You are analyst. Can you summarize following breifly?\n{article[:max_chars]}"}
    )
    # The output is consumed piece by piece; concatenate the pieces.
    return "".join(output)

In [None]:
# Summarise the news articles.
# Each result is written with df.at[row, column], a single indexing step on
# the frame itself.  The chained form df['summary_en'].loc[i] = ... assigns to
# an intermediate Series, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is enabled.
for i in tqdm(df.index):
    df.at[i, 'summary_en'] = summarize_mistral(df.at[i, 'article'])

# Summarise the company descriptions
for i in tqdm(df_com.index):
    df_com.at[i, 'summary_en'] = summarize_mistral(df_com.at[i, 'description'])