In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import urllib.parse
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

# Scrape Naver Finance's market-cap listing and save the first MAX_COMPANIES
# companies whose PER cell is not 'N/A' to a CSV of (company, code).
url = 'https://finance.naver.com/sise/sise_market_sum.nhn?&page='

MAX_COMPANIES = 200      # how many companies to keep
PER_COLUMN = 11          # index of the PER cell within a listing row
REQUEST_TIMEOUT = 10     # seconds; without a timeout a stalled connection hangs the cell

company_data = []
count = 0

for page in range(1, 40):
    # Stop paging as soon as enough companies were collected; the break inside
    # the row loop only leaves that loop, so without this check every remaining
    # page would still be downloaded for nothing.
    if count >= MAX_COMPANIES:
        break

    res = requests.get(url + str(page), timeout=REQUEST_TIMEOUT)
    # The page is decoded as EUC-KR; undecodable bytes are replaced, not fatal.
    html = res.content.decode('euc-kr', 'replace')
    soup = BeautifulSoup(html, 'lxml')
    stock_table = soup.find('table', attrs={'class': 'type_2'})
    if stock_table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(f"listing table 'type_2' not found on page {page}")
    stock_rows = stock_table.find_all('tr', onmouseover=True)

    for row in stock_rows:
        if count >= MAX_COMPANIES:
            break

        stock_data = row.find_all('td')

        # The row must actually contain the PER cell before it is indexed.
        if len(stock_data) <= PER_COLUMN:
            continue

        per = stock_data[PER_COLUMN].text.strip()
        if per == 'N/A':
            continue

        link = stock_data[1].find('a')
        if link is None or not link.get('href'):
            # No company link in the name cell, so no code can be extracted.
            continue

        company_name = stock_data[1].text.strip()
        company_code = link['href'].split('code=')[-1]
        # Zero-pad to 6 characters and prefix an apostrophe so the code is
        # stored as text in the CSV rather than as a number.
        company_code = "'" + company_code.zfill(6)

        company_data.append([company_name, company_code, per])
        count += 1

# PER is only used as a filter above; it is not written to the CSV.
df = pd.DataFrame(data=company_data, columns=['company', 'code', 'PER'])
df = df.drop('PER', axis=1)
df.to_csv('kopsi_200_stocks.csv', index=False, encoding='utf-8-sig')
print(df)

      company     code
0        삼성전자  '005930
1      SK하이닉스  '000660
2    LG에너지솔루션  '373220
3    삼성바이오로직스  '207940
4         현대차  '005380
..        ...      ...
195   LX인터내셔널  '001120
196     SK케미칼  '285130
197      롯데렌탈  '089860
198      일진전기  '103590
199     코오롱인더  '120110

[200 rows x 2 columns]


In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import urllib.parse
import os
from concurrent.futures import ThreadPoolExecutor, as_completed

def collect_news_for_date(search_query, code, date, max_titles=1):
    """Fetch news titles from Naver Finance news search for one query and day.

    Args:
        search_query: Text to search for; also stored as 'company' in each record.
        code: Value copied unchanged into the 'code' field of each record.
        date: Date string used as both stDateStart and stDateEnd in the URL.
        max_titles: Maximum number of titles to keep (default 1, the original
            behaviour). Pass None to keep every title found on the page.

    Returns:
        A list of dicts with keys 'company', 'code', 'Date' and 'Title'.
        The list is empty when the page contains no matching entries.

    Raises:
        UnicodeEncodeError: If search_query cannot be encoded as EUC-KR.
        requests.RequestException: On connection errors or timeout.
    """
    news_list = []
    # Percent-encode the query from its EUC-KR bytes.
    encoded_query = urllib.parse.quote(search_query, encoding='euc-kr')
    url = f"https://finance.naver.com/news/news_search.naver?q={encoded_query}&sm=all.basic&pd=1&stDateStart={date}&stDateEnd={date}"
    # A timeout keeps one stalled connection from blocking its worker forever.
    response = requests.get(url, timeout=10)
    response.encoding = 'euc-kr'
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('dd', class_='articleSubject')[:max_titles]
    for article in articles:
        link = article.find('a')
        if link is None:
            # Entry without a link carries no title to extract; skip it.
            continue
        title = link.text.strip()
        news_list.append({'company': search_query, 'code': code, 'Date': date, 'Title': title})
    return news_list

def collect_and_save_all_news(df, start_date, end_date, max_workers=10):
    """Collect news titles for every company in df and write them to all_news.csv.

    For each row of df, one collect_news_for_date task is submitted for
    start_date and then every 7th day after it, up to and including end_date.
    Tasks run on a thread pool; results are gathered in submission order so the
    CSV row order is the same on every run.

    A task that raises is skipped and counted instead of aborting the whole
    run, so one failed request no longer discards everything already fetched.

    Args:
        df: DataFrame with 'company' and 'code' columns.
        start_date: First date to query, formatted 'YYYY-MM-DD'.
        end_date: Last date to consider, formatted 'YYYY-MM-DD'.
        max_workers: Thread pool size.

    Returns:
        None. The collected records are written to 'all_news.csv'.

    Raises:
        ValueError: If start_date or end_date does not match 'YYYY-MM-DD'.
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')
    all_news = []
    failures = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Maps each future to what it was asked for, in submission order.
        pending = {}
        for _, row in df.iterrows():
            company = row['company']
            code = row['code']
            current_date = first_day
            while current_date <= last_day:
                day = current_date.strftime('%Y-%m-%d')
                future = executor.submit(collect_news_for_date, company, code, day)
                pending[future] = (company, day)
                current_date += timedelta(days=7)  # step in 7-day intervals

        for future, (company, day) in pending.items():
            try:
                news = future.result()
            except Exception as exc:  # keep going; a single bad request must not lose the run
                failures.append((company, day, exc))
                continue
            if news:
                all_news.extend(news)

    if failures:
        print(f"{len(failures)} request(s) failed and were skipped; first few:")
        for company, day, exc in failures[:5]:
            print(f"  {company} {day}: {exc!r}")

    if all_news:
        news_df = pd.DataFrame(all_news)
    else:
        # Still write a header row so the file can be read back when nothing was found.
        news_df = pd.DataFrame(columns=['company', 'code', 'Date', 'Title'])
    news_df.to_csv('all_news.csv', index=False, encoding='utf-8-sig')

# Load the company list written by the scraping cell. Codes are read as text so
# zero-padded values are never reinterpreted as integers, whether or not they
# carry the apostrophe prefix.
df = pd.read_csv('kopsi_200_stocks.csv', dtype={'code': str})
# One search per company every 7 days from 2021-01-01 through 2023-12-31.
collect_and_save_all_news(df, '2021-01-01', '2023-12-31', max_workers=20)