In [70]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import matplotlib.pyplot as plt
import urllib
from pykrx import stock

In [71]:
# today's 시가총액 Top 300 code lists

df = stock.get_market_cap_by_ticker(datetime.date.today().strftime('%Y%m%d'))
df_t50 = df.sort_values('시가총액', ascending=False).head(300) # 50
code_lists = df_t50.index.tolist()

# code_lists

In [72]:
# today's 시가총액 Top 50 code lists covert to stock name lists

stock_name_lists = []
for ticker in code_lists:
    stock_name_lists.append(stock.get_market_ticker_name(ticker))
    
# stock_lists

In [73]:
# Build name code index

code_names = dict(zip(code_lists, stock_name_lists))
# code_names

In [74]:
# 주가데이터 with next day price up and down indicator

fail_count = 0
ok_count = 0

start = "20180701" 
end = "20210716" 

for code in code_lists:
  try:
    result = stock.get_market_ohlcv_by_date(start, end, code)
    print ("Read data for " + code + " " + repr(result.shape))
    ok_count += 1
  except:
    print ("Could not read data for " + code)
    fail_count += 1

  print(str(ok_count) + " loads, " + str(fail_count) + " failures")                               # 결측치 제거

  # 새로운 칼럼 생성
  # (Price : 당일 대비 다음날 주가가 상승했으면 1, 하락했으면 0 표시)
  result['UpDown'] = 0
  for i in range(0, result.shape[0]-1):
      if result['종가'][i] < result['종가'][i+1]:
          result['UpDown'][i] = 1
      else:
          result['UpDown'][i] = 0

  # 파일 저장
  result.to_csv(code+'_주가데이터.csv')

Read data for 005930 (752, 5)
1 loads, 0 failures


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['UpDown'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['UpDown'][i] = 0


Read data for 000660 (752, 5)
2 loads, 0 failures
Read data for 035420 (752, 5)
3 loads, 0 failures
Read data for 035720 (752, 5)
4 loads, 0 failures
Read data for 207940 (752, 5)
5 loads, 0 failures
Read data for 005935 (752, 5)
6 loads, 0 failures
Read data for 051910 (752, 5)
7 loads, 0 failures
Read data for 006400 (752, 5)
8 loads, 0 failures
Read data for 005380 (752, 5)
9 loads, 0 failures
Read data for 068270 (752, 5)
10 loads, 0 failures
Read data for 000270 (752, 5)
11 loads, 0 failures
Read data for 005490 (752, 5)
12 loads, 0 failures
Read data for 066570 (752, 5)
13 loads, 0 failures
Read data for 051900 (752, 5)
14 loads, 0 failures
Read data for 012330 (752, 5)
15 loads, 0 failures
Read data for 028260 (752, 5)
16 loads, 0 failures
Read data for 096770 (752, 5)
17 loads, 0 failures
Read data for 017670 (752, 5)
18 loads, 0 failures
Read data for 105560 (752, 5)
19 loads, 0 failures
Read data for 055550 (752, 5)
20 loads, 0 failures
Read data for 034730 (752, 5)
21 loads,

In [75]:
# Read 주가데이터 per stock code and returns each day's up or down

def get_upDownDays(code):
  df_down = []
  df_up = []

  price_data = pd.read_csv(code+'_주가데이터.csv')

  df_down = price_data[price_data['UpDown']==0]['날짜']   
  df_up = price_data[price_data['UpDown']==1]['날짜']

  return df_up, df_down

In [84]:


def naver_news_title(stock, dates):
  result_list = []
  error_cnt = 0

  stock_name = code_names.get(stock)
  stock_encode = urllib.parse.quote(stock_name, encoding='euc-kr')


  base_url = 'https://finance.naver.com/news/news_search.nhn?rcdate=1&q={}&sm=title.basic&pd=4&stDateStart={}&stDateEnd={}'
  headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

  for date in dates:
    url = base_url.format(stock_encode, date, date)
    res = requests.get(url, headers=headers)

    # print("page: ", page)
    if res.status_code == 200:
      soup = BeautifulSoup(res.text)
      title_list = soup.select('.articleSubject')
      for title in title_list:
        try:
          news_title = title.select_one('a').text.strip()
          # news_title = news_title.replace(stock_name, '')
          # print("add: ", news_title)
          result_list.append([news_title])
        except:
          error_cnt += 1
  
  return result_list

In [85]:
headline_count = pd.DataFrame(columns=['code', 'name', 'counts'])

for code in code_lists:
  date_up, date_down = get_upDownDays(code)

  result_list = naver_news_title(code, date_up)
  title_df_up = pd.DataFrame(result_list, columns=['뉴스제목'])
  title_df_up['주가변동'] = 1

  result_list = naver_news_title(code, date_down)
  title_df_down = pd.DataFrame(result_list, columns=['뉴스제목'])
  title_df_down['주가변동'] = 0

  title_df = pd.concat([title_df_up, title_df_down])
  title_df.to_csv(code+'_뉴스타이틀.tsv', index=False, encoding='utf-8', sep="\t")

  stock_name = code_names.get(code)
  new_row = {'code':code, 'name':stock_name, 'counts':title_df.shape[0]}
  headline_count = headline_count.append(new_row, ignore_index=True) 
headline_count.to_csv('news_headline_counts.csv')
# headline_count

In [78]:
# headline_count = pd.DataFrame(columns=['code', 'name', 'counts'])
# count = 1
# for code in code_lists:
#     stock_name = code_names.get(code)
#     new_row = {'code':code, 'name':stock_name, 'counts':count}
#     headline_count = headline_count.append(new_row, ignore_index=True) 
#     count += 1
# headline_count.to_csv('news_headline_counts.csv')
# headline_count

In [86]:

all_headlines = pd.DataFrame()

for code in code_lists:
  title_df = pd.read_csv(code+'_뉴스타이틀.tsv', sep="\t")
  all_headlines = pd.concat([all_headlines, title_df])

# shuffle the DataFrame rows
all_headlines = all_headlines.sample(frac = 1)
all_headlines.to_csv('all_headlines.tsv', index=False, encoding='utf-8', sep="\t")

In [87]:
all_headlines

Unnamed: 0,뉴스제목,주가변동
478,"LG생활건강, ‘숨37° 로시크숨마 황제의 빛 에디션’ 선봬",1
836,"[IR]SK이노베이션 ""내년 시설투자, 올해보다 낮은 수준""",1
1984,"수출입銀, CJ대한통운 해외채권 발행 공동 보증",0
1192,브랜드 아파트 700만원대에 마련할 마지막 기회! ‘사천 KCC스위첸,1
12167,"LG전자, 뉴욕 한복판에서 영화 상영한 사연",0
...,...,...
123,"한국항공우주, 인니에 269억원 규모 KT-1B 훈련기 공급 계약",1
2136,'하이트진로 3세' 박태영 사장 승진,1
1075,[특징주] '형제의 난' 재점화 가능성에...롯데지주 강세,0
610,"제일기획, 비상교육과 청소년 대상 '마음 교과서' 개발",1
