In [1]:
from bs4 import BeautifulSoup as bs
import requests
import datetime, time
import pickle

import pandas as pd
import numpy as np

import os
import sys
import io
import shutil

import matplotlib.pyplot as plt

In [2]:
# Toy 3x3 frame used by the scratch cells below to try out pandas idioms.
col = ['col1', 'col2', 'col3']
row = ['row1', 'row2', 'row3']
data = [
    [1, 1, 1],
    [2, 3, 4],
    [5, 3, 6],
]
df = pd.DataFrame(data, index=row, columns=col)

In [3]:
# Scratch check: last value of a column, taken from a plain Python list.
list(df['col1'])[-1]

5

In [4]:
# Scratch check: does the column contain the value 1 anywhere?
df['col1'].isin([1]).any()

True

In [5]:
# Display the toy frame.
df

Unnamed: 0,col1,col2,col3
row1,1,1,1
row2,2,3,4
row3,5,3,6


In [6]:
# cloudscraper: module for getting past Cloudflare "403 Forbidden" errors
# (install with: pip install cloudscraper).
import cloudscraper
scraper = cloudscraper.create_scraper()

In [7]:
# # !pip install cfscrape # module for working around 403 Forbidden / Cloudflare errors
# import cfscrape
# scraper = cfscrape.create_scraper()
# # From here on, use scraper instead of requests wherever a 403 error occurs

In [8]:
# Browser-like request headers passed to every scraper request.
# The User-Agent is built from adjacent string literals: a backslash
# continuation inside a single literal would embed the next line's leading
# indentation in the header value.
# NOTE(review): the Referer points at kr.investing.com although the requests
# in this notebook go to finance.naver.com — confirm this is intended.
headers = {
    'Referer': 'https://kr.investing.com/',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'),
}

In [9]:
# Make the project's `constants` module importable. The path is relative to the
# kernel's working directory, so this import fails if the notebook is started
# from a different folder.
sys.path.append('../../data/constant')
from constants import COMPANY_CODE

In [10]:
def concat_df(df_o, df, dup_col, sort_col):
    """Append ``df`` to ``df_o``, de-duplicate, sort and renumber the rows.

    Parameters
    ----------
    df_o, df : pandas.DataFrame
        Frames to stack; ``df`` goes after ``df_o``.
    dup_col : str
        Column whose repeated values mark duplicate rows (e.g. "time", "date").
        The last occurrence is kept, so a row from ``df`` replaces a row from
        ``df_o`` carrying the same value.
    sort_col : int
        Position of the column to sort by, ascending.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed 0..n-1; neither input is modified.
    """
    stacked = pd.concat([df_o, df], ignore_index=True)
    sort_key = stacked.columns[sort_col]  # positional column number -> label
    merged = (
        stacked
        .drop_duplicates(subset=[dup_col], keep='last')
        .sort_values(by=[sort_key])
    )
    merged.index = np.arange(0, len(merged))  # renumber rows in ascending order
    return merged

## 일별시세

In [11]:
# Fetch daily stock prices one page (up to 10 rows) at a time
def _parse_close_change(text):
    """Turn a price-change cell such as '하락 1,200' into a signed int (-1200).

    The magnitude is read from the digits in the cell rather than from a fixed
    character offset, so it does not depend on how much whitespace separates
    the label from the number.
    """
    digits = ''.join(ch for ch in text if ch.isdigit())
    magnitude = int(digits)  # ValueError if the cell holds no number at all
    # '하락' = fall. '하한' is meant to cover a limit-down label ('하한가') —
    # presumably what the page emits on such days; TODO confirm against live data.
    # Every other label (보합 = unchanged, 상승 = rise, ...) keeps a non-negative sign.
    is_fall = text.strip().startswith(('하락', '하한'))
    return -magnitude if is_fall else magnitude


def get_piece_date_price(url_d):
    """Download one daily-price page and return it as a typed DataFrame.

    Uses the notebook-level ``scraper`` session and ``headers`` dict.

    Parameters
    ----------
    url_d : str
        Full page URL, including its query string.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (datetime), ``open``, ``high``, ``low``, ``close``,
        ``close_change`` (signed) and ``volume`` (all int), in that order, with
        the rows in the order they appear on the page.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status.
    ValueError
        If the page holds no table with class ``type2``, or a change cell
        contains no number.
    """
    res = scraper.get(url_d, headers=headers)
    # Fail with the HTTP status instead of a confusing "no tables found" later.
    res.raise_for_status()

    tables = pd.read_html(io.StringIO(res.text), attrs={"class": "type2"}, flavor=["lxml", "bs4"])
    # Drop rows containing NaN; copy so the assignments below act on an independent frame.
    df = tables[0].dropna(axis=0).copy()

    df.columns = ['date', 'close', 'close_change', 'open', 'high', 'low', 'volume']  # rename columns
    df['date'] = pd.to_datetime(df['date'], format='%Y.%m.%d')  # 'YYYY.MM.DD' text -> datetime
    df['close_change'] = df['close_change'].apply(_parse_close_change)

    # define variable types
    df = df.astype({name: int for name in ('open', 'high', 'low', 'close', 'volume')})

    return df[['date', 'open', 'high', 'low', 'close', 'close_change', 'volume']]  # rearrange columns

In [12]:
# Merge the 10-row daily pages, newest first, until the wanted start date is covered
def get_date_price(url_base_d, code_com, start_date='2020/1/2 00:00:00', page_size=10):
    """Collect daily prices for one stock from page 1 back to ``start_date``.

    Pages are requested one at a time through ``get_piece_date_price`` and
    merged into a single frame holding one row per date, sorted by date
    ascending and indexed 0..n-1.

    Parameters
    ----------
    url_base_d : str
        Daily-price endpoint, without query string.
    code_com : str
        Stock code placed in the ``code`` query parameter.
    start_date : str, optional
        Earliest date of interest, formatted ``'%Y/%m/%d %H:%M:%S'``.
        Paging stops at the first page that reaches this date or an earlier
        one, so it also stops when the exact date is missing from the data.
        Rows on that final page older than ``start_date`` are kept.
    page_size : int, optional
        Row count of a full page; a shorter page is treated as the last one.

    Returns
    -------
    pandas.DataFrame
        The merged rows, or the empty frame returned by the first request when
        that page holds no rows.
    """
    startdate = datetime.datetime.strptime(start_date, '%Y/%m/%d %H:%M:%S')

    pages = []
    oldest_seen = None
    page_num = 1
    while True:
        url_date = url_base_d + '?code=' + code_com + '&page=' + str(page_num)
        df_p = get_piece_date_price(url_date)

        if df_p.empty:  # nothing on this page -> nothing further to collect
            break

        page_oldest = df_p['date'].min()
        # A page that goes no further back than what is already collected is a
        # repeat — presumably the site serves its last page again once the page
        # number runs past the end (the previous implementation relied on the
        # same behaviour); TODO confirm.
        if oldest_seen is not None and page_oldest >= oldest_seen:
            break

        pages.append(df_p)
        oldest_seen = page_oldest

        if len(df_p) < page_size:  # short page: this was the last one
            break
        if page_oldest <= startdate:  # reached (or passed) the start date
            break
        page_num += 1

    if not pages:
        return df_p

    # One concat at the end; on duplicate dates the row from the later page wins.
    df_base = pd.concat(pages, ignore_index=True)
    df_base = df_base.drop_duplicates(subset=['date'], keep='last')
    df_base = df_base.sort_values(by=['date'])
    df_base.index = np.arange(0, len(df_base))  # renumber rows in ascending order
    return df_base

In [13]:
# Output directory for the per-company files, relative to the working directory
# (alternative location used previously: 'data/naver_finance/date_data').
naver_dir = 'date_data'
os.makedirs(naver_dir, exist_ok=True)  # to_csv / to_pickle do not create a missing directory

url_base = 'https://finance.naver.com/item/sise_day.naver'  # sise_day

# COMPANY_CODE is expected to map stock code -> [display name, short file name], e.g.
#   {'005930': ['삼성전자', 'sec'], '005380': ['현대차', 'hyunmotor'], ...}
# (shape taken from the sample dict this cell used to define — verify against constants.py).
code_dic = COMPANY_CODE

for i, (code, company_name) in enumerate(code_dic.items()):
    df_collect = get_date_price(url_base, code)
    # Build both paths from the same stem instead of str.replace('csv', 'pkl'),
    # which would also rewrite a 'csv' occurring in the directory or company name.
    f_name = f'{naver_dir}/{company_name[1]}.csv'
    df_collect.to_csv(f_name)
    df_collect.to_pickle(f'{naver_dir}/{company_name[1]}.pkl')
    print(i, end=", ")  # progress: index of the company just saved

0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 

In [14]:
# Spot check on the frame left over from the last loop iteration:
# is its final row dated 2024-09-20?
df_collect['date'].tail(1) == '2024-09-20'

898    False
Name: date, dtype: bool

In [15]:
# NOTE(review): `c` is undefined, so this cell raises NameError and halts "Run All" here —
# presumably a deliberate stop marker; confirm, or delete the cell.
c

NameError: name 'c' is not defined

In [None]:
# Preview the first 10 rows of the frame left over from the last loop iteration.
df_collect.head(10)

## 여기까지