## Collecting Individual Stock Data from Naver Finance
* Data Reference: Naver Finance website

## Load library

In [7]:
import pandas as pd
import numpy as np

## Determining the URLs to collect

In [8]:
# item_code = "005930"
# item_name = "하이브"
# 종목 URL 만들기 #종목코드 # page number
url = 'https://finance.naver.com/item/sise_day.nhn?code=005930&page=3'

## HTTP request using 'requests'
* [Requests: HTTP for Humans™ — Requests documentation](https://requests.readthedocs.io/en/master/)
* [Quickstart — Requests documentation # custom-headers](https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers)

In [9]:
import requests
requests.__version__

'2.24.0'

## To find the 'table' tag using BeautifulSoup

* [Beautiful Soup Documentation — Beautiful Soup 4.9.0 documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

In [15]:
from bs4 import BeautifulSoup as bs
# import bs4
# bs4.__version__

html = bs(response.text, 'lxml')

In [10]:
tables = html.select("table")
len(tables)

2

In [11]:
import tqdm
tqdm.__version__

'4.50.2'

## Data collection with pandas.

In [16]:
pd.read_html(url, encoding = 'cp949')

ValueError: No tables found

In [20]:
table = pd.read_html(str(tables))
table

[            날짜       종가     전일비       시가       고가       저가         거래량
 0          NaN      NaN     NaN      NaN      NaN      NaN         NaN
 1   2023.04.25  63600.0  1600.0  65300.0  65400.0  63400.0  16193271.0
 2   2023.04.24  65200.0   500.0  65300.0  65700.0  64800.0  12986581.0
 3   2023.04.21  65700.0   400.0  65800.0  65900.0  65400.0  10538622.0
 4   2023.04.20  65300.0   200.0  65100.0  65300.0  64600.0   9501169.0
 5   2023.04.19  65500.0   100.0  65500.0  65800.0  65300.0  10255985.0
 6          NaN      NaN     NaN      NaN      NaN      NaN         NaN
 7          NaN      NaN     NaN      NaN      NaN      NaN         NaN
 8          NaN      NaN     NaN      NaN      NaN      NaN         NaN
 9   2023.04.18  65600.0   300.0  65900.0  66000.0  64800.0  14802060.0
 10  2023.04.17  65300.0   200.0  65000.0  65600.0  64700.0  13486618.0
 11  2023.04.14  65100.0  1000.0  66600.0  66600.0  65000.0  16176490.0
 12  2023.04.13  66100.0   100.0  65600.0  66100.0  65400.0  150

In [22]:
table[0]

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,,,,,,,
1,2023.04.25,63600.0,1600.0,65300.0,65400.0,63400.0,16193271.0
2,2023.04.24,65200.0,500.0,65300.0,65700.0,64800.0,12986581.0
3,2023.04.21,65700.0,400.0,65800.0,65900.0,65400.0,10538622.0
4,2023.04.20,65300.0,200.0,65100.0,65300.0,64600.0,9501169.0
5,2023.04.19,65500.0,100.0,65500.0,65800.0,65300.0,10255985.0
6,,,,,,,
7,,,,,,,
8,,,,,,,
9,2023.04.18,65600.0,300.0,65900.0,66000.0,64800.0,14802060.0


In [32]:
# emoving rows with missing values.
temp = table[0].dropna()
temp

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2023.04.25,63600.0,1600.0,65300.0,65400.0,63400.0,16193271.0
2,2023.04.24,65200.0,500.0,65300.0,65700.0,64800.0,12986581.0
3,2023.04.21,65700.0,400.0,65800.0,65900.0,65400.0,10538622.0
4,2023.04.20,65300.0,200.0,65100.0,65300.0,64600.0,9501169.0
5,2023.04.19,65500.0,100.0,65500.0,65800.0,65300.0,10255985.0
9,2023.04.18,65600.0,300.0,65900.0,66000.0,64800.0,14802060.0
10,2023.04.17,65300.0,200.0,65000.0,65600.0,64700.0,13486618.0
11,2023.04.14,65100.0,1000.0,66600.0,66600.0,65000.0,16176490.0
12,2023.04.13,66100.0,100.0,65600.0,66100.0,65400.0,15091022.0
13,2023.04.12,66000.0,100.0,65800.0,66200.0,65300.0,15021313.0


## Creating a function for collecting data on a page-by-page basis.

In [25]:
print(url)

https://finance.naver.com/item/sise_day.nhn?code=005930&page=3


In [26]:
headers

{'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}

In [27]:

def get_day_list(item_code, page_no):

    url = f"https://finance.naver.com/item/sise_day.nhn?code={item_code}&page={page_no}"
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36'}
    response = requests.get(url, headers = headers)
    html = bs(response.text, 'lxml')
    tables = html.select("table")
    table =  pd.read_html(str(tables))
    df_page = table[0].dropna()
    return df_page

In [31]:

page_no = 22
item_code = '068270'
get_day_list(item_code, page_no)

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2022.07.20,184000.0,500.0,185000.0,186500.0,183000.0,242433.0
2,2022.07.19,183500.0,500.0,184000.0,186500.0,182500.0,223541.0
3,2022.07.18,183000.0,3500.0,187500.0,187500.0,180000.0,366865.0
4,2022.07.15,186500.0,2500.0,189500.0,189500.0,183000.0,389861.0
5,2022.07.14,189000.0,1000.0,190000.0,190500.0,186500.0,420165.0
9,2022.07.13,190000.0,2500.0,188500.0,193000.0,187500.0,513629.0
10,2022.07.12,187500.0,1500.0,187500.0,191500.0,185000.0,423296.0
11,2022.07.11,186000.0,3500.0,181500.0,187500.0,181500.0,344098.0
12,2022.07.08,182500.0,5000.0,187000.0,187000.0,178500.0,755595.0
13,2022.07.07,187500.0,1000.0,188000.0,189000.0,184500.0,369149.0


## Collecting complete date data using a loop.
* (Caution) When collecting data over a long period, please use time.sleep() to avoid overloading the server.

In [47]:
# web page Start# End#
start_no = 1
end_no = 10

item_list = []

for page_no in range(start_no, end_no): # range(1,10): # page 1-9
    temp = get_day_list(item_code, page_no)
    item_list.append(temp)
item_list

[            날짜        종가     전일비        시가        고가        저가       거래량
 1   2023.05.25  173100.0  5900.0  178100.0  179000.0  173100.0  567187.0
 2   2023.05.24  179000.0   500.0  179100.0  180900.0  177500.0  454223.0
 3   2023.05.23  179500.0  1600.0  178400.0  180000.0  177500.0  348447.0
 4   2023.05.22  177900.0  4000.0  174000.0  179400.0  174000.0  511523.0
 5   2023.05.19  173900.0  1800.0  172100.0  174400.0  170800.0  383940.0
 9   2023.05.18  172100.0   300.0  171900.0  174500.0  170600.0  339923.0
 10  2023.05.17  171800.0  3700.0  167500.0  171900.0  167500.0  326890.0
 11  2023.05.16  168100.0   500.0  168000.0  171000.0  167600.0  285565.0
 12  2023.05.15  168600.0  2000.0  165500.0  170400.0  164500.0  403134.0
 13  2023.05.12  166600.0  2000.0  167800.0  169800.0  166400.0  301870.0,
             날짜        종가     전일비        시가        고가        저가        거래량
 1   2023.05.11  168600.0   400.0  170000.0  170800.0  167200.0   390534.0
 2   2023.05.10  169000.0  2800.0  

In [61]:

item_code = '352820'
item_name = '하이브'

page_no =1

while True: 
    #수집함수
    temp =get_day_list(item_code,page_no)
    item_list.append(temp)
    
    #서버에 부담을 주지 않기 위해 시간차를 두고 가져옵니다.
    random_time = np.random.uniform(0.1, 0.5)
    time.sleep(random_time)
    
    # 페이지 번호를 하나씩 증가 시키고
    page_no = page_no +1
    
    # 5페이지 단위로 몇 페이지인지를 출력
    if page_no % 5 == 0 :
        print(page_no)
    
    # 일별시세는 10개씩 행이 있기 때문에 행이 10개보다 적으면 마지막 페이지로 보고 break
    if temp.shape[0] != 10:
        break

5
10
15
20
25
30
35
40
45
50
55
60
65




<img src="https://pandas.pydata.org/docs/_images/merging_concat_basic.png">

* [Merge, join, concatenate and compare documentation](https://pandas.pydata.org/docs/user_guide/merging.html#merge-join-concatenate-and-compare)

In [48]:

df = pd.concat(item_list)  #행이 기본이어서, axis를 따로 안해줘도 됨 

In [49]:

df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2023.05.25,173100.0,5900.0,178100.0,179000.0,173100.0,567187.0
2,2023.05.24,179000.0,500.0,179100.0,180900.0,177500.0,454223.0
3,2023.05.23,179500.0,1600.0,178400.0,180000.0,177500.0,348447.0
4,2023.05.22,177900.0,4000.0,174000.0,179400.0,174000.0,511523.0
5,2023.05.19,173900.0,1800.0,172100.0,174400.0,170800.0,383940.0


In [50]:
df.tail()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
9,2023.01.19,164000.0,0.0,162500.0,166000.0,162000.0,195569.0
10,2023.01.18,164000.0,500.0,163500.0,164500.0,162000.0,211709.0
11,2023.01.17,163500.0,4500.0,168000.0,168500.0,163000.0,330668.0
12,2023.01.16,168000.0,1000.0,170000.0,170000.0,168000.0,154591.0
13,2023.01.13,169000.0,1000.0,169000.0,172000.0,168500.0,278561.0


## Add stock codes and stock names to a DataFrame and create derived variables

In [51]:
df["종목코드"] = item_code
df["종목명"]= item_name

In [52]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종목코드,종목명
1,2023.05.25,173100.0,5900.0,178100.0,179000.0,173100.0,567187.0,68270,SK바이오팜
2,2023.05.24,179000.0,500.0,179100.0,180900.0,177500.0,454223.0,68270,SK바이오팜
3,2023.05.23,179500.0,1600.0,178400.0,180000.0,177500.0,348447.0,68270,SK바이오팜
4,2023.05.22,177900.0,4000.0,174000.0,179400.0,174000.0,511523.0,68270,SK바이오팜
5,2023.05.19,173900.0,1800.0,172100.0,174400.0,170800.0,383940.0,68270,SK바이오팜


## Change the column order in a DataFrame

In [54]:
df.columns

Index(['날짜', '종가', '전일비', '시가', '고가', '저가', '거래량', '종목코드', '종목명'], dtype='object')

In [12]:
#컬럼 순서를 인덱싱 방법으로 바꿔준다.
cols = ['종목코드', '종목명','날짜', '종가', '전일비', '시가', '고가', '저가', '거래량']
df[cols]

# df에 다시 담아주기
df=df[cols]

df.head()

NameError: name 'df' is not defined

## Removing duplicate data.

In [57]:

print(df.shape) 
df = df.drop_duplicates()
df.shape 

(90, 9)


(90, 9)

## Calculating descriptive statistics

In [58]:

df.describe()

Unnamed: 0,종가,전일비,시가,고가,저가,거래량
count,90.0,90.0,90.0,90.0,90.0,90.0
mean,161944.444444,2265.555556,161893.333333,164272.222222,159978.888889,503616.8
std,8668.904926,2235.774562,8652.151775,8717.978683,8642.766599,262424.8
min,143700.0,0.0,144000.0,145800.0,142500.0,154591.0
25%,155400.0,600.0,154550.0,157700.0,153100.0,339545.8
50%,162450.0,1500.0,162750.0,164550.0,161150.0,414744.5
75%,168175.0,3175.0,167725.0,170350.0,166475.0,608737.2
max,179800.0,9400.0,180200.0,184100.0,177700.0,1467217.0


## Retrieve the most recent date and create a filename using

In [79]:
df.head(1)

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종목코드,종목명
1,2021.07.28,262500.0,1000.0,263500.0,265000.0,262000.0,40790.0,352820,하이브


In [81]:

date = df.iloc[0, 0]
date

'2021.07.28'

In [82]:

file_name = f"{item_name}_{item_code}_{date}.csv"
file_name

'하이브_352820_2021.07.28.csv'

## Create a single function that encompasses the entire process

In [11]:

def get_day_list(item_code, page_no):
"""
일자별 시세를 페이지별로 수집
""" 

# 반복문
# 전처리
    df = pd.concat



# 파일저장
    for page_no in range(start_no,end_no+1):
        temp = get_day_list(item_code, page_no)
        item_list.append(temp)
        
    df = pd.concat(item_list)
    df['종목코드'] = item_code
    df['종목명']= item_name
    cols = ['종목코드', '종목명','날짜', '종가', '전일비', '시가', '고가', '저가', '거래량']
    df = df[cols]
    df = df.drop_duplicates()
    file_name = f"{item_name}_{item_code}_{date}.csv"
    
    df.to_csv(file_name, index =False)
    

IndentationError: expected an indented block (<ipython-input-11-82a8c4102368>, line 4)