<a href="https://colab.research.google.com/github/freud-sensei/crawling_tutorial/blob/main/%5B%EC%93%B1%EC%8B%B9%EC%93%B1%EC%8B%B9%5D_%EB%B3%B5%EA%B6%8C_%EC%A0%95%EB%B3%B4_%ED%81%AC%EB%A1%A4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup

https://www.teddynote.com/python/lotto/

# 웹사이트 가져오기

In [4]:
# Download the result page for one draw and parse it into a BeautifulSoup tree.
count = 10
url = f"https://dhlottery.co.kr/gameResult.do?method=byWin&drwNo={count}"
# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')

In [6]:
# Draw date: look up the p element with class "desc" in the parsed page.
soup.find('p', class_='desc')

<p class="desc">(2003년 02월 08일 추첨)</p>

In [7]:
# Extract only the date text (.text drops the surrounding p tag).
soup.find('p', class_='desc').text

'(2003년 02월 08일 추첨)'

In [8]:
from datetime import datetime
# Parse the "(YYYY년 MM월 DD일 추첨)" caption into a datetime object.
desc_text = soup.find('p', class_='desc').text
date = datetime.strptime(desc_text, '(%Y년 %m월 %d일 추첨)')
date

datetime.datetime(2003, 2, 8, 0, 0)

In [9]:
# Winning numbers: look up the div with class "num win" that wraps the main balls.
soup.find('div', class_='num win')

<div class="num win">
<strong>당첨번호</strong>
<p>
<span class="ball_645 lrg ball1">9</span>
<span class="ball_645 lrg ball3">25</span>
<span class="ball_645 lrg ball3">30</span>
<span class="ball_645 lrg ball4">33</span>
<span class="ball_645 lrg ball5">41</span>
<span class="ball_645 lrg ball5">44</span>
</p>
</div>

In [15]:
# The p element's text carries one winning number per line; convert each to int.
win_text = soup.find('div', class_='num win').find('p').text
numbers = list(map(int, win_text.strip().split('\n')))
numbers

[9, 25, 30, 33, 41, 44]

In [19]:
# Bonus number: the text of the p element inside the div with class "num bonus".
bonus_tag = soup.find('div', class_='num bonus').find('p')
bonus = int(bonus_tag.text.strip())
bonus

6

In [24]:
# Prize table for ranks 1 through 5
from io import StringIO

import pandas as pd

# Passing a literal HTML string to read_html is deprecated (pandas 2.1+), so the
# markup is wrapped in a file-like StringIO object instead.
prize_table_html = str(soup.find('table', class_='tbl_data'))
df = pd.read_html(StringIO(prize_table_html))[0]
df

Unnamed: 0,순위,등위별 총 당첨금액,당첨게임 수,1게임당 당첨금액,당첨기준,비고
0,1등,"83,595,692,700원",13,"6,430,437,900원",당첨번호 6개 숫자일치,
1,2등,"9,631,962,400원",236,"40,813,400원",당첨번호 5개 숫자일치 +보너스 숫자일치,
2,3등,"9,631,930,800원",11247,"856,400원",당첨번호 5개 숫자일치,
3,4등,"19,198,288,200원",703234,"27,300원",당첨번호 4개 숫자일치,
4,5등,"34,108,460,000원",3410846,"10,000원",당첨번호 3개 숫자일치,


# 크롤링 함수 생성

In [32]:
import re
# Request the result page without a drwNo parameter and read the draw number it shows.
url = "https://dhlottery.co.kr/gameResult.do?method=byWin"
# A timeout prevents an unresponsive server from blocking the cell forever;
# raise_for_status() fails loudly on HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
# Keep only the digits of the strong element's text inside div.win_result.
max_count = int(re.sub(r"\D", "", soup.find('div', class_="win_result").find('strong').text))
max_count

1104

In [34]:
def crawling_lotto(count="recent"):
  """Fetch one lotto result page and parse the draw it shows.

  Args:
    count: Draw number to request, or "recent" (default) to request the
      result page without a drwNo parameter.

  Returns:
    dict with keys 'date' (datetime), 'numbers' (list of int) and
    'bonus' (int).

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  if count == "recent":
    url = "https://dhlottery.co.kr/gameResult.do?method=byWin"
  else:
    url = f"https://dhlottery.co.kr/gameResult.do?method=byWin&drwNo={count}"
  # Without a timeout a stalled connection would block the notebook forever.
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'lxml')

  date_text = soup.find('p', class_='desc').text.strip()
  date = datetime.strptime(date_text, '(%Y년 %m월 %d일 추첨)')
  # split() with no argument tolerates any whitespace layout between the
  # numbers (blank lines, CRLF, spaces), not just single newlines.
  win_text = soup.find('div', class_='num win').find('p').text
  numbers = [int(token) for token in win_text.split()]
  bonus = int(soup.find('div', class_='num bonus').find('p').text.strip())

  return {
      'date': date, 'numbers': numbers, 'bonus': bonus
  }

# Try the function on a specific draw (number 37) and keep the parsed dict.
result = crawling_lotto(37)

In [35]:
# Display the dict returned by crawling_lotto(37).
result

{'date': datetime.datetime(2003, 8, 16, 0, 0),
 'numbers': [7, 27, 30, 33, 35, 37],
 'bonus': 42}

In [38]:
# Call without an argument so the function's default `count` value is used.
crawling_lotto()

{'date': datetime.datetime(2024, 1, 27, 0, 0),
 'numbers': [1, 7, 21, 30, 35, 38],
 'bonus': 2}

# 자동화 코드

In [41]:
import warnings
import requests
from datetime import datetime
from tqdm.auto import tqdm
import pandas as pd
from bs4 import BeautifulSoup
warnings.filterwarnings('ignore')
import re

def max_count():
  """Return the draw number shown on the default lotto result page.

  Requests the result page without a drwNo parameter, reads the strong
  element inside div.win_result and keeps only its digits.

  Returns:
    int: the digits of that heading (presumably the most recent draw number).

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  url = "https://dhlottery.co.kr/gameResult.do?method=byWin"
  # Without a timeout a stalled connection would block the notebook forever.
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'lxml')
  heading = soup.find('div', class_="win_result").find('strong').text
  # The local name deliberately differs from the function name to avoid shadowing it.
  latest_draw = int(re.sub(r"\D", "", heading))
  return latest_draw

def crawling_lotto(count="recent"):
  """Fetch one lotto result page and return it as a flat, DataFrame-ready row.

  Args:
    count: Draw number to request, or "recent" (default) to request the
      result page without a drwNo parameter. The default keeps zero-argument
      calls working after this definition replaces an earlier one.

  Returns:
    dict with keys 'date' (datetime), 'num1' .. 'num6' (int) and 'bonus' (int).

  Raises:
    requests.HTTPError: if the server answers with an error status.
    IndexError: if fewer than six winning numbers are found on the page.
  """
  if count == "recent":
    url = "https://dhlottery.co.kr/gameResult.do?method=byWin"
  else:
    url = f"https://dhlottery.co.kr/gameResult.do?method=byWin&drwNo={count}"
  # Without a timeout a stalled connection would block the whole crawl loop.
  response = requests.get(url, timeout=10)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'lxml')

  date_text = soup.find('p', class_='desc').text.strip()
  date = datetime.strptime(date_text, '(%Y년 %m월 %d일 추첨)')
  # split() with no argument tolerates any whitespace layout between the
  # numbers (blank lines, CRLF, spaces), not just single newlines.
  win_text = soup.find('div', class_='num win').find('p').text
  numbers = [int(token) for token in win_text.split()]
  bonus = int(soup.find('div', class_='num bonus').find('p').text.strip())
  return {
      'date': date, 'num1': numbers[0], 'num2': numbers[1],
      'num3': numbers[2], 'num4': numbers[3], 'num5': numbers[4],
      'num6': numbers[5], 'bonus': bonus
  }

# NOTE: this rebinds `max_count` from the function to its integer result, exactly
# as before; the function is redefined above whenever this cell is re-run.
max_count = max_count()
columns = ['date', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'bonus']

# Crawl the latest draw and the 20 draws before it (never going below draw 1).
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call, so the rows are collected in a list and the frame is built once.
records = []
for i in tqdm(range(max(1, max_count - 20), max_count + 1)):
  result = crawling_lotto(i)
  records.append(result)
data = pd.DataFrame(records, columns=columns)

data

  0%|          | 0/21 [00:00<?, ?it/s]

Unnamed: 0,date,num1,num2,num3,num4,num5,num6,bonus
0,2023-09-09,8,12,13,29,33,42,5
1,2023-09-16,4,7,17,18,38,44,36
2,2023-09-23,11,16,25,27,35,36,37
3,2023-09-30,13,14,18,21,34,44,16
4,2023-10-07,11,21,22,30,39,44,31
5,2023-10-14,4,18,31,37,42,43,40
6,2023-10-21,12,19,21,29,40,45,1
7,2023-10-28,6,20,23,24,28,30,44
8,2023-11-04,7,18,19,26,33,45,37
9,2023-11-11,10,17,22,30,35,43,44
