# Import Required Libraries
Import the necessary libraries, including requests, BeautifulSoup, pandas, os, and concurrent.futures.

In [None]:
!pip install requests beautifulsoup4 pandas

In [7]:
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from typing import List
import pandas as pd
import os
import concurrent.futures

# Define Data Classes
Define the ArchivePage data class to store the list of URLs and the next page URL.

In [8]:
@dataclass
class ArchivePage:
    archiveList: List[str]
    nextPageUrl: str

# Define Helper Functions
Define the helper functions used for scraping data.

In [17]:
def getArchivePage(url):
    print(f"Page = {url}")
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        nextPageUrl = ""
        lotto = []
        for link in soup.find_all('a', class_='pagination__item--next'):
            nextPageUrl = link.get('href')
            break

        divContent = soup.find(
            'div', class_=['box-cell', 'box-cell--lotto', 'content'])

        for link in divContent.find_all('a'):
            lottoUrl = link.get('href')
            if "/lotto/check/" in lottoUrl:
                lotto.append(lottoUrl)

        return ArchivePage(archiveList=lotto, nextPageUrl=nextPageUrl)
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

def getDate(url):
    d = url.split("/")
    while ("" in d):
        d.remove("")
    dateStr = d[len(d)-1]
    date = dateStr[0:2]
    month = dateStr[2:4]
    year = str(int(dateStr[4:8])-543)
    return f"{year}-{month}-{date}"

def scappingLotto(url, prize_type):
    if url == 'https://news.sanook.com/lotto/check/ผลสลากกินแบ่งรัฐบาลงวดประจำวันที่1สิงหาคม2552/':
        url = 'https://news.sanook.com/lotto/check/01082552/'

    response = requests.get(url)
    date = getDate(url)
    print(f'{date} = {url}')

    row = {
        'date': date,
        prize_type: []
    }

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        if prize_type == 'prize_1st':
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if len(row[prize_type]) == 0:  # Only append if list is empty
                            row[prize_type].append(num.text)
        elif prize_type == 'nearby_1st':
            div = soup.find('div', class_='lottocheck__sec--nearby')
            if div:
                for ele in div.find_all('strong', class_="lotto__number"):
                    row[prize_type].append(ele.text)
        elif prize_type in ['prize_2nd', 'prize_3rd', 'prize_4th', 'prize_5th']:
            sections = soup.find_all('div', class_='lottocheck__sec')
            if sections:
                for section in sections:
                    divs = section.find_all('div', class_='lottocheck__box-item')
                    nums = []
                    for div in divs:
                        for span in div.find_all('span', class_='lotto__number'):
                            nums.append(span.text)
                    row[prize_type] = nums
        elif prize_type == 'prize_2digits':
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if "เลขท้าย 2 ตัว" in col.text:
                            row[prize_type].append(num.text)
        elif prize_type in ['prize_pre_3digit', 'prize_sub_3digits']:
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if "เลขหน้า" in col.text:
                            row['prize_pre_3digit'].append(num.text)
                        elif "เลขท้าย" in col.text:
                            row['prize_sub_3digits'].append(num.text)
            
            # Adjust prize_pre_3digit and prize_sub_3digits if necessary
            if len(row['prize_pre_3digit']) < 2 and len(row['prize_sub_3digits']) > 2:
                row['prize_pre_3digit'] = row['prize_sub_3digits'][:2]
                row['prize_sub_3digits'] = row['prize_sub_3digits'][2:]

            # Ensure both lists have exactly 2 elements
            row['prize_pre_3digit'].extend([''] * (2 - len(row['prize_pre_3digit'])))
            row['prize_sub_3digits'].extend([''] * (2 - len(row['prize_sub_3digits'])))

        return row
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

# getArchivePage Function
Define the getArchivePage function to retrieve the archive page and extract URLs.

In [20]:
def getArchivePage(url):
    print(f"Page = {url}")
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        nextPageUrl = ""
        lotto = []
        for link in soup.find_all('a', class_='pagination__item--next'):
            nextPageUrl = link.get('href')
            break

        divContent = soup.find(
            'div', class_=['box-cell', 'box-cell--lotto', 'content'])

        for link in divContent.find_all('a'):
            lottoUrl = link.get('href')
            if "/lotto/check/" in lottoUrl:
                lotto.append(lottoUrl)

        return ArchivePage(archiveList=lotto, nextPageUrl=nextPageUrl)
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

# getDate Function
Define the getDate function to extract and format the date from the URL.

In [21]:
def getDate(url):
    # Split the URL by "/" and remove empty strings
    d = url.split("/")
    while ("" in d):
        d.remove("")
    
    # Extract the date string from the URL
    dateStr = d[len(d)-1]
    
    # Extract day, month, and year from the date string
    date = dateStr[0:2]
    month = dateStr[2:4]
    year = str(int(dateStr[4:8])-543)  # Convert Buddhist year to Gregorian year
    
    # Return the formatted date string
    return f"{year}-{month}-{date}"

# scappingLotto Functions
Define the scappingLotto functions for each prize category (1st, Nearby 1st, 2nd, 3rd, 4th, 5th, 3-Digit, 2-Digit).

In [22]:
def scappingLotto(url, prize_type):
    if url == 'https://news.sanook.com/lotto/check/ผลสลากกินแบ่งรัฐบาลงวดประจำวันที่1สิงหาคม2552/':
        url = 'https://news.sanook.com/lotto/check/01082552/'

    response = requests.get(url)
    date = getDate(url)
    print(f'{date} = {url}')

    row = {
        'date': date,
        prize_type: []
    }

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        if prize_type == 'prize_1st':
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if len(row[prize_type]) == 0:  # Only append if list is empty
                            row[prize_type].append(num.text)
        elif prize_type == 'nearby_1st':
            div = soup.find('div', class_='lottocheck__sec--nearby')
            if div:
                for ele in div.find_all('strong', class_="lotto__number"):
                    row[prize_type].append(ele.text)
        elif prize_type in ['prize_2nd', 'prize_3rd', 'prize_4th', 'prize_5th']:
            sections = soup.find_all('div', class_='lottocheck__sec')
            if sections:
                for section in sections:
                    divs = section.find_all('div', class_='lottocheck__box-item')
                    nums = []
                    for div in divs:
                        for span in div.find_all('span', class_='lotto__number'):
                            nums.append(span.text)
                    row[prize_type] = nums
        elif prize_type == 'prize_2digits':
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if "เลขท้าย 2 ตัว" in col.text:
                            row[prize_type].append(num.text)
        elif prize_type in ['prize_pre_3digit', 'prize_sub_3digits']:
            columns = soup.find_all('div', class_='lottocheck__column')
            if columns:
                for col in columns:
                    for num in col.find_all('strong'):
                        if "เลขหน้า" in col.text:
                            row['prize_pre_3digit'].append(num.text)
                        elif "เลขท้าย" in col.text:
                            row['prize_sub_3digits'].append(num.text)
            
            # Adjust prize_pre_3digit and prize_sub_3digits if necessary
            if len(row['prize_pre_3digit']) < 2 and len(row['prize_sub_3digits']) > 2:
                row['prize_pre_3digit'] = row['prize_sub_3digits'][:2]
                row['prize_sub_3digits'] = row['prize_sub_3digits'][2:]

            # Ensure both lists have exactly 2 elements
            row['prize_pre_3digit'].extend([''] * (2 - len(row['prize_pre_3digit'])))
            row['prize_sub_3digits'].extend([''] * (2 - len(row['prize_sub_3digits'])))

        return row
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")

# Initialize DataFrames and Variables
Initialize the DataFrames and variables needed for storing the scraped data.

In [23]:
# Initialize DataFrames and Variables

# Define the columns for each DataFrame
columns_1st = ['date', 'prize_1st']
columns_nearby_1st = ['date', 'nearby_1st']
columns_2nd = ['date', 'prize_2nd']
columns_3rd = ['date', 'prize_3rd']
columns_4th = ['date', 'prize_4th']
columns_5th = ['date', 'prize_5th']
columns_2digit = ['date', 'prize_2digits']
columns_3digit = ['date', 'prize_pre_3digit', 'prize_sub_3digits']

# Initialize DataFrames for each prize type
df_1st = pd.DataFrame(columns=columns_1st)
df_nearby_1st = pd.DataFrame(columns=columns_nearby_1st)
df_2nd = pd.DataFrame(columns=columns_2nd)
df_3rd = pd.DataFrame(columns=columns_3rd)
df_4th = pd.DataFrame(columns=columns_4th)
df_5th = pd.DataFrame(columns=columns_5th)
df_2digit = pd.DataFrame(columns=columns_2digit)
df_3digit = pd.DataFrame(columns=columns_3digit)

# Initialize archive URL and header flag
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
file_paths = [
    'prize_1th.csv', 'lotto_prize_1th.parquet',
    'nearby_1st.csv', 'lotto_nearby_1st.parquet',
    'prize_2nd.csv', 'lotto_prize_2nd.parquet',
    'prize_3rd.csv', 'prize_3rd.parquet',
    'prize_4th.csv', 'prize_4th.parquet',
    'prize_5th.csv', 'prize_5th.parquet',
    'prize_2digit.csv', 'prize_2digit.parquet',
    'prize_3digit.csv', 'prize_3digit.parquet'
]

for file_path in file_paths:
    if os.path.exists(file_path):
        os.remove(file_path)

# Main Loop for Scraping Data
Define the main loop that iterates through the archive pages and scrapes data for each prize category.

In [24]:
# Main Loop for Scraping Data

# Define the main loop that iterates through the archive pages and scrapes data for each prize category
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_1st = list(executor.map(lambda url: scappingLotto(url, 'prize_1st'), archive.archiveList))
        results_nearby_1st = list(executor.map(lambda url: scappingLotto(url, 'nearby_1st'), archive.archiveList))
        results_2nd = list(executor.map(lambda url: scappingLotto(url, 'prize_2nd'), archive.archiveList))
        results_3rd = list(executor.map(lambda url: scappingLotto(url, 'prize_3rd'), archive.archiveList))
        results_4th = list(executor.map(lambda url: scappingLotto(url, 'prize_4th'), archive.archiveList))
        results_5th = list(executor.map(lambda url: scappingLotto(url, 'prize_5th'), archive.archiveList))
        results_2digit = list(executor.map(lambda url: scappingLotto(url, 'prize_2digits'), archive.archiveList))
        results_3digit = list(executor.map(lambda url: scappingLotto(url, 'prize_pre_3digit'), archive.archiveList))

    for new_row in results_1st:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_1st = pd.concat([df_1st, newDf], ignore_index=True)
            newDf.to_csv('prize_1th.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_nearby_1st:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_nearby_1st = pd.concat([df_nearby_1st, newDf], ignore_index=True)
            newDf.to_csv('nearby_1st.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_2nd:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_2nd = pd.concat([df_2nd, newDf], ignore_index=True)
            newDf.to_csv('prize_2nd.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_3rd:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_3rd = pd.concat([df_3rd, newDf], ignore_index=True)
            newDf.to_csv('prize_3rd.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_4th:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_4th = pd.concat([df_4th, newDf], ignore_index=True)
            newDf.to_csv('prize_4th.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_5th:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_5th = pd.concat([df_5th, newDf], ignore_index=True)
            newDf.to_csv('prize_5th.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_2digit:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_2digit = pd.concat([df_2digit, newDf], ignore_index=True)
            newDf.to_csv('prize_2digit.csv', mode='a', index=False, header=header)
            header = False

    for new_row in results_3digit:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_3digit = pd.concat([df_3digit, newDf], ignore_index=True)
            newDf.to_csv('prize_3digit.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_1st.to_parquet('prize_1th.parquet', index=False)
        df_nearby_1st.to_parquet('nearby_1st.parquet', index=False)
        df_2nd.to_parquet('prize_2nd.parquet', index=False)
        df_3rd.to_parquet('prize_3rd.parquet', index=False)
        df_4th.to_parquet('prize_4th.parquet', index=False)
        df_5th.to_parquet('prize_5th.parquet', index=False)
        df_2digit.to_parquet('prize_2digit.parquet', index=False)
        df_3digit.to_parquet('prize_3digit.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

Page = https://news.sanook.com/lotto/archive/
2025-02-16 = https://news.sanook.com/lotto/check/16022568/
2025-02-01 = https://news.sanook.com/lotto/check/01022568/
2025-01-02 = https://news.sanook.com/lotto/check/02012568/
2025-01-17 = https://news.sanook.com/lotto/check/17012568/
2024-12-01 = https://news.sanook.com/lotto/check/01122567/
2024-12-16 = https://news.sanook.com/lotto/check/16122567/
2024-11-16 = https://news.sanook.com/lotto/check/16112567/
2024-11-01 = https://news.sanook.com/lotto/check/01112567/
2024-10-16 = https://news.sanook.com/lotto/check/16102567/
2024-10-01 = https://news.sanook.com/lotto/check/01102567/
2024-09-16 = https://news.sanook.com/lotto/check/16092567/
2024-09-01 = https://news.sanook.com/lotto/check/01092567/
2024-08-01 = https://news.sanook.com/lotto/check/01082567/
2024-07-16 = https://news.sanook.com/lotto/check/16072567/
2024-07-01 = https://news.sanook.com/lotto/check/01072567/
2024-08-16 = https://news.sanook.com/lotto/check/16082567/
2024-06-16

KeyError: 'prize_sub_3digits'

# Scrape 1st Prize Data
Scrape the 1st prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 1st Prize Data

# Initialize DataFrame and Variables for 1st Prize
columns_1st = ['date', 'prize_1st']
df_1st = pd.DataFrame(columns=columns_1st)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_1th.csv'):
    os.remove('lotto_prize_1th.csv')

if os.path.exists('lotto_prize_1th.parquet'):
    os.remove('lotto_prize_1th.parquet')

# Main Loop for Scraping 1st Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_1st = list(executor.map(lambda url: scappingLotto(url, 'prize_1st'), archive.archiveList))

    for new_row in results_1st:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_1st = pd.concat([df_1st, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_1th.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_1st.to_parquet('lotto_prize_1th.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape Nearby 1st Prize Data
Scrape the nearby 1st prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape Nearby 1st Prize Data

# Initialize DataFrame and Variables for Nearby 1st Prize
columns_nearby_1st = ['date', 'nearby_1st']
df_nearby_1st = pd.DataFrame(columns=columns_nearby_1st)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_nearby_1st.csv'):
    os.remove('lotto_nearby_1st.csv')

if os.path.exists('lotto_nearby_1st.parquet'):
    os.remove('lotto_nearby_1st.parquet')

# Main Loop for Scraping Nearby 1st Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_nearby_1st = list(executor.map(lambda url: scappingLotto(url, 'nearby_1st'), archive.archiveList))

    for new_row in results_nearby_1st:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_nearby_1st = pd.concat([df_nearby_1st, newDf], ignore_index=True)
            newDf.to_csv('lotto_nearby_1st.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_nearby_1st.to_parquet('lotto_nearby_1st.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 2nd Prize Data
Scrape the 2nd prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 2nd Prize Data

# Initialize DataFrame and Variables for 2nd Prize
columns_2nd = ['date', 'prize_2nd']
df_2nd = pd.DataFrame(columns=columns_2nd)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_2nd.csv'):
    os.remove('lotto_prize_2nd.csv')

if os.path.exists('lotto_prize_2nd.parquet'):
    os.remove('lotto_prize_2nd.parquet')

# Main Loop for Scraping 2nd Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_2nd = list(executor.map(lambda url: scappingLotto(url, 'prize_2nd'), archive.archiveList))

    for new_row in results_2nd:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_2nd = pd.concat([df_2nd, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_2nd.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_2nd.to_parquet('lotto_prize_2nd.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 3rd Prize Data
Scrape the 3rd prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 3rd Prize Data

# Initialize DataFrame and Variables for 3rd Prize
columns_3rd = ['date', 'prize_3rd']
df_3rd = pd.DataFrame(columns=columns_3rd)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_3rd.csv'):
    os.remove('lotto_prize_3rd.csv')

if os.path.exists('lotto_prize_3rd.parquet'):
    os.remove('lotto_prize_3rd.parquet')

# Main Loop for Scraping 3rd Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_3rd = list(executor.map(lambda url: scappingLotto(url, 'prize_3rd'), archive.archiveList))

    for new_row in results_3rd:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_3rd = pd.concat([df_3rd, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_3rd.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_3rd.to_parquet('lotto_prize_3rd.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 4th Prize Data
Scrape the 4th prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 4th Prize Data

# Initialize DataFrame and Variables for 4th Prize
columns_4th = ['date', 'prize_4th']
df_4th = pd.DataFrame(columns=columns_4th)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_4th.csv'):
    os.remove('lotto_prize_4th.csv')

if os.path.exists('lotto_prize_4th.parquet'):
    os.remove('lotto_prize_4th.parquet')

# Main Loop for Scraping 4th Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_4th = list(executor.map(lambda url: scappingLotto(url, 'prize_4th'), archive.archiveList))

    for new_row in results_4th:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_4th = pd.concat([df_4th, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_4th.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_4th.to_parquet('lotto_prize_4th.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 5th Prize Data
Scrape the 5th prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 5th Prize Data

# Initialize DataFrame and Variables for 5th Prize
columns_5th = ['date', 'prize_5th']
df_5th = pd.DataFrame(columns=columns_5th)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_5th.csv'):
    os.remove('lotto_prize_5th.csv')

if os.path.exists('lotto_prize_5th.parquet'):
    os.remove('lotto_prize_5th.parquet')

# Main Loop for Scraping 5th Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_5th = list(executor.map(lambda url: scappingLotto(url, 'prize_5th'), archive.archiveList))

    for new_row in results_5th:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_5th = pd.concat([df_5th, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_5th.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_5th.to_parquet('lotto_prize_5th.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 3-Digit Prize Data
Scrape the 3-digit prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 3-Digit Prize Data

# Initialize DataFrame and Variables for 3-Digit Prize
columns_3digit = ['date', 'prize_pre_3digit', 'prize_sub_3digits']
df_3digit = pd.DataFrame(columns=columns_3digit)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_3digit.csv'):
    os.remove('lotto_prize_3digit.csv')

if os.path.exists('lotto_prize_3digit.parquet'):
    os.remove('lotto_prize_3digit.parquet')

# Main Loop for Scraping 3-Digit Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_3digit = list(executor.map(lambda url: scappingLotto(url, 'prize_pre_3digit'), archive.archiveList))

    for new_row in results_3digit:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_3digit = pd.concat([df_3digit, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_3digit.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_3digit.to_parquet('lotto_prize_3digit.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl

# Scrape 2-Digit Prize Data
Scrape the 2-digit prize data and save it to a CSV and Parquet file.

In [None]:
# Scrape 2-Digit Prize Data

# Initialize DataFrame and Variables for 2-Digit Prize
columns_2digit = ['date', 'prize_2digits']
df_2digit = pd.DataFrame(columns=columns_2digit)
archive_url = "https://news.sanook.com/lotto/archive/"
header = True

# Remove existing CSV and Parquet files if they exist
if os.path.exists('lotto_prize_2digit.csv'):
    os.remove('lotto_prize_2digit.csv')

if os.path.exists('lotto_prize_2digit.parquet'):
    os.remove('lotto_prize_2digit.parquet')

# Main Loop for Scraping 2-Digit Prize Data
while True:
    archive = getArchivePage(archive_url)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        results_2digit = list(executor.map(lambda url: scappingLotto(url, 'prize_2digits'), archive.archiveList))

    for new_row in results_2digit:
        if new_row:
            newDf = pd.DataFrame([new_row])
            df_2digit = pd.concat([df_2digit, newDf], ignore_index=True)
            newDf.to_csv('lotto_prize_2digit.csv', mode='a', index=False, header=header)
            header = False

    if archive.nextPageUrl == "":
        df_2digit.to_parquet('lotto_prize_2digit.parquet', index=False)
        break
    else:
        archive_url = archive.nextPageUrl