In [9]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd

# Inclusive range of years to scrape.
start_year = 2003
end_year = 2022

# Seconds to wait on each request before giving up; without a timeout a
# stalled connection would block the cell indefinitely.
request_timeout = 30

# Ask for English content and present a browser-like User-Agent.
headers = {
    "Accept-Language": "en-US,en;q=0.9",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}

# One entry per holiday across all years: [year, <text of each th/td cell>...]
all_holidays = []

# A single session reuses the underlying connection for all yearly requests.
with requests.Session() as session:
    session.headers.update(headers)

    for year in range(start_year, end_year + 1):
        # Each year has its own page on the site.
        url = f'https://www.timeanddate.com/holidays/philippines/{year}'

        response = session.get(url, timeout=request_timeout)
        # Fail loudly on HTTP errors instead of parsing an error page into
        # zero rows and silently losing the year.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Rows carrying a data-mask attribute are taken as the holiday
        # entries; collect the stripped text of every header/data cell.
        holidays = [[td.text.strip() for td in tr.select('th, td')] for tr in soup.select('tr[data-mask]')]
        if not holidays:
            # A successful response with no matching rows usually means the
            # page layout changed or the request was served a different page.
            print(f'Warning: no holiday rows found for {year} ({url})')

        # Prefix each row with its year so the date can be rebuilt later.
        for holiday in holidays:
            holiday.insert(0, year)
            all_holidays.append(holiday)

# Print or save data as needed
# for holiday in all_holidays:
#     print(holiday)

# Raw table: one row per scraped holiday, with the year as the first column.
df = pd.DataFrame(
    all_holidays,
    columns=['Year', 'Date', 'Weekday', 'Holiday Name', 'Holiday Type', 'Details'],
)

# Derive the cleaned table in a single chain:
#   1. discard the 'Details' column,
#   2. join 'Year' and 'Date' into one string, parse it with the
#      '%Y-%d %b' format and re-render it as a 'YYYY-MM-DD' string,
#   3. discard 'Year' and expose the result under the name 'date'.
holiday_df = (
    df.drop(columns='Details')
    .assign(
        Date=lambda frame: pd.to_datetime(
            frame['Year'].astype(str) + '-' + frame['Date'],
            format='%Y-%d %b',
        ).dt.strftime('%Y-%m-%d')
    )
    .drop(columns='Year')
    .rename(columns={'Date': 'date'})
)

# with open('philippines_holidays_2003_2022.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerow(['Year', 'Date', 'Weekday', 'Holiday Name', 'Holiday Type', 'Details'])
#     writer.writerows(all_holidays)


In [11]:
# Persist the cleaned holiday table. The frame's index is written as the
# first (unnamed) column, which is pandas' default behaviour.
holiday_df.to_csv(
    'philippines_holidays_2003_2022_df.csv',
    index=True,
)