#### Web scraping for dates
2023 Malaysia Holidays

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
# Pin the year in the URL so the scraped rows match the 2023 label the rest of
# this notebook attaches to them.
# NOTE(review): the bare /countries/malaysia page appears to serve whichever
# year is current -- confirm the /2023 path still resolves.
url = 'https://www.officeholidays.com/countries/malaysia/2023'

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors here instead of letting an error page be parsed as data.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: 'html.parser' ships with Python, whereas the
# generic 'html' feature lets bs4 choose among whatever parsers are installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Take the first <table> element found in the parsed page.
all_tables = soup.find_all('table')
table = all_tables[0]

In [4]:
# Collect the <th> cells of the table; the bare name on the last line
# displays them as the cell output.
titles = table.find_all(name='th')
titles

[<th>Day</th>,
 <th>Date</th>,
 <th>Holiday Name</th>,
 <th class="comments">Type</th>,
 <th class="hide-ipadmobile">Comments</th>]

In [5]:
# Reduce each <th> element to its whitespace-trimmed text.
table_title = [header_cell.text.strip() for header_cell in titles]
print(table_title)

['Day', 'Date', 'Holiday Name', 'Type', 'Comments']


In [6]:
# Start from an empty frame whose columns are the scraped header labels.
df = pd.DataFrame(data=None, columns=table_title)
df

Unnamed: 0,Day,Date,Holiday Name,Type,Comments


In [7]:
# All <tr> elements found inside the table (despite the name, these are rows).
column_data = table.find_all(name='tr')

In [8]:
# Dry run: print the trimmed <td> texts of each row before loading them into
# the frame. The first entry of column_data is skipped.
for table_row in column_data[1:]:
    cell_texts = []
    for cell in table_row.find_all('td'):
        cell_texts.append(cell.text.strip())
    print(cell_texts)

['Sunday', 'Jan 01', "New Year's Day", 'Regional Holiday', 'Most states']
['Monday', 'Jan 02', "New Year's Day (in lieu)", 'Regional Holiday', 'Most states']
['Saturday', 'Jan 14', 'Birthday of the Sultan of Negeri Sembilan', 'Regional Holiday', 'Negeri Sembilan only']
['Sunday', 'Jan 22', 'Chinese New Year', 'National Holiday', '1st day of 1st lunar month']
['Monday', 'Jan 23', 'Chinese New Year Holiday', 'National Holiday', '']
['Tuesday', 'Jan 24', 'Chinese New Year Holiday', 'Regional Holiday', 'Except JHR, KDH, KTN, TRG']
['Wednesday', 'Feb 01', 'Federal Territory Day', 'Regional Holiday', 'Kuala Lumpur, Labuan, Putrajaya']
['Sunday', 'Feb 05', 'Thaipusam', 'Regional Holiday', 'Several states']
['Monday', 'Feb 06', 'Thaipusam (in lieu)', 'Regional Holiday', 'Several states']
['Saturday', 'Feb 18', 'Isra and Miraj', 'Regional Holiday', 'Kedah, Perlis and Negeri Sembilan']
['Saturday', 'Mar 04', 'Anniversary of Installation of the Sultan', 'Regional Holiday', 'Terengganu']
['Thursda

In [9]:
# Collect every data row first and build the frame in a single step.
# Rebuilding `df` from scratch (instead of appending with df.loc[len(df)])
# makes this cell safe to re-run: a second execution no longer duplicates rows.
holiday_rows = []
for row in column_data[1:]:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without any <td> cells carry no holiday data and would otherwise
    # fail to fit the column layout.
    if cells:
        holiday_rows.append(cells)

df = pd.DataFrame(holiday_rows, columns=table_title)

df

Unnamed: 0,Day,Date,Holiday Name,Type,Comments
0,Sunday,Jan 01,New Year's Day,Regional Holiday,Most states
1,Monday,Jan 02,New Year's Day (in lieu),Regional Holiday,Most states
2,Saturday,Jan 14,Birthday of the Sultan of Negeri Sembilan,Regional Holiday,Negeri Sembilan only
3,Sunday,Jan 22,Chinese New Year,National Holiday,1st day of 1st lunar month
4,Monday,Jan 23,Chinese New Year Holiday,National Holiday,
5,Tuesday,Jan 24,Chinese New Year Holiday,Regional Holiday,"Except JHR, KDH, KTN, TRG"
6,Wednesday,Feb 01,Federal Territory Day,Regional Holiday,"Kuala Lumpur, Labuan, Putrajaya"
7,Sunday,Feb 05,Thaipusam,Regional Holiday,Several states
8,Monday,Feb 06,Thaipusam (in lieu),Regional Holiday,Several states
9,Saturday,Feb 18,Isra and Miraj,Regional Holiday,"Kedah, Perlis and Negeri Sembilan"


In [11]:
# The scraped 'Date' values look like 'Jan 01' (no year), so the year has to
# be supplied before they can be parsed.
HOLIDAY_YEAR = 2023

# Guard on dtype so the cell is re-runnable: once 'Date' has been converted,
# appending a string to it again would raise instead of being a no-op.
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    # Append the year and convert 'Date' column to datetime
    df['Date'] = pd.to_datetime(
        df['Date'] + f' {HOLIDAY_YEAR}', format='%b %d %Y'
    )


# Print the DataFrame
print(df)

          Day       Date                                 Holiday Name   
0      Sunday 2023-01-01                               New Year's Day  \
1      Monday 2023-01-02                     New Year's Day (in lieu)   
2    Saturday 2023-01-14    Birthday of the Sultan of Negeri Sembilan   
3      Sunday 2023-01-22                             Chinese New Year   
4      Monday 2023-01-23                     Chinese New Year Holiday   
5     Tuesday 2023-01-24                     Chinese New Year Holiday   
6   Wednesday 2023-02-01                        Federal Territory Day   
7      Sunday 2023-02-05                                    Thaipusam   
8      Monday 2023-02-06                          Thaipusam (in lieu)   
9    Saturday 2023-02-18                               Isra and Miraj   
10   Saturday 2023-03-04    Anniversary of Installation of the Sultan   
11   Thursday 2023-03-23              Birthday of the Sultan of Johor   
12   Thursday 2023-03-23                         Be

In [12]:
# Preview the dates rendered as DD-MM-YYYY strings; the result is only
# displayed, `df` itself is not modified.
date_column = df['Date']
date_column.dt.strftime('%d-%m-%Y')

0     01-01-2023
1     02-01-2023
2     14-01-2023
3     22-01-2023
4     23-01-2023
5     24-01-2023
6     01-02-2023
7     05-02-2023
8     06-02-2023
9     18-02-2023
10    04-03-2023
11    23-03-2023
12    23-03-2023
13    26-03-2023
14    07-04-2023
15    08-04-2023
16    15-04-2023
17    21-04-2023
18    22-04-2023
19    23-04-2023
20    24-04-2023
21    26-04-2023
22    01-05-2023
23    04-05-2023
24    14-05-2023
25    17-05-2023
26    22-05-2023
27    30-05-2023
28    31-05-2023
29    01-06-2023
30    02-06-2023
31    05-06-2023
32    18-06-2023
33    18-06-2023
34    28-06-2023
35    29-06-2023
36    30-06-2023
37    30-06-2023
38    07-07-2023
39    08-07-2023
40    19-07-2023
41    22-07-2023
42    30-07-2023
43    13-08-2023
44    14-08-2023
45    23-08-2023
46    24-08-2023
47    31-08-2023
48    16-09-2023
49    28-09-2023
50    07-10-2023
51    14-10-2023
52    03-11-2023
53    11-11-2023
54    12-11-2023
55    12-11-2023
56    13-11-2023
57    11-12-2023
58    24-12-20

In [13]:
# Display the frame as the cell output.
df

Unnamed: 0,Day,Date,Holiday Name,Type,Comments
0,Sunday,2023-01-01,New Year's Day,Regional Holiday,Most states
1,Monday,2023-01-02,New Year's Day (in lieu),Regional Holiday,Most states
2,Saturday,2023-01-14,Birthday of the Sultan of Negeri Sembilan,Regional Holiday,Negeri Sembilan only
3,Sunday,2023-01-22,Chinese New Year,National Holiday,1st day of 1st lunar month
4,Monday,2023-01-23,Chinese New Year Holiday,National Holiday,
5,Tuesday,2023-01-24,Chinese New Year Holiday,Regional Holiday,"Except JHR, KDH, KTN, TRG"
6,Wednesday,2023-02-01,Federal Territory Day,Regional Holiday,"Kuala Lumpur, Labuan, Putrajaya"
7,Sunday,2023-02-05,Thaipusam,Regional Holiday,Several states
8,Monday,2023-02-06,Thaipusam (in lieu),Regional Holiday,Several states
9,Saturday,2023-02-18,Isra and Miraj,Regional Holiday,"Kedah, Perlis and Negeri Sembilan"


In [16]:
# Write the frame to CSV, leaving the row index out of the file.
df.to_csv(path_or_buf='holiday_2023.csv', index=False)

In [17]:
# Boolean mask: True where the 'Type' column is exactly 'National Holiday'.
is_national = df['Type'].eq('National Holiday')
national_holidays_df = df.loc[is_national]

# Print the filtered DataFrame
print(national_holidays_df)

          Day       Date                           Holiday Name   
3      Sunday 2023-01-22                       Chinese New Year  \
4      Monday 2023-01-23               Chinese New Year Holiday   
17     Friday 2023-04-21           Hari Raya Aidilfitri Holiday   
18   Saturday 2023-04-22                   Hari Raya Aidilfitri   
19     Sunday 2023-04-23           Hari Raya Aidilfitri Holiday   
22     Monday 2023-05-01                             Labour Day   
23   Thursday 2023-05-04                              Wesak Day   
31     Monday 2023-06-05  Birthday of SPB Yang di Pertuan Agong   
35   Thursday 2023-06-29                         Hari Raya Haji   
40  Wednesday 2023-07-19                          Awal Muharram   
47   Thursday 2023-08-31                           National Day   
48   Saturday 2023-09-16                           Malaysia Day   
49   Thursday 2023-09-28                         Maulidur Rasul   
59     Monday 2023-12-25                          Christmas Da

In [18]:
# Write the frame to CSV, leaving the row index out of the file.
# NOTE(review): this saves the full `df`, not `national_holidays_df` -- confirm
# which frame this file is meant to hold.
df.to_csv(path_or_buf='holidayMalaysia_2023.csv', index=False)