## 1. Install Libraries

In [None]:
# pip install selenium beautifulsoup4

Collecting selenium
  Obtaining dependency information for selenium from https://files.pythonhosted.org/packages/17/ef/d0e033e1b3f19a0325ce03863b68d709780908381135fc0f9436dea76a7b/selenium-4.35.0-py3-none-any.whl.metadata
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting beautifulsoup4
  Obtaining dependency information for beautifulsoup4 from https://files.pythonhosted.org/packages/04/eb/f4151e0c7377a6e08a38108609ba5cede57986802757848688aeedd1b9e8/beautifulsoup4-4.13.5-py3-none-any.whl.metadata
  Downloading beautifulsoup4-4.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting trio~=0.30.0 (from selenium)
  Obtaining dependency information for trio~=0.30.0 from https://files.pythonhosted.org/packages/69/8e/3f6dfda475ecd940e786defe6df6c500734e686c9cd0a0f8ef6821e9b2f2/trio-0.30.0-py3-none-any.whl.metadata
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Obtaining dependency information for trio-webso


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


## 2. Import Libraries

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
import time
import os

## 3. Setup and Launch Chrome Driver

In [2]:
driver = webdriver.Chrome()
driver.get("https://www.comparefirst.sg/wap/searchProductsEvent.action")

# Wait for products to load
time.sleep(10)  # You may need to increase this on a slower connection

## 4. Converting the HTML response into HTML-tag accessible structure

In [3]:

# --- Retrieve HTML and make text accessible by HTML Tags  ---
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()


In [4]:
soup

<html lang="en"><head>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, user-scalable=yes, initial-scale=1" name="viewport"/>
<meta content="no-cache, no-store, must-revalidate" http-equiv="Cache-Control"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="0" http-equiv="Expires"/>
<meta content="Search term life, whole life, endowment and investment-linked products offered by Singapore life insurers." name="description"/>
<title>Search Results</title>
<link href="css/jquery-ui.css" rel="stylesheet"/>
<link href="css/select2.css" rel="stylesheet"/>
<link href="css/common.css" rel="stylesheet"/>
<link href="css/introjs.css" rel="stylesheet"/>
<link href="css/web.css" media="screen and (min-width:1001px)" rel="stylesheet"/>
<link href="css/tablet.css" media="screen and (min-width:768px) and (max-width:1000px)" rel="stylesheet"/>
<link href="css/smartphone.css" media="screen and (min-width:10px) and

## 5. Retrieve Target Text based on HTML Tag that it is contained within

In [5]:
insurance_type = []
premium_price = []
coverage_term = []
premium_term = []
provider = []

In [6]:
# --- Extract Products ---
insurance_type = [item.get_text(strip=True) for item in soup.find_all("p", id="sProdName")]
premium_price = [item.get_text(strip=True) for item in soup.find_all("span", id="TGpayoutDisp")]
coverage_term = [item.get_text(strip=True) for item in soup.find_all("span", id='covgTerm')]
premium_term = [item.get_text(strip=True) for item in soup.find_all("span", id='TGpayout')]
product_containers = soup.find_all("div", class_="result_content_inner")
provider = []
for container in product_containers:
    provider_tag = container.find("h3")
    provider.append(provider_tag.get_text(strip=True) if provider_tag else None)

## 6. Checking the length and content scraped

In [7]:
len(provider)

22

In [8]:
provider

['Etiqa Insurance Pte. Ltd.',
 'Great Eastern Life',
 'Tokio Marine Life Insurance Singapore Ltd',
 'FWD SINGAPORE PTE. LTD.',
 'HSBC Life (Singapore) Pte. Ltd.',
 'Singapore Life Ltd.',
 'China Taiping Insurance (Singapore) Pte. Ltd.',
 'Income Insurance Limited',
 'China Life Insurance (Singapore) Pte. Ltd.',
 'Prudential Assurance Company Singapore (Pte) Limited',
 'Etiqa Insurance Pte. Ltd.',
 'Great Eastern Life',
 'Tokio Marine Life Insurance Singapore Ltd',
 'FWD SINGAPORE PTE. LTD.',
 'HSBC Life (Singapore) Pte. Ltd.',
 'Singapore Life Ltd.',
 'China Taiping Insurance (Singapore) Pte. Ltd.',
 'Income Insurance Limited',
 'China Life Insurance (Singapore) Pte. Ltd.',
 'Prudential Assurance Company Singapore (Pte) Limited',
 'Manulife (Singapore) Pte. Ltd.',
 'AIA Singapore']

In [None]:
sum_assured = []
critical_illness = []
type = []
age_until = []
gender = []
smoker = []
age_next_birthday = []

for item in range(0,len(provider)):
    sum_assured.append(100000)
    critical_illness.append(1)
    type.append('term')
    age_until.append(65)
    gender.append('f')
    smoker.append(0)
    age_next_birthday.append(31)

## 7. Save to Data Frame for Analysis on Python

In [10]:
# --- Save to DataFrame ---
df = pd.DataFrame({
    'insurance_type': insurance_type,
    'premium_price': premium_price,
    'coverage_term': coverage_term,
    'premium_term': premium_term,
    'provider': provider,
    'sum_assured': sum_assured,
    'critical_illness': critical_illness,
    'type': type,
    'age_until': age_until,
    'gender': gender,
    'smoker': smoker,
    'age_next_birthday': age_next_birthday
})


In [11]:
df.head()

Unnamed: 0,insurance_type,premium_price,coverage_term,premium_term,provider,sum_assured,critical_illness,type,age_until,gender,smoker,age_next_birthday
0,DIRECT - Etiqa term life with CI,S$ 212,34 years,212,Etiqa Insurance Pte. Ltd.,100000,1,term,65,m,0,31
1,DIRECT - Great Term with Optional DIRECT - Gre...,S$ 234,34 years,234,Great Eastern Life,100000,1,term,65,m,0,31
2,DIRECT- TM Basic Term (Level) (+ Critical Illn...,S$ 238,34 years,238,Tokio Marine Life Insurance Singapore Ltd,100000,1,term,65,m,0,31
3,DIRECT - Term Life,S$ 253,34 years,253,FWD SINGAPORE PTE. LTD.,100000,1,term,65,m,0,31
4,DIRECT - HSBC Life - Term Lite and Termcare,S$ 254,35 years,254,HSBC Life (Singapore) Pte. Ltd.,100000,1,term,65,m,0,31


## 8. Removing Duplicates

In [12]:
df_cleaned = df.drop_duplicates()

In [13]:
df_cleaned.shape

(12, 12)

## 9. Importing the previous csv to add to it

After the first iteration was run, this section was added from the 2nd iteration onwards to keep adding to it

In [None]:
file_path = "../data/dpi_premium_rates.csv"

if os.path.exists(file_path):
    # File exists → read it
    previous_df = pd.read_csv(file_path)
    print("File found. Loaded existing data.")
else:
    # File does not exist 
    previous_df = df_cleaned 
    previous_df.to_csv(file_path, index=False)
    print("File not found. Created a new one.")

In [477]:
previous_df.head()

Unnamed: 0,insurance_type,premium_price,coverage_term,premium_term,provider,sum_assured,critical_illness,type,age_until,gender,smoker,age_next_birthday
0,DIRECT - Etiqa term life,S$ 79,34 years,79,Etiqa Insurance Pte. Ltd.,100000,0,term,65,m,0,31
1,DIRECT- TM Basic Term (Level),S$ 98,34 years,98,Tokio Marine Life Insurance Singapore Ltd,100000,0,term,65,m,0,31
2,DIRECT - Term,S$ 112,34 years,112,China Taiping Insurance (Singapore) Pte. Ltd.,100000,0,term,65,m,0,31
3,DIRECT - Term Life,S$ 117,34 years,117,FWD SINGAPORE PTE. LTD.,100000,0,term,65,m,0,31
4,DIRECT - China Life Term Plan,S$ 118,34 years,118,China Life Insurance (Singapore) Pte. Ltd.,100000,0,term,65,m,0,31


In [478]:
previous_df.shape

(4788, 12)

In [479]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12 entries, 0 to 21
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   insurance_type     12 non-null     object
 1   premium_price      12 non-null     object
 2   coverage_term      12 non-null     object
 3   premium_term       12 non-null     object
 4   provider           12 non-null     object
 5   sum_assured        12 non-null     int64 
 6   critical_illness   12 non-null     int64 
 7   type               12 non-null     object
 8   age_until          12 non-null     int64 
 9   gender             12 non-null     object
 10  smoker             12 non-null     int64 
 11  age_next_birthday  12 non-null     int64 
dtypes: int64(5), object(7)
memory usage: 1.2+ KB


## 10. Merging the newly scraped data to the existing dataset using concat()

In [480]:
merged_df = pd.concat([previous_df, df_cleaned])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 21
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   insurance_type     4800 non-null   object
 1   premium_price      4800 non-null   object
 2   coverage_term      4800 non-null   object
 3   premium_term       4800 non-null   object
 4   provider           4800 non-null   object
 5   sum_assured        4800 non-null   int64 
 6   critical_illness   4800 non-null   int64 
 7   type               4800 non-null   object
 8   age_until          4800 non-null   int64 
 9   gender             4800 non-null   object
 10  smoker             4800 non-null   int64 
 11  age_next_birthday  4800 non-null   int64 
dtypes: int64(5), object(7)
memory usage: 487.5+ KB


## 11. Export Data Frame to CSV

In [None]:
# --- Export to CSV ---
merged_df.to_csv("dpi_premium_rates.csv", index=False)
print("Saved csv file")

Saved csv file
