# Data Collection

In [1]:
%pip install requests beautifulsoup4 fake_useragent pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


# Web Scraping URLs

## Code Function

In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import urllib.parse

def google_search(query, num_results, time_filter = None, timeout = 10):
    """Scrape result links from a Google search results page.

    Args:
        query: Free-text search query; it is URL-encoded before being sent.
        num_results: Value sent as the ``num`` query parameter.
        time_filter: Optional value for the ``tbs`` query parameter, e.g. a
            custom date range such as ``cdr:1,cd_min:1/1/2020,cd_max:3/31/2020``.
        timeout: Seconds to wait for the response before giving up, so one
            stalled request cannot hang a long scraping loop.

    Returns:
        list[str]: The first ``href`` found inside every ``div.g`` container,
        in page order. An empty list is returned (and a message printed) when
        the request fails or the status code is not 200.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.random}

    encoded_query = urllib.parse.quote_plus(query)
    google_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"

    if time_filter:
        google_url += f"&tbs={time_filter}"

    try:
        response = requests.get(google_url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad
        # status code so the caller's loop keeps going.
        print(f"failed to retrieve search results: {error}")
        return []

    if response.status_code != 200:
        print(f"failed to retrieve search results: status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    search_results = []

    for g in soup.find_all('div', class_='g'):
        # href=True skips anchors without a link target, which would
        # otherwise raise KeyError on ['href'].
        anchor = g.find('a', href=True)
        if anchor:
            search_results.append(anchor['href'])

    return search_results


In [4]:
def generate_quarters(start_year, end_year, partial_year=2024, partial_quarters=2):
    """Build Google ``tbs`` custom-date-range filters, one per calendar quarter.

    Args:
        start_year: First year to cover (inclusive).
        end_year: Last year to cover (inclusive).
        partial_year: Year that is only partially covered when it is the last
            year of the range. Pass ``None`` to always emit four quarters.
        partial_quarters: How many leading quarters of ``partial_year`` to
            emit when ``end_year == partial_year``.

    Returns:
        dict[str, str]: Maps labels such as ``"2021 Q3"`` to filter strings
        such as ``"cdr:1,cd_min:7/1/2021,cd_max:9/30/2021"``. Entries are
        inserted in chronological order. An empty range
        (``start_year > end_year``) yields an empty dict.
    """
    # (first day, last day) of each quarter in the month/day form Google uses.
    quarter_bounds = (
        ("1/1", "3/31"),
        ("4/1", "6/30"),
        ("7/1", "9/30"),
        ("10/1", "12/31"),
    )

    quarters = {}
    for year in range(start_year, end_year + 1):
        if year == end_year and year == partial_year:
            # Only the leading quarters of the final, partial year are kept.
            bounds = quarter_bounds[:max(0, partial_quarters)]
        else:
            bounds = quarter_bounds

        for number, (first_day, last_day) in enumerate(bounds, start=1):
            quarters[f"{year} Q{number}"] = (
                f"cdr:1,cd_min:{first_day}/{year},cd_max:{last_day}/{year}"
            )
    return quarters

def generate_query(source_list):
    """Map every distinct source name to its search query.

    Args:
        source_list: Iterable of source names; repeated names collapse into a
            single entry positioned at their first occurrence.

    Returns:
        dict[str, str]: ``{source: "singapore industrial property market <source>"}``.
    """
    return {
        name: f"singapore industrial property market {name}"
        for name in source_list
    }

In [4]:
import pandas as pd

headers = ["URLs", "Source", "Quarter"]
quarter_dictionary = generate_quarters(2020, 2024)

# (sources, number of results requested per query) for each search batch.
search_batches = [
    (["cna", "singapore business review"], 30),
    (["straits times", "business times", "edgeprop"], 50),
]

# Rows are accumulated as plain dicts and turned into a DataFrame once per
# batch, instead of re-concatenating the whole frame after every request.
records = []
for sources, num_results in search_batches:
    query_dictionary = generate_query(sources)

    for source, query in query_dictionary.items():
        for quarter, time_filter in quarter_dictionary.items():
            results = google_search(query, num_results=num_results, time_filter=time_filter)
            records.extend(
                {"URLs": link, "Source": source, "Quarter": quarter} for link in results
            )

    df = pd.DataFrame(records, columns=headers)
    print(df)

# The date- and text-scraping cells read sheet '2020 - 2024' from this file,
# so the sheet must be named explicitly (the default would be "Sheet1").
df.to_excel("urls.xlsx", sheet_name="2020 - 2024", index=False)

                                                  URLs  \
0    https://www.channelnewsasia.com/business/singa...   
1    https://m.facebook.com/SingaporePropertyAdviso...   
2    https://m.facebook.com/story.php?story_fbid=28...   
3    https://www.reddit.com/r/singapore/comments/fk...   
4                    https://www.pbarobotics.com/media   
..                                                 ...   
681  https://www.edgeprop.sg/industrial/mandai-indu...   
682  https://www.propertyguru.com.sg/property-manag...   
683        https://www.youtube.com/watch?v=Mn07tstnDvA   
684  https://www.businesstimes.com.sg/property/rent...   
685  https://www.mof.gov.sg/news-publications/speec...   

                        Source  Quarter  
0                          cna  2020 Q1  
1                          cna  2020 Q1  
2                          cna  2020 Q1  
3                          cna  2020 Q1  
4                          cna  2020 Q1  
..                         ...      ...  
681  sing

## Extract URLs from 2000 to 2024

In [59]:
import pandas as pd

headers = ["URLs", "Source", "Quarter"]
quarter_dictionary = generate_quarters(2000, 2004)

# (sources, number of results requested per query) for each search batch.
search_batches = [
    (["cna", "singapore business review"], 30),
    (["straits times", "business times", "edgeprop"], 50),
]

# Rows are accumulated as plain dicts and turned into a DataFrame once per
# batch, instead of re-concatenating the whole frame after every request.
records = []
for sources, num_results in search_batches:
    query_dictionary = generate_query(sources)

    for source, query in query_dictionary.items():
        for quarter, time_filter in quarter_dictionary.items():
            results = google_search(query, num_results=num_results, time_filter=time_filter)
            records.extend(
                {"URLs": link, "Source": source, "Quarter": quarter} for link in results
            )

    df = pd.DataFrame(records, columns=headers)
    print(df)

file_path = 'urls.xlsx'

# 'replace' keeps the cell re-runnable: with 'new', a second run would add a
# renamed copy of the sheet and the cleaning cells would keep reading the
# stale '2000 - 2004' sheet.
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='2000 - 2004', index=False)

                                                  URLs  \
0    https://www.ide.go.jp/library/English/Publish/...   
1    https://www.singstat.gov.sg/-/media/files/stan...   
2    https://www.meti.go.jp/meti_lib/report/2022FY/...   
3    https://www.wto.org/english/tratop_e/tpr_e/tp1...   
4    https://www.ura.gov.sg/-/media/Corporate/Resou...   
..                                                 ...   
691  https://www.annualreports.co.uk/HostedData/Ann...   
692  https://www.academia.edu/63315195/Industrial_R...   
693  http://www.reconnectingamerica.org/assets/Uplo...   
694  https://www.crawco.com/expertise/contacts/scot...   
695  https://ec.europa.eu/economy_finance/publicati...   

                        Source  Quarter  
0                          cna  2000 Q1  
1                          cna  2000 Q1  
2                          cna  2000 Q1  
3                          cna  2000 Q1  
4                          cna  2000 Q1  
..                         ...      ...  
691  sing

# URLs Cleaning

In [65]:
file_path = 'urls.xlsx'
sheet_to_clean = '2000 - 2004'

df = pd.read_excel(file_path, sheet_name=sheet_to_clean)

# Keep the first occurrence of every URL. Flagging duplicates with
# keep=False would delete *all* copies, so any article returned for more than
# one source or quarter would vanish from the dataset entirely.
df = df.drop_duplicates(subset="URLs", keep="first")

with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name=sheet_to_clean, index=False)

In [68]:
from urllib.parse import urlparse

# Hosts whose URLs are kept; the comparison below is an exact match on the
# URL's network location.
acceptable_domains = [
    'www.channelnewsasia.com',
    'sbr.com.sg',
    'www.straitstimes.com',
    'www.businesstimes.com.sg',
    'www.edgeprop.sg'
]

# Workbook and sheet that are filtered and then overwritten in place.
file_path = 'urls.xlsx'
sheet_to_clean = '2000 - 2004'

df = pd.read_excel(file_path, sheet_name=sheet_to_clean)

def extract_domain(url):
    """Return the lower-cased network location (host) of ``url``.

    Args:
        url: The URL to inspect. Non-string values (for example the NaN that
            pandas uses for an empty cell) are treated as having no domain.

    Returns:
        str: The ``netloc`` part of the URL in lower case, or ``''`` when
        ``url`` is not a string or cannot be parsed.
    """
    if not isinstance(url, str):
        return ''
    try:
        # Host names are case-insensitive; lower-casing lets them match the
        # lower-case entries of an allow-list.
        return urlparse(url).netloc.lower()
    except ValueError:
        # urlparse raises ValueError for malformed hosts such as an
        # unbalanced IPv6 bracket.
        return ''

# Tag every row with the host of its URL, keep the rows whose host is on the
# allow-list, then discard the helper column again.
df['Domain'] = df['URLs'].apply(extract_domain)
is_acceptable = df['Domain'].isin(acceptable_domains)
filtered_df = df[is_acceptable].drop(columns=['Domain'])

print(filtered_df)

# Overwrite the cleaned sheet inside the existing workbook.
writer_options = dict(engine='openpyxl', mode='a', if_sheet_exists='replace')
with pd.ExcelWriter(file_path, **writer_options) as writer:
    filtered_df.to_excel(writer, sheet_name=sheet_to_clean, index=False)


Empty DataFrame
Columns: [URLs, Source, Quarter]
Index: []


# Date Scraping URLs

## Code Function

In [71]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_article_date(url, timeout=10):
    """Fetch a web page and return the publication date found in its markup.

    The lookups are tried in this order and the first non-empty value wins:

    1. ``<meta property="article:published_time" content="...">``
    2. ``<meta name="cXenseParse:recs:publishtime" content="...">``
    3. ``<meta name="article:published_time" content="...">``
    4. the ``datetime`` attribute of the first ``<time>`` element

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive site cannot stall the whole scraping loop.

    Returns:
        str: The raw date string taken from the page,
        ``"failed to retrieve the webpage: <error>"`` when the request fails,
        or ``"date not found"`` when none of the lookups yields a value.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"failed to retrieve the webpage: {e}"

    soup = BeautifulSoup(response.content, 'html.parser')

    # Attribute filters for the <meta> tags that may carry the date, in
    # priority order.
    meta_lookups = (
        {'property': 'article:published_time'},
        {'name': 'cXenseParse:recs:publishtime'},
        {'name': 'article:published_time'},
    )
    for attrs in meta_lookups:
        tag = soup.find('meta', attrs=attrs)
        # .get() avoids a KeyError on a matching tag without a content
        # attribute; an empty value falls through to the next lookup.
        if tag and tag.get('content'):
            return tag['content']

    # Fall back to the first <time> element that carries a datetime value.
    time_element = soup.find('time')
    if time_element and time_element.get('datetime'):
        return time_element['datetime']

    # if not found
    return "date not found"

## Extract Dates from 2000 to 2024

In [74]:
excel_file = 'urls.xlsx'
sheet_name = '2020 - 2024'
df_existing = pd.read_excel(excel_file, sheet_name=sheet_name)

urls = df_existing['URLs'].tolist()

date_data = []
for url in urls:
    date = get_article_date(url)
    print(date)
    date_data.append({'URLs': url, 'Date': date})

df_date = pd.DataFrame(date_data, columns=['URLs', 'Date'])

# df_date has exactly one row per row of df_existing, in the same order, so
# the dates are attached by position. Merging on 'URLs' would multiply rows
# whenever a URL occurs more than once.
# errors='ignore' plus dropping a stale 'Date' column keeps the cell
# re-runnable: after a first run the sheet no longer has 'Quarter' and
# already contains 'Date'.
df_updated = (
    df_existing
    .drop(columns=['Quarter', 'Date'], errors='ignore')
    .assign(Date=df_date['Date'].to_numpy())
)

with pd.ExcelWriter(excel_file, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
    df_updated.to_excel(writer, sheet_name=sheet_name, index=False)

2020-03-13T15:34:45+08:00
2020-02-17T06:14:54+08:00
2020-10-09T05:59:40+08:00
2020-11-29T06:06:44+08:00
2020-11-21T06:00:28+08:00
2020-10-08T06:04:10+08:00
2020-11-28T06:19:08+08:00
date not found
2021-05-23T06:00:45+08:00
2021-05-10T06:10:18+08:00
2021-05-04T06:11:35+08:00
2021-04-03T06:30:00+08:00
date not found
2021-08-20T19:56:32+08:00
date not found
date not found
2021-09-28T08:30:00+08:00
2021-08-14T06:00:00+08:00
2021-08-24T06:30:00+08:00
2021-08-26T06:06:12+08:00
2021-09-09T15:53:00+08:00
2021-11-27T06:00:00+08:00
2021-12-06T06:00:34+08:00
2021-12-09T06:02:28+08:00
2021-12-23T08:44:00+08:00
2021-10-28T07:49:54+08:00
2021-11-18T10:24:00+08:00
2021-12-16T11:00:00+08:00
2022-02-28T06:03:41+08:00
2022-01-26T12:56:00+08:00
date not found
2022-02-23T14:58:00+08:00
2022-01-03T09:00:00+08:00
2022-02-17T06:02:00+08:00
2022-01-19T06:06:00+08:00
2022-01-31T10:00:00+08:00
2022-06-07T12:27:00+08:00
2022-05-24T11:30:00+08:00
2022-05-26T21:07:00+08:00
2022-04-07T20:09:00+08:00
2023-02-04T17:2

# Text Scraping

In [76]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the '2020 - 2024' sheet of the URL workbook; its 'URLs' column drives
# the text-scraping loop that follows.
excel_file = 'urls.xlsx'
df = pd.read_excel(excel_file, sheet_name='2020 - 2024')

In [77]:
urls = df['URLs'].tolist()

text_data = []

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

for index, url in enumerate(urls):
    print(f"processing URL {index + 1}/{len(urls)}: {url}")
    # Every URL gets exactly one row; the text stays empty on any failure.
    extracted_text = ''
    try:
        # The timeout stops a single unresponsive site from hanging the loop.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            # The article body is approximated by the text of all <p> tags.
            extracted_text = '\n'.join(p.get_text() for p in soup.find_all('p'))
            print(extracted_text[:60])
        else:
            print(f"failed to retrieve {url}. status code: {response.status_code}")
    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole run.
        print(f"error fetching {url}: {str(e)}")
        extracted_text = ''

    text_data.append({'URLs': url, 'Text': extracted_text})

# Explicit columns keep the frame mergeable on 'URLs' even when no URL was
# processed.
text = pd.DataFrame(text_data, columns=['URLs', 'Text'])

processing URL 1/589: https://www.channelnewsasia.com/business/singapore-shares-sgx-stocks-10-year-low-covid-19-coronavirus-771846


      Business
  



      Business
  

An SGX sign is pic
processing URL 2/589: https://www.channelnewsasia.com/commentary/singapore-tech-entrepreneurs-start-up-grants-youth-challenge-774631


      Commentary
  

commentary




commentary


      Com
processing URL 3/589: https://www.channelnewsasia.com/commentary/singapore-homes-become-workspaces-huge-changes-work-home-744341


      Commentary
  

commentary




commentary


      Com
processing URL 4/589: https://www.channelnewsasia.com/commentary/property-market-home-sales-ura-purchase-cooling-measures-covid-573191


      Commentary
  

commentary




commentary


      Com
processing URL 5/589: https://www.channelnewsasia.com/commentary/golden-mile-complex-architecture-conservation-555266


      Commentary
  

commentary




commentary


      Com
processing URL 6/589: https://www.channelnewsasia

In [None]:
# `text` holds one row per scraped URL, so a URL that is listed twice in `df`
# is present twice in `text` as well. Collapsing `text` to one row per URL
# keeps the left join at exactly one output row per row of `df`.
df_updated = pd.merge(df, text.drop_duplicates(subset='URLs'), on='URLs', how='left')

def clean_string(s):
    """Reduce ``s`` to printable ASCII without gluing words together.

    Characters with code points 32-126 are kept unchanged. Any other
    whitespace character (newline, tab, carriage return, non-breaking space,
    ...) is replaced by a single space, so the words or paragraphs it
    separated stay separated. Every remaining character is dropped.

    Args:
        s: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``s`` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s

    kept = []
    for c in s:
        if 32 <= ord(c) < 127:
            kept.append(c)
        elif c.isspace():
            # Simply deleting a separator would fuse the neighbouring words,
            # e.g. "market.\nRents" -> "market.Rents".
            kept.append(' ')
    return ''.join(kept)

# Sanitize the scraped text before it is written to Excel.
df_updated['Text'] = df_updated['Text'].apply(clean_string)

# The "Date Handling" cell reads sheet '2020 - 2024' from texts.xlsx, so the
# sheet is named explicitly instead of falling back to the default "Sheet1".
df_updated.to_excel("texts.xlsx", sheet_name='2020 - 2024', index=False)


# Date Handling

In [37]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

excel_file = 'texts.xlsx'
df = pd.read_excel(excel_file, sheet_name='2020 - 2024')

# Keep only a leading ISO date (YYYY-MM-DD). A blind 10-character slice would
# turn the scraper's sentinel strings into fake dates such as "date not f" or
# "failed to "; those rows now get a missing value instead.
# astype(str) lets the .str accessor work even when the column holds no
# strings at all (for example an all-empty column read back as floats).
df['Date'] = df['Date'].astype(str).str.extract(r'^(\d{4}-\d{2}-\d{2})', expand=False)

df.to_excel("dates.xlsx", index=False)