# Data Collection

## Web Scraping URLs

### Install Required Packages

In [1]:
%pip install requests beautifulsoup4 fake_useragent pandas openpyxl numpy


Note: you may need to restart the kernel to use updated packages.


### Python Code

#### Google Search

In [2]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import urllib.parse
import time

def google_search(query, num_results, time_filter = None):
    """Run one Google search and return the result URLs found on the page.

    Parameters
    ----------
    query : str
        Free-text search query; it is URL-encoded before being sent.
    num_results : int
        Value passed to Google as the ``num`` query parameter.
    time_filter : str, optional
        Value appended as the ``tbs`` query parameter (for example a
        ``cdr:1,cd_min:...,cd_max:...`` date range). Omitted when falsy.

    Returns
    -------
    list of str
        The ``href`` of the first anchor inside every ``<div class="g">``
        element of the response. An empty list is returned when the request
        fails, when a non-200/429 status comes back, or when Google keeps
        answering 429 for all attempts.
    """
    max_attempts = 5
    timeout_seconds = 30  # without a timeout a stalled connection blocks forever

    # one random user agent per call, reused for every retry of that call
    headers = {'User-Agent': UserAgent().random}

    # construct the Google search URL from the URL-encoded query
    encoded_query = urllib.parse.quote_plus(query)
    google_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"

    # append the time filter if specified
    if time_filter:
        google_url += f"&tbs={time_filter}"

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(google_url, headers=headers, timeout=timeout_seconds)
        except requests.RequestException as exc:
            # network failures are reported the same way as bad status codes
            print(f"failed to retrieve search results: {exc}")
            return []

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            search_results = []

            # extract search result URLs
            for result_div in soup.find_all('div', class_='g'):
                anchors = result_div.find_all('a')
                # .get() avoids a KeyError when the first anchor has no href
                link = anchors[0].get('href') if anchors else None
                if link:
                    search_results.append(link)

            return search_results

        if response.status_code == 429:
            if attempt == max_attempts:
                # no retry follows, so sleeping here would only waste time
                print(f"rate limit reached, giving up after {max_attempts} attempts")
                break
            print("rate limit reached, waiting to retry...")
            time.sleep(2 ** (attempt - 1))  # exponential back-off: 1, 2, 4, 8 seconds
            continue

        print(f"failed to retrieve search results: status code {response.status_code}")
        return []

    return []

#### Generate Quarters

In [3]:
def generate_quarters(start_year, end_year):
    """Map quarter labels to Google custom-date-range filter strings.

    Keys look like ``"2023 Q1"``; values look like
    ``"cdr:1,cd_min:1/1/2023,cd_max:3/31/2023"``.

    When ``end_year`` is exactly 2024, only Q1 and Q2 of 2024 are produced,
    they are inserted first, and the remaining years run from ``start_year``
    through 2023. For any other ``end_year`` every year from ``start_year``
    through ``end_year`` gets all four quarters, in ascending order.
    """
    # (label, first day, last day) of each quarter, written as month/day
    quarter_bounds = (
        ("Q1", "1/1", "3/31"),
        ("Q2", "4/1", "6/30"),
        ("Q3", "7/1", "9/30"),
        ("Q4", "10/1", "12/31"),
    )

    # ordered list of (year, quarter bounds) pairs to emit
    schedule = []
    last_full_year = end_year
    if end_year == 2024:
        schedule.extend((2024, bounds) for bounds in quarter_bounds[:2])
        last_full_year = end_year - 1
    schedule.extend(
        (year, bounds)
        for year in range(start_year, last_full_year + 1)
        for bounds in quarter_bounds
    )

    return {
        f"{year} {label}": f"cdr:1,cd_min:{first_day}/{year},cd_max:{last_day}/{year}"
        for year, (label, first_day, last_day) in schedule
    }

#### Result

##### 2020 - 2024

In [4]:
import pandas as pd
quarter_dictionary = generate_quarters(2020, 2024)

query = "singapore industrial market outlook news"
headers = ["URLs", "Quarter"]

# Collect (url, quarter) rows in a plain list and build the DataFrame once at
# the end; calling pd.concat inside the loop re-copies every earlier row on
# each iteration.
url_rows = []
for quarter, time_filter in quarter_dictionary.items():
    # one Google search per quarter, restricted by that quarter's date filter
    results = google_search(query, num_results=100, time_filter=time_filter)
    url_rows.extend((url, quarter) for url in results)

df = pd.DataFrame(url_rows, columns=headers)

# output the DataFrame
print(df)

                                                  URLs  Quarter
0    https://www.colliers.com/en-sg/news/2024-02-14...  2024 Q1
1    https://www.colliers.com/en-sg/news/2024-02-14...  2024 Q1
2    https://www.businesstimes.com.sg/property/bt-p...  2024 Q1
3    https://www.colliers.com/en-sg/research/2023-q...  2024 Q1
4    https://www.jll.com.sg/en/newsroom/moderation-...  2024 Q1
..                                                 ...      ...
907  https://www.ebmpapst.com/sg/en/newsroom/news/2...  2023 Q4
908  https://esr-logosreit.listedcompany.com/newsro...  2023 Q4
909  https://www.propertyguru.com.sg/property-for-r...  2023 Q4
910  https://www.mof.gov.sg/news-publications/speec...  2023 Q4
911  https://www.ema.gov.sg/news-events/news/featur...  2023 Q4

[912 rows x 2 columns]


In [5]:
# write the collected URLs to an Excel workbook, leaving out the row index
df.to_excel(excel_writer="web_scraping_urls.xlsx", index=False)

##### 2015 - 2019

In [6]:
quarter_dictionary = generate_quarters(2015, 2019)

query = "singapore industrial market outlook news"
headers = ["URLs", "Quarter"]

# Collect (url, quarter) rows in a plain list and build the DataFrame once at
# the end; calling pd.concat inside the loop re-copies every earlier row on
# each iteration.
url_rows = []
for quarter, time_filter in quarter_dictionary.items():
    # one Google search per quarter, restricted by that quarter's date filter
    results = google_search(query, num_results=100, time_filter=time_filter)
    url_rows.extend((url, quarter) for url in results)

new_df = pd.DataFrame(url_rows, columns=headers)

# output the DataFrame
print(new_df)

                                                   URLs  Quarter
0     https://www.todayonline.com/singapore/jtc-impr...  2015 Q1
1     https://www.straitstimes.com/singapore/manufac...  2015 Q1
2     https://www.todayonline.com/singapore/singapor...  2015 Q1
3             https://thejden.sg/general-property-news/  2015 Q1
4                              https://eurocham.org.sg/  2015 Q1
...                                                 ...      ...
1106  https://www.srx.com.sg/commercial/henderson-in...  2019 Q4
1107    https://www.eurekalert.org/news-releases/500897  2019 Q4
1108                               https://www.ifs.com/  2019 Q4
1109  https://www.99.co/singapore/insider/99-co-to-c...  2019 Q4
1110  https://www.propertyguru.com.sg/project/inno-c...  2019 Q4

[1111 rows x 2 columns]


In [7]:
# save results to the existing Excel file
# Use the same relative file name the other save cells write to, instead of an
# absolute path that only exists on one machine.
# NOTE(review): assumes the notebook runs from the folder holding the workbook.
file_path = "web_scraping_urls.xlsx"

# df (2020-2024) and new_df (2015-2019) are combined straight from memory;
# the workbook's current contents are not needed to build the new sheet.
updated_data = pd.concat([df, new_df], ignore_index=True)

# mode='a' opens the existing workbook; if_sheet_exists='replace' overwrites
# the sheet being written rather than raising because it already exists
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    updated_data.to_excel(writer, index=False)

##### 2010 - 2014

In [8]:
# This section collects the oldest range. The call previously repeated the
# 2020-2024 range from the first result cell; 2015 is already collected by the
# generate_quarters(2015, 2019) cell, so this range stops at 2014.
quarter_dictionary = generate_quarters(2010, 2014)

query = "singapore industrial market outlook news"
headers = ["URLs", "Quarter"]

# Collect (url, quarter) rows in a plain list and build the DataFrame once at
# the end; calling pd.concat inside the loop re-copies every earlier row on
# each iteration.
url_rows = []
for quarter, time_filter in quarter_dictionary.items():
    # one Google search per quarter, restricted by that quarter's date filter
    results = google_search(query, num_results=100, time_filter=time_filter)
    url_rows.extend((url, quarter) for url in results)

# NOTE: this rebinds df, which earlier held the 2020-2024 results
df = pd.DataFrame(url_rows, columns=headers)

# output the DataFrame
print(df)

rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...
rate limit reached, waiting to retry...


KeyboardInterrupt: 

In [None]:
# save results to the Excel file without discarding rows saved by earlier cells
output_path = "web_scraping_urls.xlsx"

frames = [df]
try:
    # keep whatever the workbook already holds ahead of the new rows
    frames.insert(0, pd.read_excel(output_path))
except FileNotFoundError:
    # nothing has been saved yet, so only the rows in df are written
    pass

# drop_duplicates keeps a re-run of this cell from stacking identical
# (URL, Quarter) rows on top of the ones already saved
merged_data = pd.concat(frames, ignore_index=True).drop_duplicates(ignore_index=True)
merged_data.to_excel(output_path, index=False)