# Data Collection

## Web Scraping URLs

### Install Required Packages

In [33]:
!pip install requests beautifulsoup4 fake_useragent pandas openpyxl




### Python Code

#### Google Search

In [35]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import urllib.parse

def google_search(query, num_results, time_filter=None):
    """Return the result URLs scraped from one Google search page.

    Args:
        query: Search terms; URL-encoded before being sent.
        num_results: Value passed as Google's ``num`` query parameter.
        time_filter: Optional value for Google's ``tbs`` parameter, e.g.
            ``"cdr:1,cd_min:1/1/2024,cd_max:3/31/2024"``. Omitted when falsy.

    Returns:
        A list with the ``href`` of the first link found in every
        ``<div class="g">`` of the response. An empty list is returned
        (after printing a message) when the request fails or the response
        status is not 200.
    """
    # generate a random user agent for each request
    ua = UserAgent()
    headers = {'User-Agent': ua.random}

    # URL encode the query and construct the Google search URL
    encoded_query = urllib.parse.quote_plus(query)
    google_url = f"https://www.google.com/search?q={encoded_query}&num={num_results}"

    # append the time filter if specified; ':', ',' and '/' are kept literal
    # so date-range filters are sent unchanged, while characters such as '&'
    # or '#' can no longer break the query string
    if time_filter:
        encoded_filter = urllib.parse.quote(str(time_filter), safe=":,/")
        google_url += f"&tbs={encoded_filter}"

    # send the request; the timeout keeps a stalled connection from hanging
    # the whole collection run
    try:
        response = requests.get(google_url, headers=headers, timeout=30)
    except requests.RequestException as error:
        # report network failures the same way as HTTP failures
        print(f"failed to retrieve search results: {error}")
        return []

    if response.status_code != 200:
        print(f"failed to retrieve search results: status code {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    search_results = []

    # extract search result URLs
    for g in soup.find_all('div', class_='g'):
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on lookup
        anchor = g.find('a', href=True)
        if anchor is not None:
            search_results.append(anchor['href'])

    return search_results

#### Generate Quarters

In [36]:
def generate_quarters(start_year, end_year):
    """Map quarter labels to Google custom-date-range filter strings.

    Keys look like ``"2023 Q1"`` and values like
    ``"cdr:1,cd_min:1/1/2023,cd_max:3/31/2023"``.

    When ``end_year`` is 2024, only Q1 and Q2 of 2024 are produced, and they
    are inserted before every other entry; the remaining years run from
    ``start_year`` through 2023. Otherwise all four quarters are produced for
    each year from ``start_year`` through ``end_year`` inclusive.
    """
    # (label, first day, last day) of each quarter as month/day
    quarter_bounds = (
        ("Q1", "1/1", "3/31"),
        ("Q2", "4/1", "6/30"),
        ("Q3", "7/1", "9/30"),
        ("Q4", "10/1", "12/31"),
    )

    def _filters_for(year, bounds):
        return {
            f"{year} {label}": f"cdr:1,cd_min:{first}/{year},cd_max:{last}/{year}"
            for label, first, last in bounds
        }

    quarters = {}
    last_full_year = end_year
    if end_year == 2024:
        # 2024 contributes only its first two quarters, placed first
        quarters.update(_filters_for(2024, quarter_bounds[:2]))
        last_full_year = end_year - 1

    for year in range(start_year, last_full_year + 1):
        quarters.update(_filters_for(year, quarter_bounds))
    return quarters

#### Result

##### 2020 - 2024

In [45]:
import time

import pandas as pd

quarter_dictionary = generate_quarters(2020, 2024)

query = "singapore industrial market outlook"
# column names of the result DataFrame
headers = ["URLs", "Quarter"]
# pause between consecutive searches; firing them back-to-back gets the
# requests rejected with HTTP 429 (too many requests)
request_delay_seconds = 5

# run the same query once per quarter and collect (URL, quarter) rows
rows = []
for position, (quarter, time_filter) in enumerate(quarter_dictionary.items()):
    if position:
        # no wait is needed before the very first request
        time.sleep(request_delay_seconds)
    # perform Google search
    results = google_search(query, num_results=100, time_filter=time_filter)
    rows.extend((url, quarter) for url in results)

# build the DataFrame once instead of re-concatenating on every iteration
df = pd.DataFrame(rows, columns=headers)

# output the DataFrame
print(df)

                                                   URLs  Quarter
0     https://www.colliers.com/en-sg/news/2024-02-14...  2024 Q1
1     https://www.colliers.com/en-sg/news/2024-02-14...  2024 Q1
2     https://www.colliers.com/en-sg/research/2023-q...  2024 Q1
3     https://www.businesstimes.com.sg/property/bt-p...  2024 Q1
4     https://www.cbre.com.sg/insights/reports/singa...  2024 Q1
...                                                 ...      ...
1391  https://www.99.co/singapore/insider/august-res...  2023 Q3
1392  https://altoo.io/what-to-know-about-the-singap...  2023 Q3
1393  https://www.thesingaporeaninvestor.sg/2023/07/...  2023 Q3
1394  https://content.knightfrank.com/research/529/d...  2023 Q4
1395  https://content.knightfrank.com/research/529/d...  2023 Q4

[1396 rows x 2 columns]


In [47]:
# save the collected URLs and their quarter labels to an Excel file;
# index=False leaves the DataFrame's row index out of the sheet
df.to_excel("web_scraping_urls.xlsx", index=False)

##### 2010 - 2019

In [48]:
import time

import pandas as pd

quarter_dictionary = generate_quarters(2010, 2019)

query = "singapore industrial market outlook"
# column names of the result DataFrame
headers = ["URLs", "Quarter"]
# pause between consecutive searches; firing them back-to-back gets the
# requests rejected with HTTP 429 (too many requests)
request_delay_seconds = 5

# run the same query once per quarter and collect (URL, quarter) rows
rows = []
for position, (quarter, time_filter) in enumerate(quarter_dictionary.items()):
    if position:
        # no wait is needed before the very first request
        time.sleep(request_delay_seconds)
    # perform Google search
    results = google_search(query, num_results=100, time_filter=time_filter)
    rows.extend((url, quarter) for url in results)

# build the DataFrame once instead of re-concatenating on every iteration
df = pd.DataFrame(rows, columns=headers)

# output the DataFrame
print(df)

failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
failed to retrieve search results: status code 429
                                                   URLs  Quarter
0     https://centreforaviation.com/analysis/reports...  2010 Q1
1     https://www.mom.gov.sg/-/media/mom/documents/p...  2010 Q1
2     https://nap.nationalacademies.org/read/12920/c...  2010 Q1
3     https://www.econstor.eu/bitstream/10419/174708...  2010 Q1
4     https://www.stb.gov.sg/content/dam/stb/documen...  2010 Q