# Data Cleaning

## Text Extraction

In [1]:
%pip install pandas requests beautifulsoup4 openpyxl

Note: you may need to restart the kernel to use updated packages.


In [26]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# read the spreadsheet that lists the pages to scrape into a DataFrame
df = pd.read_excel(io='web_scraping_urls.xlsx')

# function to clean text
def clean_text(text):
    """Return ``text`` with control characters deleted.

    Every character in the ranges U+0000-U+001F and U+007F-U+009F is removed
    outright (nothing is substituted in its place), which means tabs,
    carriage returns and newlines are dropped as well.
    """
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    return control_chars.sub('', text)

# function to extract and clean text from a URL
def extract_text(url, timeout=30):
    """Download a page and return its text content with control characters removed.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a server that
        never answers would block the whole scraping run indefinitely.
        Defaults to 30 seconds.

    Returns
    -------
    str
        The text of the parsed HTML after ``clean_text`` on success. If the
        request fails (connection error, timeout, HTTP error status, ...),
        the message of the ``requests.RequestException`` is returned instead,
        so callers receive a string either way and cannot tell the two cases
        apart from the type alone.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # turn 4xx/5xx responses into an exception so they are reported as errors
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        # clean the text before returning
        return clean_text(text)
    except requests.RequestException as e:
        return str(e)

# --- settings for the scraping run ---
batch_size = 10                                # number of URLs fetched between writes to the sheet
output_filename = 'all_extracted_texts.xlsx'
sheet_name = 'Extracted Text'
start_offset = 2150                            # 0-based position in df to resume from; use 0 for a full run
request_delay_seconds = 2                      # pause after every request

# Append mode only works on a workbook that already exists, and
# `if_sheet_exists` is only accepted together with mode='a'. Fall back to
# creating the file when this is the first run.
if os.path.exists(output_filename):
    writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'overlay'}
else:
    writer_kwargs = {'mode': 'w'}

# NOTE: the batches are collected in the writer's workbook and the file is
# written when the writer is closed at the end of the `with` block.
with pd.ExcelWriter(output_filename, engine='openpyxl', **writer_kwargs) as writer:
    total_urls = len(df['URLs'])

    # process each batch
    for start in range(start_offset, total_urls, batch_size):
        end = min(start + batch_size, total_urls)
        print(f"processing batch from URL {start+1} to {min(end, total_urls)}")

        batch_urls = df['URLs'].iloc[start:end]
        batch_data = {'URL': batch_urls, 'Quarter': df['Quarter'].iloc[start:end], 'Extracted Text': []}

        for index, url in enumerate(batch_urls, start=start + 1):
            print(f"processing URL {index}/{total_urls}: {url}")
            text = extract_text(url)
            batch_data['Extracted Text'].append(text)
            time.sleep(request_delay_seconds)
            # NOTE(review): printed for every URL; extract_text returns the
            # error message as text on failure, so this does not prove success.
            print("extraction successful for this URL")

        batch_df = pd.DataFrame(batch_data)

        # Write the header only when the target sheet is new; otherwise continue
        # on the first empty row (max_row is 1-based, startrow is 0-based).
        sheet_exists = sheet_name in writer.sheets
        next_row = writer.sheets[sheet_name].max_row if sheet_exists else 0
        batch_df.to_excel(writer, sheet_name=sheet_name, index=False, header=not sheet_exists, startrow=next_row)

        print(f"batch from URL {start+1} to {min(end, total_urls)} saved.")

print("all URLs processed and saved to the same Excel file.")


processing batch from URL 1821 to 1830
processing URL 1821/4913: https://www.pwc.com/id/en/industry-sectors/consumer-industrial-products-services.html
extraction successful for this URL
processing URL 1822/4913: https://sbma.org.sg/singapore-bullion-market/singapore/
extraction successful for this URL
processing URL 1823/4913: https://www.straitstimes.com/authors/chong-koh-ping
extraction successful for this URL
processing URL 1824/4913: https://www.jll.com.mo/en/newsroom
extraction successful for this URL
processing URL 1825/4913: https://www.99.co/singapore/insider/who-is-buying-property-in-singapore-despite-the-price-slump-and-why/
extraction successful for this URL
processing URL 1826/4913: https://research.sginvestors.io/2015/12/singapore-stocks-target-price-review-transport-logistics-infrastructure-sector-2015-12-24.html
extraction successful for this URL
processing URL 1827/4913: https://customs.gov.sg/news-and-media/media-releases/
extraction successful for this URL
processing 

KeyboardInterrupt: 

In [25]:
# One-off backfill of a single slice of URLs into the existing workbook.
# This cell relies on `df`, `output_filename` and `extract_text` from the
# scraping cell, so that cell has to be run first.
# NOTE(review): the execution count shows this cell was run out of order;
# the rows it adds are appended after whatever is already in the sheet.
sheet_name = 'Extracted Text'
start = 1812   # 0-based position of the first URL to fetch
end = 1820     # 0-based position one past the last URL to fetch

# append to the existing workbook without overwriting previous data
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    total_urls = len(df['URLs'])

    batch_urls = df['URLs'].iloc[start:end]
    batch_data = {'URL': batch_urls, 'Quarter': df['Quarter'].iloc[start:end], 'Extracted Text': []}

    for index, url in enumerate(batch_urls, start=start + 1):
        print(f"processing URL {index}/{total_urls}: {url}")
        text = extract_text(url)
        batch_data['Extracted Text'].append(text)
        time.sleep(2)  # sleep for 2 seconds between requests
        print("extraction successful for this URL")

    batch_df = pd.DataFrame(batch_data)

    # Write the header only when the target sheet is new; otherwise continue
    # on the first empty row (max_row is 1-based, startrow is 0-based).
    sheet_exists = sheet_name in writer.sheets
    next_row = writer.sheets[sheet_name].max_row if sheet_exists else 0
    batch_df.to_excel(writer, sheet_name=sheet_name, index=False, header=not sheet_exists, startrow=next_row)

    print(f"batch from URL {start+1} to {min(end, total_urls)} saved.")

print("all URLs processed and saved to the same Excel file.")


processing URL 1813/4913: https://www.99.co/singapore/insider/is-it-getting-hip-to-live-in-the-north/
extraction successful for this URL
processing URL 1814/4913: https://sg.linkedin.com/in/victor-chia-87520337
extraction successful for this URL
processing URL 1815/4913: https://www.newlaunch.sg/industrial/
extraction successful for this URL
processing URL 1816/4913: https://www.newlaunch.sg/listings/industrial/
extraction successful for this URL
processing URL 1817/4913: https://www.jll.com.ph/en/newsroom
extraction successful for this URL
processing URL 1818/4913: https://www.thailand-business-news.com/asean/51145-asean-integration-to-boost-south-east-asias-real-estate-market
extraction successful for this URL
processing URL 1819/4913: https://www.commercialguru.com.sg/property-management-news/2015/12/112671/iskandar-the-rising-industrial-star
extraction successful for this URL
processing URL 1820/4913: https://www.robertwalters.com.sg/insights/career-advice/blog/the-future-of-pharma