# Data Cleaning

## Text Extraction

In [1]:
%pip install pandas requests beautifulsoup4 openpyxl

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# Load the table of pages to scrape from the '2020 - 2024' sheet; the cells
# below read its 'URLs' and 'Quarter' columns.
df = pd.read_excel(
    'web_scraping_urls.xlsx',
    sheet_name='2020 - 2024',
)

def clean_text(text):
    """Return ``text`` with every control character deleted.

    The removed ranges are U+0000-U+001F and U+007F-U+009F. Note that this
    includes tab, newline and carriage return, so text that was separated
    only by line breaks ends up joined together.
    """
    control_chars = re.compile(r'[\x00-\x1f\x7f-\x9f]')
    return control_chars.sub('', text)

def extract_text(url, timeout=30):
    """Download ``url`` and return the page's text with control characters removed.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    Returns
    -------
    str
        The result of ``soup.get_text()`` passed through ``clean_text``.
        If the request fails with a ``requests.RequestException`` (connection
        error, timeout, non-2xx status via ``raise_for_status``), the error
        message is returned as the string instead of raising, so callers
        receive error text in place of page text.
    """
    try:
        # Without an explicit timeout, requests can wait indefinitely on an
        # unresponsive server and stall the whole scraping run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return clean_text(soup.get_text())
    except requests.RequestException as e:
        # Deliberate best-effort: the failure is recorded as the cell's text.
        return str(e)

# --- Scrape settings ---------------------------------------------------------
batch_size = 10
output_filename = 'all_extracted_texts.xlsx'
target_sheet = '2020 - 2024'
# Zero-based position in `df` at which scraping starts.
# NOTE(review): presumably rows before this were saved by an earlier run --
# confirm, and set to 0 to process the whole list.
start_index = 180
request_delay_seconds = 2  # pause after each request

# mode='a' appends to an existing workbook (the file must already exist);
# if_sheet_exists='overlay' writes into the existing sheet instead of replacing it.
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
    total_urls = len(df['URLs'])
    for start in range(start_index, total_urls, batch_size):
        end = min(start + batch_size, total_urls)
        print(f"processing batch from URL {start+1} to {end}")

        # Positional slice of the current batch; keeps the original row index.
        batch_rows = df.iloc[start:end]

        extracted_texts = []
        for index, url in enumerate(batch_rows['URLs'], start=start + 1):
            text = extract_text(url)
            extracted_texts.append(text)
            print(f"extracted from URL {index}: {text[:60]}...")
            time.sleep(request_delay_seconds)

        batch_df = pd.DataFrame({
            'URL': batch_rows['URLs'],
            'Quarter': batch_rows['Quarter'],
            'Extracted Text': extracted_texts,
        })
        print(batch_df.head())  # Print head of the DataFrame to check it

        # Decide header/start row from the *target* sheet only: other sheets in
        # the workbook must not suppress the header of a newly created sheet.
        sheet_exists = target_sheet in writer.sheets
        batch_df.to_excel(
            writer,
            sheet_name=target_sheet,
            index=False,
            header=not sheet_exists,
            # max_row is 1-based, startrow is 0-based, so this is the first free row.
            startrow=writer.sheets[target_sheet].max_row if sheet_exists else 0,
        )

        # The batch is now in the in-memory workbook; the file on disk is
        # written when the `with` block closes the writer.
        print(f"batch from URL {start+1} to {end} saved.")


processing batch from URL 181 to 190
extracted from URL 181:    Moomoo SG - Invest Smarter with One Super App      Operat...
extracted from URL 182:   Building the Future: A Comprehensive Overview of Singapore...
extracted from URL 183: Singapore's real estate rally predicted to halt, says Morgan...
extracted from URL 184: My Analysis of Frasers Logistics & Commercial Trust - The Si...
extracted from URL 185:  Over 50 advanced manufacturing innovations to launch at Ind...
extracted from URL 186: Asia-Pacific real estate market remains resilient in 2024 de...
extracted from URL 187: How digitalisation is fueling Singapore’s economy: I&C secto...
extracted from URL 188: Ubi Industrial Property Review - YouTubeAboutPressCopyrightC...
extracted from URL 189: Singapore Switchgear Market | Grow at 10.65% CAGR till 2030T...
extracted from URL 190: Singapore and Changi Airport eyeing a busier – and greener –...
                                                   URL  Quarter  \
180  https://www

In [7]:
import pandas as pd

# Remove repeated URLs from the scraped workbook and save the result separately.
file_path = 'all_extracted_texts.xlsx'
sheet_name = '2020 - 2024'
df = pd.read_excel(file_path, sheet_name=sheet_name)

# The scraping cell writes the address column as 'URL', while the input
# workbook names it 'URLs'. Keep using 'URLs' when it is present and fall back
# to 'URL' otherwise, instead of failing with a KeyError.
url_column = 'URLs' if 'URLs' in df.columns else 'URL'

# keep='first' retains the earliest row for each URL.
df_cleaned = df.drop_duplicates(subset=url_column, keep='first')

df_cleaned.to_excel("cleaned_extracted_texts.xlsx", index=False)