In [3]:
import csv
import time

import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configure Chrome for Selenium: each entry below is passed to Chrome as a
# command-line argument before the browser session is started.
options = Options()
for chrome_argument in (
    "--headless",  # Run in headless mode
    "--disable-gpu",  # Disable GPU (headless mode optimization)
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
):
    options.add_argument(chrome_argument)

# Single browser session shared by every scraping cell below.
driver = webdriver.Chrome(options=options)

# Store-locator page for each brand, keyed by brand name.
urls = {
    "Starbucks": "https://www.starbucks.com.my/find-store",
    "McDonald's": "https://www.mcdonalds.com.my/locate-us",
    "CU": "https://nicetocu.com.my/stores/",
}

In [13]:
# Accumulator for scraped CU rows; scrape_cu_stores appends to it in place.
cu: list = []

def _drop_blank_lines(text):
    """Return ``text`` stripped, with whitespace-only lines removed."""
    return '\n'.join(line for line in text.strip().splitlines() if line.strip())


def scrape_cu_stores(driver, url, data):
    """Scrape CU store names and addresses from ``url`` into ``data``.

    The page is loaded, every ``a.colisiond`` toggle link is clicked in turn,
    and after each click all ``.sl-addr-sec`` sections on the page are read.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        url: Address of the CU store-locator page.
        data: List that is extended in place with ``["CU", name, address]``
            rows; nothing is returned.

    Scraping is best-effort: a failing toggle or section is reported with
    ``print`` and skipped instead of aborting the remaining ones. Each
    (name, address) pair is appended at most once per call, because sections
    opened by an earlier toggle may still be on the page when a later toggle
    is processed.
    """
    driver.get(url)
    time.sleep(2)  # Allow the page to load

    try:
        # Locate all toggle links (for each state or region, like "Johor")
        toggle_links = driver.find_elements(By.CSS_SELECTOR, "a.colisiond")
    except Exception as e:
        print(f"Error scraping CU stores: {e}")
        return

    seen = set()  # (name, address) pairs already appended during this call

    for toggle_link in toggle_links:
        try:
            # Simulate a click to open the store section
            ActionChains(driver).move_to_element(toggle_link).click().perform()
            time.sleep(1)  # Allow time for the section to load (adjust if needed)

            # Wait for a store section to become visible after the click
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, ".sl-addr-sec"))
            )
            store_sections = driver.find_elements(By.CSS_SELECTOR, ".sl-addr-sec")
        except Exception as e:
            # One region failing to open must not discard the other regions.
            print(f"Error scraping CU stores: {e}")
            continue

        for section in store_sections:
            try:
                store_name = _drop_blank_lines(
                    section.find_element(By.CSS_SELECTOR, "h3").text
                )
                address = _drop_blank_lines(
                    section.find_element(By.CSS_SELECTOR, ".sl-addr span").text
                )
            except Exception as e:
                # A section missing its heading or address is skipped on its own.
                print(f"Error scraping CU stores: {e}")
                continue

            # Sections with no text at all carry no store information
            # (presumably collapsed sections, whose rendered text is empty).
            if not store_name and not address:
                continue

            key = (store_name, address)
            if key in seen:
                continue
            seen.add(key)
            data.append(["CU", store_name, address])

# Run the CU scraper; rows are appended to `cu` in place.
scrape_cu_stores(driver, urls["CU"], cu)

columns = ["Brand", "Store Name", "Address"]
output_file = "cu_store_locations.csv"

# Build the frame, turn empty strings into missing values, and drop every
# row that has any missing field.
cu_df = (
    pd.DataFrame(cu, columns=columns)
    .replace('', pd.NA)
    .dropna()
)

# Persist the cleaned rows without the index column.
cu_df.to_csv(output_file, index=False)

print(f"Scraping completed. Data saved to {output_file}")

Scraping completed. Data saved to cu_store_locations.csv


In [12]:
# Accumulator for scraped McDonald's rows; scrape_mcdonalds_stores appends to it in place.
mcd: list = []

def scrape_mcdonalds_stores(driver, url, data):
    """Scrape McDonald's store names and addresses from ``url`` into ``data``.

    Store names (``a.addressTitle strong``) and address paragraphs
    (``p.addressText``) are collected separately and paired by position.
    Address lines that start with "tel:" or "fax:" (case-insensitive, after
    trimming) are removed, and stores whose address is empty afterwards are
    skipped.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the McDonald's store-locator page.
        data: List extended in place with
            ``["McDonald's", name, "", address, ""]`` rows; nothing is returned.

    Any exception raised while reading the page is printed, not propagated.
    """
    driver.get(url)
    time.sleep(10)  # Allow the page to load
    try:
        name_nodes = driver.find_elements(By.CSS_SELECTOR, "a.addressTitle strong")
        address_nodes = driver.find_elements(By.CSS_SELECTOR, "p.addressText")

        for name_node, address_node in zip(name_nodes, address_nodes):
            store_name = name_node.text.strip()
            raw_address = address_node.text.strip()

            # Keep every line except the phone / fax ones.
            kept_lines = []
            for line in raw_address.split('\n'):
                if line.strip().lower().startswith(("tel:", "fax:")):
                    continue
                kept_lines.append(line)
            filtered_address = '\n'.join(kept_lines)

            # Nothing left after filtering means there is no usable address.
            if not filtered_address:
                continue

            data.append(["McDonald's", store_name, "", filtered_address, ""])

    except Exception as e:
        print(f"Error scraping McDonald's stores: {e}")


# Run the McDonald's scraper; rows are appended to `mcd` in place.
scrape_mcdonalds_stores(driver, urls["McDonald's"], mcd)

# Save the data to a CSV file
output_file = "mcd_store_locations.csv"
header = ["Brand", "Store Name", "Address"]

# scrape_mcdonalds_stores appends 5-element rows
# [brand, name, "", address, ""]; keep only the three fields named in the
# header so the address is written under the "Address" column.
mcd_rows = [[row[0], row[1], row[3]] for row in mcd]

with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(header)
    # Write the rows scraped in this cell (the previous `data2` is not
    # defined anywhere in the notebook and fails on a fresh kernel).
    writer.writerows(mcd_rows)

print(f"Scraping completed. Data saved to {output_file}")

Scraping completed. Data saved to mcd_store_locations.csv


In [None]:
# Initialize the geolocator
# NOTE(review): "agentname" looks like a placeholder user agent - confirm and
# replace it with an identifier specific to this project.
geolocator = Nominatim(user_agent="agentname")

# Throttle requests to one per second; errors are re-raised (not swallowed)
# so the loop below can report them per address.
geocode = RateLimiter(
    geolocator.geocode, min_delay_seconds=1, swallow_exceptions=False
)

# Load the CSV files written by the scraping cells above
cudf = pd.read_csv('cu_store_locations.csv')
mcddf = pd.read_csv('mcd_store_locations.csv')

# Concatenate both brands vertically (along rows)
df = pd.concat([cudf, mcddf], ignore_index=True)

# Add longitude and latitude columns
df["Longitude"] = ""
df["Latitude"] = ""

# Geocode addresses
for index, address in df["Address"].items():
    # read_csv yields NaN (a float) for empty cells, so check the type before
    # calling string methods on the value.
    if not isinstance(address, str) or not address.strip():
        print(f"No address provided for row {index}")
        continue

    try:
        location = geocode(address, namedetails=True)
    except Exception as e:
        print(f"Error geocoding address: {address}. Error: {e}")
        continue

    if location:
        df.at[index, "Longitude"] = location.longitude
        df.at[index, "Latitude"] = location.latitude
    else:
        print(f"Coordinates not available for: {address}")

# Replace empty longitude/latitude with "Not available"
df["Longitude"] = df["Longitude"].replace("", "Not available")
df["Latitude"] = df["Latitude"].replace("", "Not available")