In [20]:
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [21]:
class Stock_Scraper:
    """Drive a browser to a paginated stock table, collect its rows and export them as CSV.

    Typical use: ``access_url`` -> ``access_most_active_stock`` ->
    ``extract_stocks_data`` -> ``clean_and_save_data``.

    Attributes:
        driver: The Selenium WebDriver used for all browser interaction.
        wait: A ``WebDriverWait`` bound to ``driver`` with the configured timeout.
        data: List of per-row dicts accumulated by ``extract_stocks_data``.
            Values are the raw cell texts; keys keep the historical
            ``"name: "``-style spelling and are normalised when cleaning.
    """

    # Power of ten implied by each magnitude suffix found in displayed numbers.
    _SUFFIX_EXPONENTS = {"K": 3, "M": 6, "B": 9, "T": 12}

    # Column names of ``data`` once the trailing ": " has been stripped from the keys.
    # ("volumne" is a long-standing misspelling kept because it names the CSV column.)
    _RAW_COLUMNS = ("name", "symbol", "price", "change", "volumne", "market_cap", "pe_ratio")

    # A data row must have at least this many <td> cells (highest index read is 9).
    _MIN_CELLS = 10

    # Absolute XPaths are tied to the current page layout and will need updating
    # whenever the site markup changes.
    _MARKETS_MENU_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'
    _TRENDING_TICKERS_XPATH = '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'
    _MOST_ACTIVE_XPATH = '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'
    _NEXT_BUTTON_XPATH = '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'

    def __init__(self, driver, timeout=10):
        """Store the driver and build the shared explicit wait.

        Args:
            driver: A Selenium WebDriver instance.
            timeout: Seconds each explicit wait may block before timing out.
        """
        self.driver = driver
        self.wait = WebDriverWait(self.driver, timeout=timeout)
        self.data = []

    def wait_for_page_to_load(self):
        """Wait for ``document.readyState == "complete"`` and print the outcome.

        A timeout is reported but not raised. Any other WebDriver error
        (for example a closed browser) propagates to the caller.
        """
        try:
            self.wait.until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
        except TimeoutException:
            # Read the title only now so the message names the page we waited for,
            # not the one that was showing before navigation.
            page_title = self.driver.title
            print(f"The Page \"{page_title}\" did not get fully loaded within the given duration\n")
        else:
            page_title = self.driver.title
            print(f'The Page \"{page_title}\" is successfully loaded\n')

    def access_url(self, url):
        """Navigate to ``url`` and wait for the document to finish loading."""
        self.driver.get(url)
        self.wait_for_page_to_load()

    def access_most_active_stock(self):
        """Navigate from the landing page to the "most active" stock listing.

        Hovers the markets menu, clicks the trending-tickers entry, then clicks
        the most-active tab, waiting for the page after each click.

        Raises:
            TimeoutException: If any of the three elements is not found or
                clickable within the configured timeout.
        """
        # Hovering over the markets menu reveals its dropdown.
        markets_menu = self.wait.until(
            EC.presence_of_element_located((By.XPATH, self._MARKETS_MENU_XPATH))
        )
        ActionChains(self.driver).move_to_element(markets_menu).perform()

        # Clicking on trending tickers.
        trending_tickers = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self._TRENDING_TICKERS_XPATH))
        )
        trending_tickers.click()
        self.wait_for_page_to_load()

        # Clicking on most active.
        most_active = self.wait.until(
            EC.element_to_be_clickable((By.XPATH, self._MOST_ACTIVE_XPATH))
        )
        most_active.click()
        self.wait_for_page_to_load()

    def extract_stocks_data(self):
        """Append every table row of every result page to ``self.data``.

        Pagination stops when the "next" button does not become clickable
        within the timeout. Rows with fewer than ``_MIN_CELLS`` cells are
        skipped instead of raising ``IndexError``.

        Raises:
            TimeoutException: If no ``<table>`` appears on a page.
        """
        while True:
            self.wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
            rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")

            for row in rows:
                values = row.find_elements(By.TAG_NAME, "td")
                if len(values) < self._MIN_CELLS:
                    # Not a regular data row (e.g. placeholder or banner row).
                    continue
                self.data.append({
                    "name: ": values[1].text,
                    "symbol: ": values[0].text,
                    "price: ": values[3].text,
                    "change: ": values[4].text,
                    "volumne: ": values[6].text,
                    "market_cap: ": values[8].text,
                    "pe_ratio: ": values[9].text,
                })

            try:
                next_button = self.wait.until(
                    EC.element_to_be_clickable((By.XPATH, self._NEXT_BUTTON_XPATH))
                )
            except TimeoutException:
                print("The \"next\" button is not clickable!. We have navigate through all the pages")
                break

            next_button.click()
            # The table is refreshed in place without a page load, so give it a
            # moment to re-render before reading the rows again.
            time.sleep(1)

    @classmethod
    def _parse_number(cls, raw, unit_exponent=0):
        """Convert a displayed number to a float expressed in units of ``10**unit_exponent``.

        Handles thousands separators, a leading sign and an optional K/M/B/T
        magnitude suffix (case-insensitive). Text that is not a number
        (``"-"``, ``"--"``, ``""``, ``"N/A"`` ...) yields NaN. Values that are
        already numeric are returned unchanged as floats; any other
        non-string value yields NaN.
        """
        if isinstance(raw, (int, float)):
            return float(raw)
        if not isinstance(raw, str):
            return np.nan

        text = raw.replace(',', '').strip().upper()
        exponent = 0
        if text and text[-1] in cls._SUFFIX_EXPONENTS:
            exponent = cls._SUFFIX_EXPONENTS[text[-1]]
            text = text[:-1]
        try:
            number = float(text)
        except ValueError:
            return np.nan

        # Scale with an exact integer power of ten; dividing (rather than
        # multiplying by 0.001 etc.) keeps results such as 500M -> 0.5 exact.
        shift = exponent - unit_exponent
        if shift >= 0:
            return number * 10 ** shift
        return number / 10 ** -shift

    def clean_and_save_data(self, filename='temp'):
        """Convert the scraped text to numbers and write ``<filename>.csv``.

        Output columns: ``name``, ``symbol``, ``price_usd``, ``change``,
        ``volumne_m`` (millions), ``market_cap_billion`` (billions) and
        ``pe_ratio``. Unparseable cells become NaN. With no scraped rows a
        header-only CSV is written.

        Args:
            filename: Output path without the ``.csv`` extension.

        Returns:
            The cleaned ``pandas.DataFrame`` that was written to disk.
        """
        if self.data:
            stock_df = pd.DataFrame(self.data)
            # Keys are stored as "name: " etc.; drop the colon and padding.
            stock_df.columns = [str(col).replace(':', '').strip() for col in stock_df.columns]
        else:
            stock_df = pd.DataFrame(columns=list(self._RAW_COLUMNS))

        # (column, power of ten the column is expressed in)
        numeric_columns = (
            ("price", 0),
            ("change", 0),
            ("volumne", 6),       # millions
            ("market_cap", 9),    # billions
            ("pe_ratio", 0),
        )
        for column, unit_exponent in numeric_columns:
            stock_df[column] = stock_df[column].map(
                lambda raw, exp=unit_exponent: self._parse_number(raw, exp)
            ).astype(float)

        stock_df = stock_df.rename(columns={
            'price': 'price_usd',
            'volumne': 'volumne_m',
            'market_cap': 'market_cap_billion',
        })

        stock_df.to_csv(f"{filename}.csv", index=False)
        return stock_df

In [22]:
if __name__ == "__main__":
    # End-to-end run: open the site, navigate to the stock listing, scrape every
    # page and export the cleaned table to CSV.
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        url = 'https://finance.yahoo.com/'
        scraper = Stock_Scraper(driver, 5)  # 5-second explicit-wait timeout

        scraper.access_url(url)
        scraper.access_most_active_stock()
        scraper.extract_stocks_data()
        scraper.clean_and_save_data('Yahoo_finance_Stock_dataset')
    finally:
        # Always close the browser, even when a scraping step raises; otherwise
        # the Chrome/chromedriver processes are left running.
        driver.quit()

The Page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is successfully loaded

The Page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is successfully loaded

The Page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is successfully loaded

The "next" button is not clickable!. We have navigate through all the pages
