# Web-Scraping Website - "https://finance.yahoo.com/"

In [62]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time


In [10]:
driver = webdriver.Chrome()
driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver, 5)

# function to check if webpage is fully loaded
def wait_for_page_to_load(driver, wait):
    page_title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except:
        print(f"The page \"{page_title}\" did not get fully loaded within the given duration.\n")
    else:
        print(f"The page \"{page_title}\" is fully loaded.\n")


url = "https://finance.yahoo.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)


# hovering on Markets menu
actions = ActionChains(driver)
markets_menu =wait.until(
    EC.presence_of_element_located((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/a[1]/span[1]'))
)
actions.move_to_element(markets_menu).perform()

# click on Trending Tickers

trending_tickers = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/header[1]/div[1]/div[1]/div[1]/div[4]/div[1]/div[1]/ul[1]/li[3]/div[1]/ul[1]/li[4]/a[1]/div[1]'))
)
trending_tickers.click()
wait_for_page_to_load(driver, wait)

# click on Most Active
most_active = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[2]/main[1]/section[1]/section[1]/section[1]/article[1]/section[1]/div[1]/nav[1]/ul[1]/li[1]/a[1]/span[1]'))
)
most_active.click()
wait_for_page_to_load(driver, wait)

# Scraping the data
data = []
while True:
    # scraping
    wait.until(
        EC.presence_of_element_located((By.TAG_NAME, "table"))
    )
    rows = driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
    for row in rows:
        values = row.find_elements(By.TAG_NAME, "td")
        #print([val.text for val in values])
        stock = {
            "name": values[1].text,
            "symbols": values[0].text,
            "price": values[3].text,
            "change": values[4].text,
            "volume": values[6].text,
            "market_cap": values[8].text,
            "pe_ratio":values[9].text
        }
        data.append(stock)
        #break
    #break


    # click next
    try:
        next_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'))
        )
    except:
        print("The \"next\" button is not clickable. We have navigated through all the pages.")
        break
    else:
        next_button.click()
        time.sleep(1)

driver.quit()

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Yahoo Finance - Stock Market Live, Quotes, Business & Finance News" is fully loaded.

The page "Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance" is fully loaded.

The "next" button is not clickable. We have navigated through all the pages.


In [11]:
data

[{'name': 'JPMorgan Chase & Co.',
  'symbols': 'JPM',
  'price': '227.11',
  'change': '-7.23',
  'volume': '16.28M',
  'market_cap': '632.047B',
  'pe_ratio': '11.49'},
 {'name': 'Wells Fargo & Company',
  'symbols': 'WFC',
  'price': '63.11',
  'change': '-3.22',
  'volume': '25.38M',
  'market_cap': '206.064B',
  'pe_ratio': '11.75'},
 {'name': 'Ford Motor Company',
  'symbols': 'F',
  'price': '9.14',
  'change': '-0.36',
  'volume': '163.356M',
  'market_cap': '36.346B',
  'pe_ratio': '6.26'},
 {'name': 'Lucid Group, Inc.',
  'symbols': 'LCID',
  'price': '2.5200',
  'change': '-0.0300',
  'volume': '125.447M',
  'market_cap': '7.639B',
  'pe_ratio': '-'},
 {'name': 'Palantir Technologies Inc.',
  'symbols': 'PLTR',
  'price': '88.59',
  'change': '-3.42',
  'volume': '123.826M',
  'market_cap': '207.775B',
  'pe_ratio': '466.26'},
 {'name': 'Intel Corporation',
  'symbols': 'INTC',
  'price': '19.88',
  'change': '-1.65',
  'volume': '128.155M',
  'market_cap': '86.689B',
  'pe_r

In [12]:
len(data)

402

In [71]:
stocks_df = (
    pd
    .DataFrame(data)
    .apply(lambda col: col.str.strip() if col.dtype == "object" else col)
    #.price.str.extract(r"([^0-9.])", expand=False).unique()
    .assign(
        price=lambda df_: pd.to_numeric(df_.price.str.replace(",", "")),
        change=lambda df_: pd.to_numeric(df_.change.str.replace("+", "").str.replace(",", "")),
        volume=lambda df_: pd.to_numeric(df_.volume.str.replace(",", "").str.replace("M", "")),
        market_cap=lambda df_: df_.market_cap.apply(lambda val: float(val.replace("B", "")) if "B" in val else float(val.replace("T","")) * 1000),
        pe_ratio=lambda df_: df_.pe_ratio.replace("-", np.nan).str.replace(",","").pipe(lambda col: pd.to_numeric(col))
    )
    .rename(columns={
        "price": "price_usd",
        "vokume": "volume_M",
        "market_cap":"market_cap_B"
    })
)

stocks_df#.pe_ratio.str.extract(r"([^0-9.])", expand=False).unique()

Unnamed: 0,name,symbols,price_usd,change,volume,market_cap_B,pe_ratio
0,JPMorgan Chase & Co.,JPM,227.11,-7.23,16.280,632.047,11.49
1,Wells Fargo & Company,WFC,63.11,-3.22,25.380,206.064,11.75
2,Ford Motor Company,F,9.14,-0.36,163.356,36.346,6.26
3,"Lucid Group, Inc.",LCID,2.52,-0.03,125.447,7.639,
4,Palantir Technologies Inc.,PLTR,88.59,-3.42,123.826,207.775,466.26
...,...,...,...,...,...,...,...
397,"General Mills, Inc.",GIS,57.17,-0.29,5.030,31.306,12.56
398,Hormel Foods Corporation,HRL,29.51,-0.20,5.016,16.228,21.38
399,"Stanley Black & Decker, Inc.",SWK,58.68,-5.54,5.009,9.068,31.05
400,"Netflix, Inc.",NFLX,921.17,-24.30,5.109,394.037,46.50


In [None]:
#!pip install openpyxl

In [72]:
stocks_df.to_excel("yahoo-stocks-data.xlsx", index=False)