In [98]:
#importing 
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

In [99]:
# a function to apply wait ensuring that our page is loaded completely
def page_load_wait(wait,driver):
    title = driver.title
    try:
        wait.until(
            lambda x : x.execute_script('return document.readyState;') == 'complete' 
        )
    except:
        print(f"page {title} not loaded fully")
    else:
        print(f"page {title} loaded successfully")
    

In [100]:
#instantiating driver object
driver = webdriver.Chrome()
driver.maximize_window()

#implementing explicit wait with timeout value as 5 s
wait = WebDriverWait(driver,5)

#loading our page
url = "https://finance.yahoo.com/"
driver.get(url)

#wait for page to get laoded
page_load_wait(wait,driver)

#hovering onto markets menu
actions = ActionChains(driver)
markets = wait.until(
    EC.presence_of_element_located((By.XPATH,'//*[@id="ybar-navigation"]/div/ul/li[3]/a'))
)
actions.move_to_element(markets).perform()

#clicking on trending tickers
tickers = wait.until(
    EC.element_to_be_clickable((By.XPATH,'//*[@id="ybar-navigation"]/div/ul/li[3]/div/ul/li[4]/a'))
)
tickers.click()

#clicking on most active stocks
active = wait.until(
    EC.element_to_be_clickable((By.XPATH,'//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/nav/ul/li[1]/a'))
)
active.click()
page_load_wait(wait,driver)

data = []
#navigating through the stock pages to scrape data
while True:
    #extracting
    wait.until(
        EC.presence_of_element_located((By.XPATH,'//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[2]/div/table'))
    )
    curr_rows = driver.find_elements(By.CSS_SELECTOR,"table tbody tr")
    for row in curr_rows:
        try:
            parameters = row.find_elements(By.TAG_NAME,'td')
            stock = {
                'symbol' : parameters[0].text,
                'name' : parameters[1].text,
                'current_market_price' : parameters[3].text,
                'change' : parameters[4].text,
                'percent_change' : parameters[5].text,
                'volume' : parameters[6].text,
                'avg_volume' : parameters[7].text,
                'market_cap' : parameters[8].text,
                'pe_ratio' : parameters[9].text,
                '52wk_change' : parameters[10].text
            }
            data.append(stock)
        except:
            #due to the page content being dynamically loaded it might throw stale element error which is a drawback of webscraping 
            continue
    
    #checking for further data 
    try:
       next_btn = wait.until(
           EC.element_to_be_clickable((By.XPATH,'//*[@id="nimbus-app"]/section/section/section/article/section[1]/div/div[3]/div[3]/button[3]'))
       )
    except:
        print('all stock data fetched no more pages to view')
        break
    else:
        next_btn.click()



page Yahoo Finance - Stock Market Live, Quotes, Business & Finance News loaded successfully
page Top Trending Stocks: US stocks with the highest interest today - Yahoo Finance loaded successfully
all stock data fetched no more pages to view


### converting data to temporary dataframe to perform cleaning

In [None]:
df = pd.DataFrame(data)

### performing some data cleaning for manual screening observations


In [148]:
df.loc[0,'symbol'] = 'NVDA'

In [166]:
df = df.drop_duplicates(subset=['name'],keep='first')

### changing the data type of columns to appropriate type

In [186]:
df['current_market_price'] = df['current_market_price'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['current_market_price'] = df['current_market_price'].astype(float)


In [194]:
df['change'] = df['change'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['change'] = df['change'].astype(float)


In [244]:
def conv(x):
    sign = x[0]
    x = x[1:-1]
    return float(x) if sign == '+' else -1*float(x)

In [208]:
df['percent_change'] = df['percent_change'].apply(lambda x : conv(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['percent_change'] = df['percent_change'].apply(lambda x : conv(x))


In [216]:
def conv2(x):
    illion = x[-1]
    val = float(x[:-1])
    if illion == 'M':
        val*=1000000
    elif illion == 'B':
        val*=1000000000
    elif illion == 'T':
        val*=1000000000000
    return val
    

In [220]:
df['volume'] = df['volume'].apply(lambda x : conv2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['volume'] = df['volume'].apply(lambda x : conv2(x))


In [228]:
df['avg_volume'] = df['avg_volume'].apply(lambda x : conv2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_volume'] = df['avg_volume'].apply(lambda x : conv2(x))


In [238]:
df['market_cap'] = df['market_cap'].apply(lambda x : conv2(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['market_cap'] = df['market_cap'].apply(lambda x : conv2(x))


In [250]:
df['52wk_change'] = df['52wk_change'].apply(lambda x : conv(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['52wk_change'] = df['52wk_change'].apply(lambda x : conv(x))


In [266]:
def pe_conv(pe):
    if pe == '-':
        return -1 # no profit of company
    if ',' in pe :
        pe = pe.replace(',','')
    return float(pe)

In [270]:
df['pe_ratio'] = df['pe_ratio'].apply(lambda x: pe_conv(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pe_ratio'] = df['pe_ratio'].apply(lambda x: pe_conv(x))


In [272]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 356 entries, 0 to 357
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   symbol                356 non-null    object 
 1   name                  356 non-null    object 
 2   current_market_price  356 non-null    float64
 3   change                356 non-null    float64
 4   percent_change        356 non-null    float64
 5   volume                356 non-null    float64
 6   avg_volume            356 non-null    float64
 7   market_cap            356 non-null    float64
 8   pe_ratio              356 non-null    float64
 9   52wk_change           356 non-null    float64
dtypes: float64(8), object(2)
memory usage: 30.6+ KB


In [276]:
df.head()

Unnamed: 0,symbol,name,current_market_price,change,percent_change,volume,avg_volume,market_cap,pe_ratio,52wk_change
0,NVDA,NVIDIA Corporation,112.69,2.12,1.92,341755000.0,261527000.0,2750000000000.0,38.33,-1.38
1,F,Ford Motor Company,9.9,0.29,3.02,138194000.0,78894000.0,39238000000.0,6.78,-18.25
2,PLTR,Palantir Technologies Inc.,84.9,4.44,5.52,105377000.0,97032000.0,199128000000.0,446.86,-34.95
3,WBA,"Walgreens Boots Alliance, Inc.",11.39,0.79,7.45,104361000.0,28666000.0,9843000000.0,-1.0,-47.12
4,TSLA,"Tesla, Inc.",262.67,-0.78,-0.3,102370000.0,86215000.0,844883000000.0,128.13,-7.76


### Exporting the cleaned data frame

In [281]:
df.to_csv('scraped_data.csv')