<a href="https://colab.research.google.com/github/econ105/AI/blob/main/Python/algotrading/webscraping/stock_filter_volume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import requests

# --- Step 1: Fetch page with headers ---
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0"}  # Pretend to be a browser
response = requests.get(url, headers=headers)
response.raise_for_status()  # Raise error if request fails

# --- Step 2: Parse tables with pandas ---
tables = pd.read_html(response.text)
sp500_table = tables[0]  # First table is the constituents table

# --- Step 3: Extract ticker symbols ---
sp500_tickers = sp500_table['Symbol'].tolist()

# --- Step 4: Fix '.' to '-' for yfinance compatibility ---
sp500_tickers = [ticker.replace('.', '-') for ticker in sp500_tickers]

print(sp500_tickers[:20])  # Show first 20 tickers


  tables = pd.read_html(response.text)


['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A', 'APD', 'ABNB', 'AKAM', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL']


In [4]:
import pandas as pd
import requests
import yfinance as yf

# --- Step 1: Fetch S&P 500 table with headers ---
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0"}  # Pretend to be a browser
response = requests.get(url, headers=headers)
response.raise_for_status()

# --- Step 2: Parse tables ---
tables = pd.read_html(response.text)
sp500_table = tables[0]

# --- Step 3: Clean tickers for yfinance ---
excluded_tickers = ['BRK.B', 'BF.B']  # optional exclusions
tickers = [t.replace('.', '-') for t in sp500_table['Symbol'].tolist() if t not in excluded_tickers]

# --- Step 4: Fetch and filter stocks ---
def fetch_and_filter_stocks(tickers):
    data = []
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        hist = stock.history(period='1mo')
        if hist.empty:
            continue
        avg_volume = hist['Volume'].mean()
        latest_volume = hist['Volume'].iloc[-1]
        if latest_volume >= 1.5 * avg_volume:
            info = stock.info
            pct_volume_gt_avg = (latest_volume - avg_volume) / avg_volume * 100
            data.append({
                'Ticker': ticker,
                'Name': info.get('shortName', 'N/A'),
                'Price': info.get('regularMarketPrice', None),
                'Pct_Volume_GT_Avg': f"{pct_volume_gt_avg:.2f}",
                'PE_Ratio': f"{info.get('trailingPE', float('nan')):.2f}" if info.get('trailingPE') else 'N/A',
                'Market_Cap': f"{info.get('marketCap', 0)/1e6:.2f}M",
                'Dividend_Yield': info.get('dividendYield', None)
            })
    return pd.DataFrame(data)

# --- Step 5: Run and display ---
df_volume = fetch_and_filter_stocks(tickers)
print(df_volume.head())


  tables = pd.read_html(response.text)


  Ticker                             Name   Price Pct_Volume_GT_Avg PE_Ratio  \
0    APD  Air Products and Chemicals, Inc  279.74            115.39      N/A   
1   ABNB                     Airbnb, Inc.  121.35            114.41    30.11   
2   AKAM        Akamai Technologies, Inc.  111.76             79.05    32.77   
3    AEE               Ameren Corporation  110.97             68.16    21.34   
4    AWK  American Water Works Company, I  133.50             83.52    23.46   

  Market_Cap  Dividend_Yield  
0  62285.79M            2.59  
1  72751.02M             NaN  
2  16078.38M             NaN  
3  30672.11M            2.70  
4  26056.15M            2.48  


In [5]:
import requests
import re
from bs4 import BeautifulSoup

def remove_leftmost_zero(code):
    # Remove only the very first '0' if present
    return code[1:] if code.startswith('0') else code

# Fetch the webpage
url = "http://www.aastocks.com/en/stocks/market/index/hk-index-con.aspx"
headers = {
    "User-Agent": "Mozilla/5.0"
}
response = requests.get(url, headers=headers)
response.encoding = response.apparent_encoding

# Parse the HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Find all stock codes matching the pattern (e.g., 00001.HK)
# The stock codes are usually in <a> tags with href containing the code
stock_codes = set()
for a in soup.find_all('a', href=True):
    match = re.search(r'\b0*\d{4,5}\.HK\b', a.text)
    if match:
        stock_codes.add(match.group())

# Remove only the very first '0' from each code
processed_stock_codes = [remove_leftmost_zero(code) for code in stock_codes]

# Sort and print the result
processed_stock_codes.sort()
print(processed_stock_codes)
tickers = processed_stock_codes


['0001.HK', '0002.HK', '0003.HK', '0005.HK', '0006.HK', '0012.HK', '0016.HK', '0027.HK', '0066.HK', '0101.HK', '0175.HK', '0241.HK', '0267.HK', '0285.HK', '0288.HK', '0291.HK', '0300.HK', '0316.HK', '0322.HK', '0386.HK', '0388.HK', '0669.HK', '0688.HK', '0700.HK', '0728.HK', '0762.HK', '0823.HK', '0836.HK', '0857.HK', '0868.HK']


  tables = pd.read_html(response.text)


  Ticker                             Name   Price Pct_Volume_GT_Avg PE_Ratio  \
0    APD  Air Products and Chemicals, Inc  279.74            115.39      N/A   
1   ABNB                     Airbnb, Inc.  121.35            114.41    30.11   
2   AKAM        Akamai Technologies, Inc.  111.76             79.05    32.77   
3    AEE               Ameren Corporation  110.97             68.16    21.34   
4    AWK  American Water Works Company, I  133.50             83.52    23.46   

  Market_Cap  Dividend_Yield  
0  62285.79M            2.59  
1  72751.02M             NaN  
2  16078.38M             NaN  
3  30672.11M            2.70  
4  26056.15M            2.48  


In [10]:
import pandas as pd
import requests
import yfinance as yf

# --- Step 1: Fetch S&P 500 table with headers ---
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0"}  # Pretend to be a browser
response = requests.get(url, headers=headers)
response.raise_for_status()

# --- Step 2: Parse tables ---
tables = pd.read_html(response.text)
sp500_table = tables[0]

# --- Step 3: Clean tickers for yfinance ---
excluded_tickers = ['BRK.B', 'BF.B']  # optional exclusions
tickers = [t.replace('.', '-') for t in sp500_table['Symbol'].tolist() if t not in excluded_tickers]

# --- Step 4: Fetch and filter stocks ---
def fetch_and_filter_stocks(tickers):
    data = []
    for ticker in tickers:
        stock = yf.Ticker(ticker)
        hist = stock.history(period='1mo')
        if hist.empty:
            continue
        avg_volume = hist['Volume'].mean()
        latest_volume = hist['Volume'].iloc[-1]
        if latest_volume >= 1.5 * avg_volume:
            info = stock.info
            pct_volume_gt_avg = (latest_volume - avg_volume) / avg_volume * 100
            data.append({
                'Ticker': ticker,
                'Name': info.get('shortName', 'N/A'),
                'Price': info.get('regularMarketPrice', None),
                'Pct_Volume_GT_Avg': pct_volume_gt_avg,
                'PE_Ratio': info.get('trailingPE', None),
                'Market_Cap': info.get('marketCap', None),
                'Dividend_Yield': info.get('dividendYield', None)
            })
    return pd.DataFrame(data)

# --- Step 5: Run and rank top 20 ---
df_volume = fetch_and_filter_stocks(tickers)

# Format columns
df_volume['Pct_Volume_GT_Avg'] = df_volume['Pct_Volume_GT_Avg'].map(lambda x: f"{x:.2f}")
df_volume['PE_Ratio'] = df_volume['PE_Ratio'].map(lambda x: f"{x:.2f}" if pd.notnull(x) else 'N/A')
df_volume['Market_Cap'] = df_volume['Market_Cap'] / 1e6
df_volume['Market_Cap'] = df_volume['Market_Cap'].map(lambda x: f"{x:.2f}M")

# Sort by % volume > avg and take top 20
df_top20 = df_volume.sort_values(by='Pct_Volume_GT_Avg', ascending=False).head(20)

print(df_top20)


  tables = pd.read_html(response.text)


   Ticker                             Name    Price Pct_Volume_GT_Avg  \
34   NCLH  Norwegian Cruise Line Holdings     21.49             88.67   
18   CPAY                     Corpay, Inc.   337.12             86.31   
15   CTSH  Cognizant Technology Solutions     66.55             85.71   
4     AWK  American Water Works Company, I   133.50             83.52   
19   DXCM                     DexCom, Inc.    70.02             82.93   
30    IRM  Iron Mountain Incorporated (Del   109.83             82.33   
31      J            Jacobs Solutions Inc.   132.79             81.48   
42   VRTX  Vertex Pharmaceuticals Incorpor   491.47             80.45   
44    WTW  Willis Towers Watson Public Lim   287.74             79.06   
2    AKAM        Akamai Technologies, Inc.   111.76             79.05   
29    IQV             IQVIA Holdings, Inc.   166.94             78.57   
5    AMAT          Applied Materials, Inc.   354.91             78.27   
7     AJG        Arthur J. Gallagher & Co.   208.45