In [40]:
#API Usage

import os
import time
from datetime import datetime, timedelta
import requests
import pandas as pd
import yfinance as yf
from pathlib import Path
from dotenv import load_dotenv

#Finding path for saving files
DATA_RAW = Path("./data/raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

#Ticker and API setups
ticker = "AAPL"
now = datetime.now()
timeformat = now.strftime("%Y%m%d-%H%M")
load_dotenv()
FINNHUB_KEY = os.getenv("FINNHUB_API") 
#Alert if needed
if not FINNHUB_KEY:
    raise RuntimeError("Please set your FINNHUB_KEY environment variable.")
#Time setup - 2 days of financial data for Apple
now = int(time.time())
two_days_ago = now - 2*24*60*60
#Try to use FINNHUB first (actually it will not work, as even 2days of data require subscription)
#However, same-day data is within the free-tier
#But I can easily replace it with yfinance
try: 
    source = "finnhub"
    url = "https://finnhub.io/api/v1/stock/candle"
    params = {
        "ticker": ticker,
        "resolution": "D", 
        "from": two_days_ago,
        "to": now,
        "token": FINNHUB_KEY
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    
    if js.get("s") != "ok":
        raise ValueError(f"Unexpected response: {js}")
    
    # build dataframe
    df = pd.DataFrame({
        "date": pd.to_datetime(js["t"], unit="s"),
        "open": js["o"],
        "high": js["h"],
        "low": js["l"],
        "close": js["c"],
        "volume": js["v"],
    })
    
    print(df)
#In the case of FINNHUB not working, I will be using yfinance
except:
    source = "yfinance"
    end_date = datetime.today().strftime("%Y-%m-%d")
    start_date = (datetime.today() - timedelta(days=2)).strftime("%Y-%m-%d")
    
    # Download data
    df = yf.download(ticker, start=start_date, end=end_date) 
    
    # Reset index to make DateTime a column
    df.reset_index(inplace=True)
    
    print(df.head())

#Save to CSV in the end
output_file = DATA_RAW / f"api_{source}_{ticker}_{timeformat}_2days.csv"
df.to_csv(output_file, index=False)
print(f"Saved CSV to {output_file}")

#Eventually, yfinance (instead of FINNHUB) produced the output

  df = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed

Price        Date       Close        High         Low        Open    Volume
Ticker                   AAPL        AAPL        AAPL        AAPL      AAPL
0      2025-08-14  232.779999  235.119995  230.850006  234.059998  51916300
1      2025-08-15  231.589996  234.279999  229.339996  234.000000  56010500
Saved CSV to data\raw\api_yfinance_AAPL_20250816-1726_2days.csv





In [41]:
#Scraping

import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path

#Finding path for saving files similarly
DATA_RAW = Path("data/raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

#Fetching (in this part I kind of asked for help from GPT to let me know what are
#good websites for scraping data, as I am not sure of that)
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"
headers = {"User-Agent": "Mozilla/5.0"}  
response = requests.get(url, headers=headers)
response.raise_for_status()  # will raise error if request failed

soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table", {"class": "wikitable"})

#I used some GPT to assist me in scraping efficiently and in a stable way
#Otherwise I really do not know how to scrape the right table
header_row = table.find("tr")
columns = [th.get_text(strip=True) for th in header_row.find_all("th")]

#Extracting data
rows = table.find_all("tr")[1:] 
data = []
for row in rows:
    cols = row.find_all(["th", "td"])
    cols_text = [c.get_text(strip=True) for c in cols]
    # Pad or truncate row to match header length
    if len(cols_text) < len(columns):
        cols_text += [""] * (len(columns) - len(cols_text))
    elif len(cols_text) > len(columns):
        cols_text = cols_text[:len(columns)]
    data.append(cols_text)

#Building dataframe and columns for storing data
df = pd.DataFrame(data, columns=columns)
numeric_cols = ["Revenue in USD billions", "Profit in USD billions", "Employees"]
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col].str.replace(",", "").str.replace("–", ""), errors="coerce")

#Save to CSV in the end
output_file = DATA_RAW / "largest_companies_scraped.csv"
df.to_csv(output_file, index=False)
print(f"Saved CSV to {output_file}")
print(df.head())


Saved CSV to data\raw\largest_companies_scraped.csv
               Ranks                             Name  \
0  USD (in millions)                                    
1                  1                          Walmart   
2                  2                           Amazon   
3                  3  State Grid Corporation of China   
4                  4                     Saudi Aramco   

                       Industry   Revenue    Profit  Employees  \
0                                                          NaN   
1                        Retail  $680,985   $19,436  2100000.0   
2  Retailinformation technology  $637,959   $59,248  1556000.0   
3                   Electricity  $545,948    $9,204  1361423.0   
4                   Oil and gas  $480,446  $106,246    73311.0   

  Headquarters[note 1] State-owned Ref.  
0                                        
1        United States              [1]  
2                              [4]       
3                China              [5] 

In [None]:
#API
#Data source: FINNHUB or Yahoo Finance
#URL: https://finnhub.io/api/v1/stock/candle or https://finance.yahoo.com/quote/AAPL/history/
#params: Date, Close, High, Low, Open, Volume
#Potential problems include: 
#1. subscription might be needed in future (like FINNHUB)
#2. URL might change and require updates
#3. while API-formatted data sources usually have more consistent format, 
# change of format/rubrics might still be a risk

#Scraping
#Data source: Wikipedia
#URL: https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue
#params: Ranks, Name, Industry, Revenue, Profit, Employees, Headquarters, State-owned, Ref.
#Potential problems include: 
#1. Inconsistent format and might consistently require the programmer to modify codes
#2. Potential change in policies regarding scraping
#3. The extent to which the data is allowed to be used might be unclear or not as clearly stated as most commercial API data sources