# CNBC AND YAHOO CRAWLING (BY API)

In [1]:
!pip install selenium
!pip install newsapi-python



In [2]:
# 匯入所需的套件
import pandas as pd  # 用於資料處理和儲存
from bs4 import BeautifulSoup  # 用於解析 HTML 網頁內容
from datetime import datetime  # 用於取得當前日期和時間
from selenium import webdriver  # 用於自動化瀏覽器操作
from selenium.webdriver.common.by import By  # 用於定位網頁元素
from selenium.webdriver.chrome.options import Options  # 用於設定 Chrome 瀏覽器選項
from selenium.webdriver.support.ui import WebDriverWait  # 用於等待網頁元素載入
from selenium.webdriver.support import expected_conditions as EC  # 用於設定等待條件

In [3]:
def setup_driver():
    options = Options()
    options.add_argument('--headless')  # 啟用無頭模式，不顯示瀏覽器視窗
    options.add_argument('window-size=800x600')  # 設定瀏覽器視窗大小
    prefs = {"profile.managed_default_content_settings.images": 2}  # 禁止載入圖片，加快爬取速度
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(options=options)  # 初始化 Chrome 瀏覽器
    return driver

# 初始化一個空的列表來儲存新聞標題
title_list = []

In [4]:
def Yahoo():
    # -------------------- Yahoo Finance 新聞抓取 --------------------

    # 設置 Selenium 瀏覽器並打開 Yahoo Finance 的 Tesla 新聞頁面
    driver = setup_driver()
    yahoo_url = "https://finance.yahoo.com/quote/TSLA/news"
    driver.get(yahoo_url)  # 載入頁面

    # 取得頁面源代碼並使用 BeautifulSoup 解析
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # 尋找包含新聞的主要 div 區塊
    news_div = soup.find("ul", class_="stream-items yf-1usaaz9").find_all("li", class_="stream-item story-item yf-1usaaz9")

    # 迭代每個新聞項目，提取標題
    for news_item in news_div:
        try:
            # 嘗試找到標題的 h3 標籤並提取文字
            title = news_item.find("h3", class_="clamp yf-18q3fnf").text
            title_list.append(title.strip())  # 去除前後空白並加入列表
        except AttributeError:
            print("err finding" + news_item.text)
            # 如果找不到標題，跳過此新聞項目
            continue

    # 關閉 Yahoo Finance 的瀏覽器實例
    driver.quit()
#Yahoo()

In [5]:
def CNBC():
    # -------------------- CNBC 新聞抓取 --------------------

    # 設置 Selenium 瀏覽器並打開 CNBC 的 Tesla 搜尋結果頁面
    driver = setup_driver()
    cnbc_url = "https://www.cnbc.com/search/?query=tesla&qsearchterm=tesla"
    driver.get(cnbc_url)  # 載入頁面

    # 使用 WebDriverWait 等待搜尋結果容器載入完成
    wait = WebDriverWait(driver, 60)  # 最多等待 60 秒
    search_results = wait.until(EC.visibility_of_element_located((By.CLASS_NAME, "SearchResults-searchResultsContainer")))

    # 取得頁面源代碼並使用 BeautifulSoup 解析
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # 尋找包含搜尋結果的主要 div 區塊
    news_div_cnbc = soup.find("div", class_="SearchResults-searchResultsContainer").find("div", id="searchcontainer").find_all("div")

    # 迭代每個新聞項目，提取標題
    for news_item in news_div_cnbc:
        try:
            # 嘗試找到標題的 span 標籤並提取文字
            title = news_item.find("div", class_="SearchResult-searchResultTitle").find("a").find("span").text
            title_list.append(title.strip())  # 去除前後空白並加入列表
        except AttributeError:
            # 如果找不到標題，跳過此新聞項目
            continue

    # 關閉 CNBC 的瀏覽器實例
    driver.quit()
#CNBC()

# code starts from here

In [7]:
import pandas as pd
from datetime import datetime
from newsapi.newsapi_client import NewsApiClient
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
nltk.download('punkt')

sia = SentimentIntensityAnalyzer()
newsapi = NewsApiClient(api_key='e862efc88eeb445a9f54d97d9174ac69')
title_list = []

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
def SaveTitles():
    global dates, csv_filename
    # -------------------- 去除重複標題並儲存到 CSV --------------------

    # 去除列表中的重複標題
    i = 0
    while i < len(title_list):
        if title_list.count(title_list[i]) > 1:
            title_list.pop(i)
            dates.pop(i)
        else:
            i+=1
    # unique_titles = list(dict.fromkeys(title_list))

    # 創建一個 Pandas DataFrame，包含當前日期和新聞標題
    df = pd.DataFrame({
        "Date": dates,  # 格式化當前日期為 YYYY-MM-DD
        "News_Title": title_list,  # 新聞標題列表
        "source": sources
    })

    # 將 DataFrame 儲存為 CSV 文件，文件名稱包含當前日期
    csv_filename = f"News_Title_{datetime.now().strftime('%Y-%m-%d-%H%M%S')}.csv"
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')  # 不儲存索引，使用 UTF-8 編碼

    print(f"成功儲存新聞標題到 {csv_filename}")
# SaveTitles()

In [42]:
def getNews(domain, keyword, date):
    global articles, title_list, dates, sources
    
    articles = newsapi.get_everything(
        q=keyword,
        domains=domain,
        from_param=date,
        to=date,
        language='en',
        sort_by='publishedAt',
        page_size=100
    )
    
    for article in articles['articles']:
        title = article['title']
        description = article['description']
        text = (title or '') + ' ' + (description or '')
        dates.append(article['publishedAt'][:10])
        title_list.append(text.strip())
    

In [44]:
def AnalyzeSentiment(newsFileName, newNewsFileName):
    sentiments = []
    newsFile = pd.read_csv(newsFileName)

    for i, news in newsFile.iterrows():
        title = str(news["News_Title"])
        score = sia.polarity_scores(title)['compound']
        sentiments.append(sia.polarity_scores(title)['compound'])
        print(title)
        print("sentiment score: ", score)
        
    pd.concat([newsFile, pd.DataFrame({'Score':sentiments})], axis=1).to_csv(newNewsFileName, header=True, index=False)
    print("file output to " + newNewsFileName)

In [46]:
def clear():
    global dates, title_list, sources
    dates = []
    title_list = []
    sources = []
clear()

In [48]:
from datetime import datetime, timedelta
# Get today's date
today = datetime.today()
# Start date: 30 days ago
start_date = today - timedelta(days=30)
# Iterate through dates
date_list = [(start_date + timedelta(days=i)).strftime('%Y-%m-%d') for i in range(31)]
keyword = input('Input some stocks name: ')
# Print the dates
for date in date_list:
    getNews('cnbc.com', keyword, date)
    getNews('yahoo.com', keyword, date)
SaveTitles()


Input some stocks name:  tesla


NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

# Twitter crawling

In [52]:
!pip install requests
!pip install requests-oauthlib

Collecting requests-oauthlib
  Using cached requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting oauthlib>=3.0.0 (from requests-oauthlib)
  Using cached oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)
Using cached requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)
Using cached oauthlib-3.2.2-py3-none-any.whl (151 kB)
Installing collected packages: oauthlib, requests-oauthlib
Successfully installed oauthlib-3.2.2 requests-oauthlib-2.0.0


In [None]:
import twitter

api = twitter.Api(consumer_key='你的consumer_key',
   consumer_secret='你的consumer_secret',
   access_token_key='你的access_token_key',
   access_token_secret='你的access_token_secret')
  
docs = api.GetSearch(term='台灣', since='2020-01-01', count=25, result_type='popular', return_json=True)
print(docs)

In [54]:
!pip install tweepy

Collecting tweepy
  Downloading tweepy-4.15.0-py3-none-any.whl.metadata (4.1 kB)
Downloading tweepy-4.15.0-py3-none-any.whl (99 kB)
   ---------------------------------------- 0.0/99.4 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/99.4 kB ? eta -:--:--
   -------- ------------------------------- 20.5/99.4 kB 330.3 kB/s eta 0:00:01
   ------------------------ --------------- 61.4/99.4 kB 544.7 kB/s eta 0:00:01
   ---------------------------------------- 99.4/99.4 kB 636.7 kB/s eta 0:00:00
Installing collected packages: tweepy
Successfully installed tweepy-4.15.0
