<a href="https://colab.research.google.com/github/chaotingchong-crypto/programming-language/blob/main/Hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install requests beautifulsoup4 pandas scikit-learn google-generativeai gradio





In [34]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import google.generativeai as genai
import gradio as gr



In [35]:
def crawl_gnn(url, timeout=10):
    """Download one GNN news page and extract its title and body text.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        dict with two string entries: ``"title"`` is the stripped text of the
        first ``<h1>`` (or ``"無標題"`` when the page has none); ``"content"``
        is every non-empty ``<p>`` under ``.GN-lbox3B`` joined by single
        spaces (an empty string when the selector matches nothing).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as news.
    res.raise_for_status()
    # Force UTF-8 decoding rather than trusting the guessed encoding.
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find("h1")
    title = heading.text.strip() if heading is not None else "無標題"

    # Strip each paragraph once, then drop the ones that end up empty.
    paragraphs = (p.text.strip() for p in soup.select(".GN-lbox3B p"))
    content = " ".join(text for text in paragraphs if text)

    return {"title": title, "content": content}


In [36]:
def tfidf_analysis(texts, top_n=20):
    """Rank terms across ``texts`` by their summed TF-IDF weight.

    Args:
        texts: Iterable of document strings.
        top_n: Maximum number of terms kept by the vectorizer
            (passed as ``max_features``).

    Returns:
        pandas.DataFrame with columns ``term`` and ``score``, sorted by
        ``score`` in descending order. When ``texts`` is empty or every
        document is blank, an empty frame with the same two columns is
        returned instead of raising.

    Raises:
        ValueError: Propagated from scikit-learn when non-blank documents
            still yield no usable terms.
    """
    # Materialise one-shot iterables so the blank check below does not consume
    # them; a bare string is passed through so scikit-learn can reject it.
    docs = texts if isinstance(texts, str) else list(texts)

    # Nothing to analyse: return an empty, correctly-shaped result rather than
    # letting the vectorizer fail with an "empty vocabulary" error.
    if not any(doc and doc.strip() for doc in docs):
        return pd.DataFrame({
            "term": pd.Series(dtype="object"),
            "score": pd.Series(dtype="float64"),
        })

    # NOTE(review): with the default word analyzer, unsegmented Chinese text is
    # presumably not split into individual words, and these single-character
    # stop words likely never match a token -- confirm against the
    # TfidfVectorizer token_pattern docs; a word segmenter may be needed.
    vectorizer = TfidfVectorizer(max_features=top_n, stop_words=["的", "是", "在", "了", "與", "和"])
    tfidf_matrix = vectorizer.fit_transform(docs)
    words = vectorizer.get_feature_names_out()
    # Sum each term's weight over all documents to get one score per term.
    scores = tfidf_matrix.toarray().sum(axis=0)
    df = pd.DataFrame({"term": words, "score": scores}).sort_values(by="score", ascending=False)
    return df


In [37]:
def generate_insights(text, api_key, model="gemini-1.5-flash", max_chars=4000):
    """Ask a Gemini model for key insights and a short conclusion on ``text``.

    Args:
        text: Article body to summarise.
        api_key: Gemini API key passed to ``genai.configure``.
        model: Name of the generative model to instantiate.
        max_chars: Only the first ``max_chars`` characters of ``text`` are
            embedded in the prompt.

    Returns:
        The model's response text with surrounding whitespace stripped.

    Raises:
        ValueError: If ``text`` or ``api_key`` is empty or whitespace-only.
            Checked before any API call so no request is made for input that
            cannot produce a meaningful summary.
    """
    if not text or not text.strip():
        raise ValueError("文章內容為空，無法生成洞察摘要")
    if not api_key or not api_key.strip():
        raise ValueError("缺少 Gemini API Key")

    genai.configure(api_key=api_key)
    prompt = f"""
請根據以下文章生成：
1️⃣ 五句重點洞察摘要
2️⃣ 一段約 120 字的結論（繁體中文）
---
{text[:max_chars]}
"""
    # Use a separate local name: rebinding the `model` parameter (a name
    # string) to a client object made the variable mean two different things.
    llm = genai.GenerativeModel(model)
    response = llm.generate_content(prompt)
    # NOTE(review): accessing response.text may itself raise when the response
    # carries no text part (e.g. a blocked prompt) -- confirm with the SDK docs.
    return response.text.strip()


In [38]:
def full_pipeline(url, api_key, top_n=20):
    """Crawl one article, analyse it, export CSV files and build the UI outputs.

    Steps: fetch the article with ``crawl_gnn``, compute TF-IDF term scores
    with ``tfidf_analysis``, request a summary with ``generate_insights``,
    then write ``GNN_新聞內容.csv`` and ``GNN_熱詞統計.csv`` to the current
    working directory.

    Args:
        url: Article page address.
        api_key: Gemini API key forwarded to ``generate_insights``.
        top_n: Maximum number of terms in the TF-IDF table.

    Returns:
        Tuple ``(msg, news_markdown, tfidf_markdown, insights)``: a status
        message, the article table and the term table rendered as Markdown,
        and the generated summary text.

    Raises:
        ValueError: If no article body could be extracted from ``url``.
    """
    article = crawl_gnn(url)
    # An empty body means the page layout did not match the crawler's
    # selector; stop here with a clear message instead of failing later inside
    # the vectorizer with an unrelated-looking error of the same type.
    if not article["content"].strip():
        raise ValueError("無法從該網址擷取新聞內文，請確認網址是否為 GNN 新聞頁面")

    data = [article]
    df = pd.DataFrame(data)

    # TF-IDF term ranking over the crawled article(s).
    tfidf_df = tfidf_analysis([d["content"] for d in data], top_n=top_n)

    # Summary generated by Gemini.
    insights = generate_insights(article["content"], api_key)

    # Export both tables; utf-8-sig adds a BOM so spreadsheet tools detect UTF-8.
    df.to_csv("GNN_新聞內容.csv", index=False, encoding="utf-8-sig")
    tfidf_df.to_csv("GNN_熱詞統計.csv", index=False, encoding="utf-8-sig")

    msg = "✅ 分析完成！已匯出 GNN_新聞內容.csv 與 GNN_熱詞統計.csv"
    # DataFrame.to_markdown depends on the optional `tabulate` package.
    return msg, df.to_markdown(index=False), tfidf_df.to_markdown(index=False), insights



In [39]:
def run_all(url, api_key):
    """Gradio callback: run the whole pipeline for the values typed in the UI.

    Args:
        url: Article address from the first textbox.
        api_key: Gemini API key from the second textbox.

    Returns:
        Tuple ``(msg, news_table, tfidf_table, insights)`` exactly as returned
        by ``full_pipeline``, one element per output component.

    Raises:
        gr.Error: If an input is blank or the pipeline fails, so the reason
            is shown in the web UI instead of a generic error.
    """
    # Pasted values often carry stray spaces or newlines.
    url = (url or "").strip()
    api_key = (api_key or "").strip()
    if not url or not api_key:
        raise gr.Error("請輸入 GNN 新聞網址與 Gemini API Key")

    try:
        return full_pipeline(url, api_key)
    except Exception as exc:
        # Re-raise as gr.Error so the message reaches the user; the original
        # exception stays chained for debugging.
        raise gr.Error(f"分析失敗：{exc}") from exc

# Build the web UI: the two textboxes feed run_all, and its four return values
# fill the four output boxes in order.
demo = gr.Interface(
    fn=run_all,
    inputs=[
        gr.Textbox(label="📰 GNN 新聞網址", value="https://gnn.gamer.com.tw/detail.php?sn=294468"),
        # SECURITY: never put a real API key in a label or placeholder -- both
        # are visible to anyone who opens the notebook or the shared app.
        # The user supplies the key at runtime; any key that was ever
        # committed in this file should be revoked and regenerated.
        gr.Textbox(label="🤖 Gemini API Key", type="password", placeholder="請貼上你的 Gemini API Key"),
    ],
    outputs=[
        gr.Textbox(label="📤 狀態訊息"),
        gr.Textbox(label="🕸️ 爬蟲新聞內容（Markdown 表格）"),
        gr.Textbox(label="🔥 熱詞統計（TF-IDF）"),
        gr.Textbox(label="💡 Gemini 洞察摘要"),
    ],
    title="🎮 GNN 新聞 AI 自動分析系統",
    description="輸入 GNN 新聞網址 → 自動爬取內容 → TF-IDF 熱詞分析 → Gemini 生成洞察摘要 → 匯出結果"
)

# Start the app server for the interface defined above.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://888218b4d7e55783f7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


