In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from tqdm import tqdm

# Keyword lexicon for the dictionary-based sentiment check.
# Each entry is matched as a plain substring of the text being scored.

# Phrases counted as positive hits.
positive_keywords = [
    "增长", "超预期", "创新高", "涨停", "增持", "收购",
    "盈利", "创纪录", "扭亏", "战略合作", "投资加码", "高送转",
    "获批", "大幅上调", "份额提升", "中标", "喜报", "成功",
]

# Phrases counted as negative hits.
# NOTE(review): "股东减持" contains "减持", so a text with the longer phrase
# matches both entries and scores two negative hits — confirm this extra
# weight is intended.
negative_keywords = [
    "下滑", "亏损", "减持", "退市", "风险提示", "预警", "被调查",
    "立案", "处罚", "股东减持", "净利润下降", "违约", "收紧",
    "停牌", "监管函", "诉讼", "审计问题", "贬值", "跌停",
]

def keyword_sentiment_boost(text, positive=None, negative=None):
    """Classify *text* by comparing positive and negative keyword hits.

    A keyword counts as one hit when it occurs anywhere in *text*
    (substring test); repeated occurrences of the same keyword still count
    once. The side with strictly more hits wins; a tie, including zero hits
    on both sides, is "Neutral".

    Args:
        text: Text to scan. A value that is not a ``str`` (for example
            ``None`` or a NaN float read from a spreadsheet) cannot contain
            keywords and yields "Neutral" instead of raising ``TypeError``.
        positive: Optional iterable of positive keywords. Defaults to the
            module-level ``positive_keywords``.
        negative: Optional iterable of negative keywords. Defaults to the
            module-level ``negative_keywords``.

    Returns:
        "Positive", "Negative" or "Neutral".
    """
    if not isinstance(text, str):
        return "Neutral"

    # Compare against None explicitly so a caller may pass an empty list.
    if positive is None:
        positive = positive_keywords
    if negative is None:
        negative = negative_keywords

    pos_hits = sum(kw in text for kw in positive)
    neg_hits = sum(kw in text for kw in negative)

    # Hit counts are never negative, so a strict win implies at least one hit.
    if pos_hits > neg_hits:
        return "Positive"
    if neg_hits > pos_hits:
        return "Negative"
    return "Neutral"

# Score news summaries with FinBERT, apply the keyword-dictionary correction,
# and write the enriched table back to Excel.

# File locations and row limit kept in one place so the save call and the
# final message cannot drift apart.
input_path = "匹配新闻摘要.xlsx"
output_path = "股票分析师前1000条打分结果_FinBERT_含词典修正.xlsx"
max_rows = 1000

df = pd.read_excel(input_path)
df = df.head(max_rows).copy()

# NOTE(review): the keyword lists in this file are Chinese, while
# "yiyanghkust/finbert-tone" appears to be an English-language model —
# confirm it suits the language of the "Summary" column.
model_id = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)
# NOTE(review): newer transformers releases reportedly deprecate
# `return_all_scores` in favour of `top_k=None`, which also changes the
# nesting of the returned list — verify against the installed version
# before switching.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

pos_scores = []
neg_scores = []
neutral_scores = []
labels = []
dict_labels = []
final_labels = []

# Iterate over the column values directly instead of `df.loc[i, ...]`, so
# the loop does not depend on the frame having a default 0..n-1 index.
for row_no, raw_text in enumerate(
    tqdm(df["Summary"], total=len(df), desc="FinBERT 情感打分中"), start=1
):
    # Every row starts as "no result"; the values are filled in only when
    # scoring succeeds.
    pos_score = neg_score = neu_score = None
    model_label = dict_label = final_label = None

    # Skip missing or blank summaries: str(NaN) would otherwise be scored
    # as the literal text "nan".
    if not pd.isna(raw_text) and str(raw_text).strip():
        text = str(raw_text)
        try:
            scores = pipe(text, truncation=True, max_length=512, return_all_scores=True)[0]
            score_dict = {s['label'].lower(): s['score'] for s in scores}

            pos_score = round(score_dict.get("positive", 0), 4)
            neg_score = round(score_dict.get("negative", 0), 4)
            neu_score = round(score_dict.get("neutral", 0), 4)

            model_label = max(scores, key=lambda x: x['score'])['label']
            dict_label = keyword_sentiment_boost(text)

            # The dictionary overrides the model only when the model is
            # neutral and the keywords point in a clear direction.
            if model_label.lower() == "neutral" and dict_label != "Neutral":
                final_label = dict_label
            else:
                final_label = model_label
        except Exception as e:
            # Best-effort batch: report the row, discard partial results,
            # and carry on with the next one.
            pos_score = neg_score = neu_score = None
            model_label = dict_label = final_label = None
            print(f"❌ 第{row_no}条失败：{e}")

    # Append exactly once per row, outside the try block, so the six lists
    # always stay the same length as the frame.
    pos_scores.append(pos_score)
    neg_scores.append(neg_score)
    neutral_scores.append(neu_score)
    labels.append(model_label)
    dict_labels.append(dict_label)
    final_labels.append(final_label)

df["PositiveScore"] = pos_scores
df["NeutralScore"] = neutral_scores
df["NegativeScore"] = neg_scores
df["ModelLabel"] = labels
df["DictLabel"] = dict_labels
df["FinalLabel"] = final_labels

df.to_excel(output_path, index=False)
print(f"✅ 情感打分完成，文件保存为：{output_path}")