In [None]:
'''
Keywords  关键词	Euclidean Distance, Manhattan Distance, Cosine Similarity, Jaccard Index, Word Mover's Distance (WMD), Dynamic Time Warping, Pearson Correlation,
欧氏距离、曼哈顿距离、余弦相似度、杰卡德指数、词移动距离 (WMD)、动态时间规整、皮尔逊相关系数

Spearman Rank Correlation, Term Frequency-Inverse Document Frequency (TF-IDF), Text Analysis, Natural Language Processing (NLP), Information Retrieval
斯皮尔曼等级相关、词频-逆文档频率 (TF-IDF)、文本分析、自然语言处理 (NLP)、信息检索
'''

In [2]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import numpy as np  # NumPy: the workhorse for vector arithmetic

# ----------- Simulated daily returns over 5 trading days -------------
# Each array holds one return per day, in the same day order for both stocks.
stock_a = np.array([0.02, -0.01, 0.03, 0.01, -0.02])  # stock A
stock_b = np.array([0.03, 0.01, 0.02, 0.00, -0.01])  # stock B


In [4]:
'''
Euclidean (L2) distance -- what this cell does:
  * stock_a - stock_b : element-wise difference between the two return vectors.
  * ** 2, then sum    : square every difference and add them up over all days.
  * np.sqrt           : the square root turns that total into a straight-line distance.
A smaller value means the two stocks' daily moves are closer in size.
'''
euclidean_distance = np.sqrt(((stock_a - stock_b) ** 2).sum())
print("Euclidean Distance:", euclidean_distance)


Euclidean Distance: 0.0282842712474619


In [None]:
'''
Manhattan (L1) distance:
take the element-wise difference, apply np.abs, then sum the result.
It is like walking city blocks where you may only move along the grid
(90-degree turns, no diagonals). A smaller distance means a smaller overall
difference, and it reacts less strongly to wild outliers than L2 does.
'''

# Total of the absolute day-by-day gaps between the two return series.
manhattan_distance = np.abs(stock_a - stock_b).sum()
print("Manhattan Distance:", manhattan_distance)


In [None]:
# --------- Cosine Similarity ----------
# What this cell does:
#   * the dot product measures how strongly the two vectors point the same way;
#   * np.linalg.norm gives the length of each vector;
#   * dot product / (product of the lengths) is cos(theta).
# The value lies in [-1, 1]. Close to 1 means a small angle between the vectors,
# which is the usual check for "do the daily moves go in the same direction?".
dot_product = stock_a.dot(stock_b)  # A . B
magnitude_a, magnitude_b = np.linalg.norm(stock_a), np.linalg.norm(stock_b)  # |A|, |B|
cosine_similarity = dot_product / (magnitude_a * magnitude_b)
print("Cosine Similarity:", cosine_similarity)


In [None]:
'''
2.5 Simplified example of applying TF-IDF
Scenario: Suppose we have a small collection of financial news headlines related to a company called "Acme Corp.":
场景：假设我们有一些与“Acme Corp.”公司相关的财经新闻标题：

Goal: We want to use TF-IDF to identify the most important words in each headline and understand the overall sentiment towards Acme Corp.
目标：我们希望使用 TF-IDF 来识别每个标题中最重要的词语，并了解对 Acme Corp 的整体情绪。

TF 就是“词在这篇里蹦几次”

IDF 就是“全语料里它稀不稀有”

TF-IDF 权重高＝这词在当前文章里牛X、在别处稀罕 —— 价值最大


2. 掌握完整流水线
练习把 从原始句子 → 数值矩阵 全流程过一遍：
预处理/分词
向量化
抓词表
查看权重
以后遇见几十万条新闻，换源数据即可复用，省得每次瞎百度。

3. 训练你的“读表”能力
打印出 DataFrame 你能立刻看出：
哪个标题关键词火力最猛（高权重）
哪些标题关键词权重分布相似（主题接近）
学会肉眼扫表，后面写聚类、设情绪阈值时才知道阈值该调在哪。

4. 铺垫后续进阶
余弦相似度：TfidfVectorizer 默认对每一行做 L2 归一化，所以 tfidf_matrix @ tfidf_matrix.T 算出来的就是标题两两之间的余弦相似度，可直接拿去做主题聚类
情绪/分类模型：把 TF-IDF 当输入喂给逻辑回归、SVM
风险雷达：固定风险词表，看它们权重时间序列
没有这一步，你后面所有 NLP 金融因子都是空中楼阁。
'''




In [5]:
'''
Step 1. Tokenization and Preprocessing:
步骤 1.标记化和预处理：
将每个标题分解成单个单词（标记）。
删除停用词（常用词如“the”、“a”、“is”）和标点符号。
将单词转换为小写。

1. Acme Corp. announces record profits, stock surges.   ← 明显利好
2. Acme Corp. faces regulatory scrutiny, shares decline.← 利空
3. Market volatility impacts Acme Corp. earnings.        ← 中性∼负
4. Acme Corp. expands into new markets, analysts optimistic. ← 偏多


| 步 | 干啥             | 目的                      |
| - | -------------- | ----------------------- |
| 1 | Tokenize & 预处理 | 去废词、全小写，让向量更干净          |
| 2 | 计算 TF、IDF      | 词在本条多？全语料稀？             |
| 3 | 乘出 TF-IDF      | 得到“独特又高频”权重             |
| 4 | 转 DataFrame    | 人眼一看就懂谁是大词              |
| 5 | 单点取值           | 验证关键词，如 *profits* 权重大不大 |

'''
import numpy as np                                   # 数值计算
import pandas as pd                                  # 打表漂亮
from sklearn.feature_extraction.text import TfidfVectorizer  # 核心 TF-IDF

# ---------- The four headlines from the course material ----------
# Order matters: row i of the TF-IDF matrix built later corresponds to headlines[i].
headlines = [
    "Acme Corp. announces record profits, stock surges.",         # clearly bullish
    "Acme Corp. faces regulatory scrutiny, shares decline.",      # bearish
    "Market volatility impacts Acme Corp. earnings.",             # neutral to negative
    "Acme Corp. expands into new markets, analysts optimistic.",  # mildly bullish
]




In [7]:
'''
Walk-through using Headline 1: "Acme Corp. announces record profits, stock surges."

Tokenization and preprocessing:

Tokens: ["acme", "corp", "announces", "record", "profits", "stock", "surges"]
'''
# ---------- 1) Build the TF-IDF vectorizer ----------
# stop_words='english' drops common English filler words; tokenisation and
# lower-casing are left at the vectorizer's defaults.
vectorizer = TfidfVectorizer(stop_words='english')

# ---------- 2-3) Learn vocabulary + IDF, then encode the corpus ----------
# fit_transform is fit() followed by transform() on the same corpus in one call.
# The result is a sparse matrix: one row per headline, one column per term.
tfidf_matrix = vectorizer.fit_transform(headlines)

# ---------- 4) Print the learned vocabulary ----------
feature_names = vectorizer.get_feature_names_out()
print("Vocabulary:", feature_names)


Vocabulary: ['acme' 'analysts' 'announces' 'corp' 'decline' 'earnings' 'expands'
 'faces' 'impacts' 'market' 'markets' 'new' 'optimistic' 'profits'
 'record' 'regulatory' 'scrutiny' 'shares' 'stock' 'surges' 'volatility']


In [9]:
# ---------- 5) Turn the matrix into a DataFrame with row/column labels ----------
# Rows are labelled Headline1..HeadlineN in corpus order; columns are the vocabulary terms.
labels = [f"Headline{n}" for n in range(1, len(headlines) + 1)]
df = pd.DataFrame(
    data=tfidf_matrix.toarray(),  # densify the sparse matrix for display
    columns=feature_names,
    index=labels,
)
print("\nTF-IDF 表格（保留三位小数）")
# Last expression of the cell, so the notebook renders the rounded table.
df.round(3)


TF-IDF 表格（保留三位小数）


Unnamed: 0,acme,analysts,announces,corp,decline,earnings,expands,faces,impacts,market,...,new,optimistic,profits,record,regulatory,scrutiny,shares,stock,surges,volatility
Headline1,0.222,0.0,0.425,0.222,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.425,0.425,0.0,0.0,0.0,0.425,0.425,0.0
Headline2,0.222,0.0,0.0,0.222,0.425,0.0,0.0,0.425,0.0,0.0,...,0.0,0.0,0.0,0.0,0.425,0.425,0.425,0.0,0.0,0.0
Headline3,0.245,0.0,0.0,0.245,0.0,0.469,0.0,0.0,0.469,0.469,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.469
Headline4,0.222,0.425,0.0,0.222,0.0,0.0,0.425,0.0,0.0,0.0,...,0.425,0.425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
'''
feature_names lets us address a specific word: to inspect the TF-IDF score of
one term, look up its position in feature_names and use that as the column
index. The snippet below picks the headline of interest (the first one) and the
word of interest ("profits"), locates the matching row and column in the TF-IDF
matrix, reads the score stored at that position and prints it.
'''
# ---------- 6) Look up the weight of 'profits' in the first headline ----------
row_idx = 0  # first headline
col_idx = list(feature_names).index('profits')  # column of the term in the vocabulary
score = tfidf_matrix[row_idx, col_idx]
print(f"\n'profits' 的权重（Headline1）：{score:.4f}")


'profits' 的权重（Headline1）：0.4247
