In [2]:
# 天气与乒乓球运动决策
import math
from collections import defaultdict, Counter

# ======================== 1. 创建模拟天气数据集（乒乓球场景） ========================
data = [
    ('Sunny', 'High', 'Strong', 'No'),
    ('Sunny', 'Low', 'Weak', 'Yes'),
    ('Rainy', 'High', 'Strong', 'No'),
    ('Rainy', 'Low', 'Weak', 'Yes'),
    ('Cloudy', 'High', 'Strong', 'No'),
    ('Cloudy', 'Low', 'Weak', 'Yes'),
    ('Sunny', 'High', 'Strong', 'No'),
    ('Rainy', 'Low', 'Weak', 'Yes'),
    ('Cloudy', 'High', 'Strong', 'No'),
    ('Sunny', 'Low', 'Weak', 'Yes')
]
features = ['Temperature', 'Humidity', 'Wind']  # 特征名称

# 表格化展示原始数据
print("=== 1. 原始天气数据集（乒乓球场景） ===")
print(f"{'温度':<8}{'湿度':<6}{'风力':<8}{'是否打乒乓球'}")
print("-" * 35)
for row in data:
    print(f"{row[0]:<8}{row[1]:<6}{row[2]:<8}{row[3]}")

# ======================== 2. 数据预处理（分类变量转数值） ========================
def encode_feature(values):
    """手动编码分类特征为数值"""
    unique_vals = list(set(values))
    return {v: i for i, v in enumerate(unique_vals)}, [unique_vals.index(v) for v in values]

# 提取各特征和标签的取值
temp_vals = [row[0] for row in data]
hum_vals = [row[1] for row in data]
wind_vals = [row[2] for row in data]
play_vals = [row[3] for row in data]

# 编码特征与标签
temp_map, temp_encoded = encode_feature(temp_vals)
hum_map, hum_encoded = encode_feature(hum_vals)
wind_map, wind_encoded = encode_feature(wind_vals)
play_map, play_encoded = encode_feature(play_vals)

# 编码后的数据
encoded_data = list(zip(temp_encoded, hum_encoded, wind_encoded, play_encoded))

# 展示编码结果与映射
print("\n=== 2. 编码后的数据集（数值化） ===")
print(f"{'温度':<6}{'湿度':<6}{'风力':<6}{'是否打乒乓球'}")
print("-" * 28)
for row in encoded_data:
    print(f"{row[0]:<6}{row[1]:<6}{row[2]:<6}{row[3]}")

print("\n=== 3. 编码映射关系 ===")
print(f"温度: {temp_map}")
print(f"湿度: {hum_map}")
print(f"风力: {wind_map}")
print(f"是否打乒乓球: {play_map}")

# ======================== 3. 划分训练集/测试集 ========================
train_data = encoded_data[:8]  # 前8条为训练集
test_data = encoded_data[8:]   # 后2条为测试集
X_train = [row[:3] for row in train_data]
y_train = [row[3] for row in train_data]
X_test = [row[:3] for row in test_data]
y_test = [row[3] for row in test_data]

print("\n=== 4. 数据集划分结果 ===")
print(f"训练集样本数: {len(X_train)} | 测试集样本数: {len(X_test)}")

# ======================== 4. 实现简易决策树（替代随机森林） ========================
def calculate_entropy(labels):
    """计算信息熵（衡量数据混乱度）"""
    count = Counter(labels)
    entropy = 0.0
    total = len(labels)
    for cnt in count.values():
        prob = cnt / total
        entropy -= prob * math.log2(prob)
    return entropy

def split_data(data, feature_idx):
    """按指定特征划分数据集"""
    groups = defaultdict(list)
    for row in data:
        groups[row[feature_idx]].append(row)
    return groups

def choose_best_feature(data):
    """选择信息增益最大的特征作为划分节点"""
    base_entropy = calculate_entropy([row[-1] for row in data])
    best_gain = 0.0
    best_idx = -1
    for idx in range(len(data[0]) - 1):
        groups = split_data(data, idx)
        new_entropy = 0.0
        for group in groups.values():
            prob = len(group) / len(data)
            new_entropy += prob * calculate_entropy([row[-1] for row in group])
        gain = base_entropy - new_entropy
        if gain > best_gain:
            best_gain = gain
            best_idx = idx
    return best_idx

def majority_vote(labels):
    """多数投票法（处理叶节点）"""
    return Counter(labels).most_common(1)[0][0]

def build_tree(data):
    """递归构建决策树"""
    labels = [row[-1] for row in data]
    # 终止条件1：所有标签相同
    if len(set(labels)) == 1:
        return labels[0]
    # 终止条件2：无特征可划分
    if len(data[0]) == 1:
        return majority_vote(labels)
    # 选择最优特征
    best_idx = choose_best_feature(data)
    # 构建树结构
    tree = {best_idx: {}}
    groups = split_data(data, best_idx)
    for val, group in groups.items():
        reduced_group = [row[:best_idx] + row[best_idx+1:] for row in group]
        tree[best_idx][val] = build_tree(reduced_group)
    return tree

def predict(tree, sample):
    """用决策树预测样本"""
    if not isinstance(tree, dict):
        return tree
    root_idx = list(tree.keys())[0]
    sample_val = sample[root_idx]
    # 若特征值不在树中，用多数投票补全
    if sample_val not in tree[root_idx]:
        return majority_vote([row[-1] for row in train_data])
    return predict(tree[root_idx][sample_val], sample)

# 训练决策树
tree = build_tree(train_data)
print("\n=== 5. 决策树模型训练完成 ===")

# ======================== 5. 模型评估 ========================
y_pred = [predict(tree, x) for x in X_test]
accuracy = sum(1 for p, t in zip(y_pred, y_test) if p == t) / len(y_test)

print("\n=== 6. 模型评估结果 ===")
print(f"测试集准确率: {accuracy:.2f}")

# ======================== 6. 自定义天气预测（答辩演示） ========================
print("\n=== 7. 自定义天气预测演示（乒乓球场景） ===")
custom_weather = ('Sunny', 'Low', 'Weak')
# 编码自定义输入
custom_encoded = [
    temp_map[custom_weather[0]],
    hum_map[custom_weather[1]],
    wind_map[custom_weather[2]]
]
# 预测
custom_pred = predict(tree, custom_encoded)
# 解码为原始标签
pred_label = [k for k, v in play_map.items() if v == custom_pred][0]

print(f"自定义天气条件: {custom_weather}")
print(f"是否适合打乒乓球: {pred_label}")

=== 1. 原始天气数据集（乒乓球场景） ===
温度      湿度    风力      是否打乒乓球
-----------------------------------
Sunny   High  Strong  No
Sunny   Low   Weak    Yes
Rainy   High  Strong  No
Rainy   Low   Weak    Yes
Cloudy  High  Strong  No
Cloudy  Low   Weak    Yes
Sunny   High  Strong  No
Rainy   Low   Weak    Yes
Cloudy  High  Strong  No
Sunny   Low   Weak    Yes

=== 2. 编码后的数据集（数值化） ===
温度    湿度    风力    是否打乒乓球
----------------------------
1     0     0     0
1     1     1     1
0     0     0     0
0     1     1     1
2     0     0     0
2     1     1     1
1     0     0     0
0     1     1     1
2     0     0     0
1     1     1     1

=== 3. 编码映射关系 ===
温度: {'Rainy': 0, 'Sunny': 1, 'Cloudy': 2}
湿度: {'High': 0, 'Low': 1}
风力: {'Strong': 0, 'Weak': 1}
是否打乒乓球: {'No': 0, 'Yes': 1}

=== 4. 数据集划分结果 ===
训练集样本数: 8 | 测试集样本数: 2

=== 5. 决策树模型训练完成 ===

=== 6. 模型评估结果 ===
测试集准确率: 1.00

=== 7. 自定义天气预测演示（乒乓球场景） ===
自定义天气条件: ('Sunny', 'Low', 'Weak')
是否适合打乒乓球: Yes
