# Model 1 <>

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense


### V1.0
结构：随机森林+随机森林+随机森林

发现：
1) classifier表现很好，10个随机树时就有很好的分类表现了。
2) progressor表现尚可
3) 预测28年奖牌榜时效果很差，猜测是由于当届参数过多，而预测模型不太行。

In [None]:
class OlympicPredictionModel:
    def __init__(self, lambda_weight=0.3, mu_weight=0.85):
        """
        初始化模型权重
        :param lambda_weight: 小小国权重
        :param mu_weight: 大国权重
        """
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.classifier = RandomForestClassifier(n_estimators=200)
        self.regressor1 = RandomForestRegressor(n_estimators=200)
        self.regressor2 = RandomForestRegressor(n_estimators=200)
        self.lstm_models = {}

    @staticmethod
    def calculate_w4_vector(country_change, athlete_change, event_changes):
        """
        矢量化计算 W4
        """
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3!=0)
        return w4

    def preprocess_data(self, data):
        """
        数据预处理，包括计算 featureA, featureB, featureC
        """
        # Feature A
        score_rate_list = np.array(data['Score_Rate_List'].apply(eval).tolist())
        score_list = np.array(data['Score_List'].apply(eval).tolist(), dtype=np.float64)
        participants_list = np.array(data['Participants_List'].apply(eval).tolist(), dtype=np.float64)

        v5_5_a = np.sum(score_rate_list, axis=1)
        v8_5_a = np.sum(np.divide(score_list, participants_list, 
                                  out=np.zeros_like(score_list, dtype=np.float64), 
                                  where=participants_list > 0), axis=1)

        country_change = np.array(data['CountryChange'].apply(eval).tolist())
        athlete_change = np.array(data['AthleteChange'].apply(eval).tolist())
        event_changes = np.array(data['Event_Changes'].apply(eval).tolist())
        w4 = self.calculate_w4_vector(country_change, athlete_change, event_changes)

        v8_w4 = np.sum(np.multiply(
            np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0),
            w4[:, None]
        ), axis=1)

        features_a = np.column_stack((v5_5_a, v8_5_a, v8_w4))

        # Feature B
        v1_b = data['if_host'].values
        v5_5_b = np.sum(participants_list, axis=1)
        v6_b = data['Gold_prev'].fillna(0).values
        v7_b = data['distance'].values
        v3_5_b = data['Score'].values
        v3_w4_b = np.sum(np.multiply(score_list, w4[:, None]), axis=1)

        features_b = np.column_stack((v1_b, v5_5_b, v6_b, v7_b, v3_5_b, v3_w4_b))

        # Feature C (same as B)
        features_c = features_b.copy()

        # Feature D
        v1_d = v1_b
        v3_5_d = v3_5_b
        v4_5_d =
        v5_5_d = v5_5_b
        v6_d = v6_b
        v7_d = v7_b
        v8_5_d = v8_5_a
        w3_5_d = np.sum(event_changes, axis=1)        
        w4_5_d = np.sum(w4, axis=1)

        labels = (data['Total_Medals'] > 0).astype(int).values
        medal_expectation = data['Total_Medals'].values
        gold_expectation = data['Gold'].values

        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)

        return features_a_scaled, features_b_scaled, features_c_scaled, labels, medal_expectation, gold_expectation

    def train_classifier(self, features_a, other_features, labels):
        """
        训练分类器以预测获奖牌概率
        """
        input_features = np.hstack([
            self.lambda_weight * features_a,
            (1 - self.lambda_weight) * other_features
        ])
        self.classifier.fit(input_features, labels)

    def train_regressor1(self, classifier_output, features_b, medal_expectation):
        """
        训练回归器1预测奖牌期望
        """
        input_features = np.hstack([classifier_output.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, medal_expectation, features_c, gold_expectation):
        """
        训练回归器2预测金牌期望
        """
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def train_lstm(self, features, feature_name):
        """
        使用 LSTM 训练预测给定的特征
        :param features: 历史特征数据
        :param feature_name: 特征名称 (A, B, C)
        """
        X = features[:-1]
        y = features[1:]

        X = X.reshape((X.shape[0], X.shape[1], 1))

        model = Sequential([
            LSTM(50, activation='relu', input_shape=(X.shape[1], 1)),
            Dense(y.shape[1])
        ])
        model.compile(optimizer='adam', loss='mse')
        model.fit(X, y, epochs=50, verbose=0)

        self.lstm_models[feature_name] = model

    def predict_future_features(self, features, feature_name):
        """
        使用 LSTM 模型批量预测未来特征
        :param features: 最新的特征数据
        :param feature_name: 特征名称
        """
        model = self.lstm_models[feature_name]
        features = features.reshape((features.shape[0], features.shape[1], 1))
        return model.predict(features)

    def predict(self, features_a, features_b, features_c):
        """
        预测2028年奥运结果
        """
        input_features = np.hstack([features_a, features_b])
        classifier_output = self.classifier.predict_proba(input_features)[:, 1]
        medal_expectation = self.regressor1.predict(np.hstack([classifier_output.reshape(-1, 1), features_b]))
        gold_expectation = self.regressor2.predict(np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ]))
        return medal_expectation, gold_expectation

def plot_n_estimators_curve(features_a, features_b, labels, n_range=range(10, 500, 20)):
    """
    绘制随机森林在不同树数量下的学习曲线
    """
    import matplotlib.pyplot as plt
    from sklearn.model_selection import cross_val_score
    
    train_scores = []
    cv_scores = []
    input_features = np.hstack([features_a, features_b])
    
    for n in n_range:
        clf = RandomForestClassifier(n_estimators=n, random_state=42)
        # 训练集得分
        clf.fit(input_features, labels)
        train_scores.append(clf.score(input_features, labels))
        # 交叉验证得分
        cv_score = cross_val_score(clf, input_features, labels, cv=5).mean()
        cv_scores.append(cv_score)
        print(f"Trees: {n}, Train Score: {train_scores[-1]:.4f}, CV Score: {cv_scores[-1]:.4f}")
    
    plt.figure(figsize=(10, 6))
    plt.plot(n_range, train_scores, label='Training Score')
    plt.plot(n_range, cv_scores, label='Cross-Validation Score')
    plt.xlabel('Number of Trees')
    plt.ylabel('Score')
    plt.title('Random Forest Performance vs Number of Trees')
    plt.legend()
    plt.grid(True)
    plt.show()

# 示例：加载数据并训练模型
if __name__ == "__main__":
    # 加载历史数据
    data = pd.read_csv("feature_Noc.csv")

    model = OlympicPredictionModel(lambda_weight=0.3, mu_weight=0.85)
    features_a, features_b, features_c, labels, medal_expectation, gold_expectation = model.preprocess_data(data)

    # 训练分类器和回归器
    model.train_classifier(features_a, features_b, labels)
    classifier_output = model.classifier.predict_proba(np.hstack([features_a, features_b]))[:, 1]
    model.train_regressor1(classifier_output, features_b, medal_expectation)
    model.train_regressor2(medal_expectation, features_c, gold_expectation)

    # 交叉验证分类器
    scores_classifier = cross_val_score(model.classifier, np.hstack([features_a, features_b]), labels, cv=5)
    print(f'Classifier CV Accuracy: {np.mean(scores_classifier):.4f}')

    # 交叉验证回归器1
    X_reg1 = np.hstack([classifier_output.reshape(-1, 1), features_b])
    scores_regressor1 = cross_val_score(model.regressor1, X_reg1, medal_expectation, cv=5)
    print(f'Regressor1 CV R^2: {np.mean(scores_regressor1):.4f}')

    # 交叉验证回归器2
    X_reg2 = np.hstack([(1 - model.mu_weight) * medal_expectation.reshape(-1, 1), model.mu_weight * features_c])
    scores_regressor2 = cross_val_score(model.regressor2, X_reg2, gold_expectation, cv=5)
    print(f'Regressor2 CV R^2: {np.mean(scores_regressor2):.4f}')

    # 绘制学习曲线
    print("\n绘制随机森林学习曲线:")
    plot_n_estimators_curve(features_a, features_b, labels)

    # # 使用 LSTM 训练
    # model.train_lstm(features_a, "A")
    # model.train_lstm(features_b, "B")
    # model.train_lstm(features_c, "C")

    # # 批量预测未来特征
    # future_a = model.predict_future_features(features_a, "A")
    # future_b = model.predict_future_features(features_b, "B")
    # future_c = model.predict_future_features(features_c, "C")

    # # 批量预测2028结果
    # medal_expectation, gold_expectation = model.predict(future_a, future_b, future_c)

    # # 输出结果
    # for i, row in data.iterrows():
    #     print(f"Country: {row['NOC_x']}, Medal Expectation: {medal_expectation[i]:.2f}, Gold Expectation: {gold_expectation[i]:.2f}")

### V1.1

结构：随机森林+随机森林+随机森林

改动：调整预测算法：逐项预测；arima。
太麻烦，不改了！

### V2.0
结构不变

改动：调整所有代码为使用前一届数据作为输入，预测当前届。样本选取1964开始。

可视化观察不同树选取的学习曲线。

In [None]:
# version 2
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

class OlympicPredictionModel:
    def __init__(self, lambda_weight=0.3, mu_weight=0.85):
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.classifier = RandomForestClassifier(n_estimators=300)
        self.regressor1 = RandomForestRegressor()
        self.regressor2 = RandomForestRegressor()
        self.lstm_models = {}

    @staticmethod
    def calculate_w4_vector(country_change, athlete_change, event_changes):
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3!=0)
        return w4

    def preprocess_data(self, data):
        """数据预处理,使用前一届数据预测当年"""
        # 过滤1964年之前的数据
        data = data[data['Year'] >= 1964].copy()
        
        # 计算特征A
        score_rate_list = np.array(data['Score_Rate_List'].apply(eval).tolist())
        score_list = np.array(data['Score_List'].apply(eval).tolist(), dtype=np.float64)
        participants_list = np.array(data['Participants_List'].apply(eval).tolist(), dtype=np.float64)

        # 将数据按NOC和Year排序,创建前一届特征
        data = data.sort_values(['NOC_x', 'Year'])
        
        # 计算A特征
        v5_5_a = np.sum(score_rate_list, axis=1)
        v8_5_a = np.sum(np.divide(score_list, participants_list, 
                                  out=np.zeros_like(score_list, dtype=np.float64), 
                                  where=participants_list > 0), axis=1)
        
        country_change = np.array(data['CountryChange'].apply(eval).tolist())
        athlete_change = np.array(data['AthleteChange'].apply(eval).tolist())
        event_changes = np.array(data['Event_Changes'].apply(eval).tolist())
        w4 = self.calculate_w4_vector(country_change, athlete_change, event_changes)
        
        v8_w4 = np.sum(np.multiply(
            np.divide(score_list, participants_list, out=np.zeros_like(score_list), where=participants_list > 0),
            w4[:, None]
        ), axis=1)
        
        features_a = np.column_stack((v5_5_a, v8_5_a, v8_w4))
        
        # 计算B特征
        v1_b = data['if_host'].values
        v5_5_b = np.sum(participants_list, axis=1)
        v6_b = data['Gold_prev'].fillna(0).values
        v7_b = data['distance'].values
        v3_5_b = data['Score'].values
        v3_w4_b = np.sum(np.multiply(score_list, w4[:, None]), axis=1)
        
        features_b = np.column_stack((v1_b, v5_5_b, v6_b, v7_b, v3_5_b, v3_w4_b))
        
        # C特征与B相同
        features_c = features_b.copy()
        
        # 向后移动一期作为预测特征
        features_a = np.roll(features_a, 1, axis=0)
        features_b = np.roll(features_b, 1, axis=0)
        features_c = np.roll(features_c, 1, axis=0)
        
        # 移除第一条记录(无前期数据)
        features_a = features_a[1:]
        features_b = features_b[1:]
        features_c = features_c[1:]
        
        # 对应调整标签
        labels = (data['Total_Medals'] > 0).astype(int).values[1:]
        medal_expectation = data['Total_Medals'].values[1:]
        gold_expectation = data['Gold'].values[1:]
        
        # 标准化
        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)
        
        return features_a_scaled, features_b_scaled, features_c_scaled, labels, medal_expectation, gold_expectation

    def train_classifier(self, features_a, other_features, labels):
        input_features = np.hstack([
            self.lambda_weight * features_a,
            (1 - self.lambda_weight) * other_features
        ])
        self.classifier.fit(input_features, labels)

    def train_regressor1(self, classifier_output, features_b, medal_expectation):
        input_features = np.hstack([classifier_output.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, medal_expectation, features_c, gold_expectation):
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def predict(self, features_a, features_b, features_c):
        input_features = np.hstack([features_a, features_b])
        classifier_output = self.classifier.predict_proba(input_features)[:, 1]
        medal_expectation = self.regressor1.predict(np.hstack([classifier_output.reshape(-1, 1), features_b]))
        gold_expectation = self.regressor2.predict(np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ]))
        return medal_expectation, gold_expectation

def plot_regressor_learning_curves(features, targets, n_range=range(10, 1000, 10), 
                                regressor_name="Regressor"):
    """绘制随机森林回归器的学习曲线"""
    train_scores = []
    cv_scores = [] 
    
    for n in n_range:
        regr = RandomForestRegressor(n_estimators=n, random_state=42)
        regr.fit(features, targets)
        train_scores.append(regr.score(features, targets))
        cv_score = cross_val_score(regr, features, targets, cv=5).mean()
        cv_scores.append(cv_score)
        print(f"{regressor_name} Trees: {n}, Train Score: {train_scores[-1]:.4f}, CV Score: {cv_scores[-1]:.4f}")
    
    plt.figure(figsize=(10, 6))
    plt.plot(n_range, train_scores, label='Training Score')
    plt.plot(n_range, cv_scores, label='Cross-Validation Score')
    plt.xlabel('Number of Trees')
    plt.ylabel('Score')
    plt.title(f'{regressor_name} Performance vs Number of Trees')
    plt.legend()
    plt.grid(True)
    plt.show()

if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv("feature_Noc.csv")
    
    # 初始化模型
    model = OlympicPredictionModel(lambda_weight=0.3, mu_weight=0.85)
    features_a, features_b, features_c, labels, medal_expectation, gold_expectation = model.preprocess_data(data)
    
    # 训练模型
    model.train_classifier(features_a, features_b, labels)
    classifier_output = model.classifier.predict_proba(np.hstack([features_a, features_b]))[:, 1]
    model.train_regressor1(classifier_output, features_b, medal_expectation)
    model.train_regressor2(medal_expectation, features_c, gold_expectation)
    
    # 交叉验证
    input_features = np.hstack([features_a, features_b])
    scores_classifier = cross_val_score(model.classifier, input_features, labels, cv=5)
    print(f'Classifier CV Accuracy: {np.mean(scores_classifier):.4f}')
    
    X_reg1 = np.hstack([classifier_output.reshape(-1, 1), features_b])
    scores_regressor1 = cross_val_score(model.regressor1, X_reg1, medal_expectation, cv=5)
    print(f'Regressor1 CV R^2: {np.mean(scores_regressor1):.4f}')
    
    X_reg2 = np.hstack([(1 - model.mu_weight) * medal_expectation.reshape(-1, 1), model.mu_weight * features_c])
    scores_regressor2 = cross_val_score(model.regressor2, X_reg2, gold_expectation, cv=5)
    print(f'Regressor2 CV R^2: {np.mean(scores_regressor2):.4f}')
    
    # 绘制回归器学习曲线
    print("\n绘制第一层回归器(Total Medals)学习曲线:")
    plot_regressor_learning_curves(X_reg1, medal_expectation, regressor_name="Medal Regressor")
    
    print("\n绘制第二层回归器(Gold Medals)学习曲线:")
    plot_regressor_learning_curves(X_reg2, gold_expectation, regressor_name="Gold Regressor")

### V2.1

结构不变，不做可视化。确定为200层随机树。

In [6]:
# Version 2.1
class OlympicPredictionModel:
    def __init__(self, lambda_weight=0.3, mu_weight=0.85):
        """
        初始化模型权重
        :param lambda_weight: 小小国权重
        :param mu_weight: 大国权重
        """
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.classifier = RandomForestClassifier(n_estimators=200)
        self.regressor1 = RandomForestRegressor(n_estimators=200)
        self.regressor2 = RandomForestRegressor(n_estimators=200)
        self.lstm_models = {}

    @staticmethod

    def Country_classification(year, Noc_x, data):
        """
        判断是否是小小国
        :param year: 当前年份
        :param Noc_x: 国家代码
        :param data: 完整数据集
        :return: True if 小小国, False otherwise
        """
        # 计算该国在该年之前(含该年)的累计参赛人数
        historical_data = data[data['Year'] <= year]
        country_total = historical_data[historical_data['NOC_x'] == Noc_x]['Name_Count'].sum()
        
        # 获取该年所有国家的累计参赛人数
        all_countries = historical_data.groupby('NOC_x')['Name_Count'].sum()
        
        # 计算20%分位点
        threshold = all_countries.quantile(0.2)
        
        return country_total <= threshold


    def calculate_w4_vector(self, country_change, athlete_change, event_changes):
        """
        矢量化计算 W4
        """
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3!=0)
        return w4

    def preprocess_data(self, data):
        """
        数据预处理，包括计算 featureA, featureB, featureC, feature_all，使用参考年份的数据
        """
        # 创建年份映射字典
        year_mapping = pd.read_csv("year_transmition_reference.csv")
        year_dict = dict(zip(year_mapping['evaluating_year'], year_mapping['use_data_from']))
        
        features_list = []
        for idx, row in data.iterrows():
            current_year = row['Year']
            # 获取参考年份数据
            reference_year = year_dict.get(current_year, current_year)
            reference_data = data[data['Year'] == reference_year]
            
            if len(reference_data) > 0:
                # 使用参考年份的特征数据
                ref_row = reference_data[reference_data['NOC_x'] == row['NOC_x']]
                if len(ref_row) > 0:
                    ref_row = ref_row.iloc[0]
                else:
                    ref_row = row  # 如果找不到对应国家，使用当前数据
            else:
                ref_row = row  # 如果找不到参考年份，使用当前数据

            # Feature A
            score_rate_list = np.array(eval(ref_row['Score_Rate_List']))
            score_list = np.array(eval(ref_row['Score_List']), dtype=np.float64)
            participants_list = np.array(eval(ref_row['Participants_List']), dtype=np.float64)

            v4_ = np.sum(score_rate_list)
            v5_ = np.sum(participants_list)
            v8_ = np.sum(np.divide(score_list, participants_list, 
                                  out=np.zeros_like(score_list, dtype=np.float64), 
                                  where=participants_list > 0))

            country_change = np.array(eval(ref_row['CountryChange']))
            athlete_change = np.array(eval(ref_row['AthleteChange']))
            event_changes = np.array(eval(ref_row['Event_Changes']))
            w4_ = self.calculate_w4_vector(country_change.reshape(1,-1), 
                                         athlete_change.reshape(1,-1), 
                                         event_changes.reshape(1,-1))[0]

            v8_w4 = np.sum(np.multiply(
                np.divide(score_list, participants_list, 
                         out=np.zeros_like(score_list, dtype=np.float64), 
                         where=participants_list > 0),
                w4_
            ))

            # Feature B
            v1 = ref_row['if_host']
            v6 = ref_row['Gold_prev'] if pd.notna(ref_row['Gold_prev']) else 0
            v7 = ref_row['distance']
            v3_ = ref_row['Score']
            v3_w4 = np.sum(np.multiply(score_list, w4_))

            # Feature C (same as B)
            
            # Feature all
            w3_ = np.sum(event_changes)

            features = {
                'features_a': [v5_, v8_, v8_w4],
                'features_b': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_c': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_all': [v1, v3_, v4_, v5_, v6, v7, v8_, w3_, w4_]
            }
            features_list.append(features)

        # 将特征列表转换为numpy数组
        features_a = np.array([f['features_a'] for f in features_list])
        features_b = np.array([f['features_b'] for f in features_list])
        features_c = np.array([f['features_c'] for f in features_list])
        features_all = np.array([f['features_all'] for f in features_list])

        # 使用当前年份的标签和期望值
        labels = (data['Total_Medals'] > 0).astype(int).values
        medal_expectation = data['Total_Medals'].values
        gold_expectation = data['Gold'].values

        # 标准化特征
        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)
        features_all_scaled = scaler.fit_transform(features_all)

        return features_a_scaled, features_b_scaled, features_c_scaled, features_all_scaled, labels, medal_expectation, gold_expectation



    def train_classifier(self, features_a, features_all, labels, data):
        """
        训练分类器以预测获奖牌概率，如果小小国，则加权使用features_a
        :param data: 完整数据集，包含年份和国家信息
        """
        years = data['Year'].values
        nocs = data['NOC_x'].values

        # 为每个数据点判断是否为小小国
        is_small_countries = np.array([self.Country_classification(year, noc, data) 
                                     for year, noc in zip(years, nocs)])

        # 准备最终特征矩阵
        final_features = np.zeros((len(labels), features_all.shape[1]))
        
        # 对于小小国，使用加权组合的特征
        small_countries_mask = is_small_countries
        # 对于小小国，先用features_all作为基础
        final_features[small_countries_mask] = (1 - self.lambda_weight) * features_all[small_countries_mask]
        # 然后将features_a的列加入到对应位置
        final_features[small_countries_mask, :features_a.shape[1]] += self.lambda_weight * features_a[small_countries_mask]
        
        # 对于非小小国，直接使用features_all
        final_features[~small_countries_mask] = features_all[~small_countries_mask]

        # 训练分类器
        self.classifier.fit(final_features, labels)

    def train_regressor1(self, classifier_output, features_b, medal_expectation):
        """
        训练回归器1预测奖牌期望
        """
        input_features = np.hstack([classifier_output.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, medal_expectation, features_c, gold_expectation):
        """
        训练回归器2预测金牌期望
        """
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def predict(self, features_a, features_b, features_c, features_all, data):
        """
        预测是否获奖、奖牌数和金牌数，只输出最近一届的结果
        """
        # 获取最近一年的数据索引
        latest_year = data['Year'].max()
        latest_year_mask = (data['Year'] == latest_year)
        
        # 只处理最近一年的数据
        years = data.loc[latest_year_mask, 'Year'].values
        nocs = data.loc[latest_year_mask, 'NOC_x'].values
        
        # 为每个数据点判断是否为小小国
        is_small_countries = np.array([self.Country_classification(year, noc, data) 
                                     for year, noc in zip(years, nocs)])
        
        # 准备最终特征矩阵
        final_features = np.zeros((len(years), features_all[latest_year_mask].shape[1]))
        final_features[is_small_countries] = (1 - self.lambda_weight) * features_all[latest_year_mask][is_small_countries]
        final_features[is_small_countries, :features_a.shape[1]] += self.lambda_weight * features_a[latest_year_mask][is_small_countries]
        final_features[~is_small_countries] = features_all[latest_year_mask][~is_small_countries]
        
        # 预测获奖概率
        win_probability = self.classifier.predict_proba(final_features)[:, 1]
        will_win = win_probability > 0.5
        
        # 预测奖牌总数
        input_features = np.hstack([win_probability.reshape(-1, 1), features_b[latest_year_mask]])
        medal_prediction = self.regressor1.predict(input_features)
        
        # 预测金牌数
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_prediction.reshape(-1, 1),
            self.mu_weight * features_c[latest_year_mask]
        ])
        gold_prediction = self.regressor2.predict(input_features)
        
        # 返回预测结果
        return will_win, medal_prediction, gold_prediction
    
    def train_and_predict_period(self, data, start_year, end_year):
        """
        在指定时间段内训练模型并预测最后一年的结果
        :param data: 完整数据集
        :param start_year: 训练起始年份
        :param end_year: 训练结束年份
        :return: 最后一年的预测结果DataFrame
        """
        # 筛选时间段内的数据
        period_data = data[(data['Year'] >= start_year) & (data['Year'] <= end_year)].copy()
        
        # 预处理数据
        features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = self.preprocess_data(period_data)
        
        # 训练模型
        self.train_classifier(features_a, features_all, labels, period_data)
        classifier_output = self.classifier.predict_proba(final_features)[:, 1]
        self.train_regressor1(classifier_output, features_b, medal_expectation)
        self.train_regressor2(medal_expectation, features_c, gold_expectation)
        
        # 获取最后一年的预测结果
        last_year_mask = (period_data['Year'] == end_year)
        last_year_data = period_data[last_year_mask]
        
        will_win, medal_prediction, gold_prediction = self.predict(
            features_a[last_year_mask], 
            features_b[last_year_mask], 
            features_c[last_year_mask], 
            features_all[last_year_mask], 
            period_data
        )
        
        # 创建结果DataFrame
        results = pd.DataFrame({
            'NOC': last_year_data['NOC_x'],
            'Will_Win_Medal': will_win,
            'Medal_Prediction': medal_prediction,
            'Gold_Prediction': gold_prediction
        })
        
        return results
    

    def cross_validate_model(self, data, n_splits=5):
        """
        对模型进行交叉验证
        :param data: 完整数据集
        :param n_splits: 交叉验证折数
        :return: 各指标的交叉验证得分
        """
        # 预处理数据
        features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = self.preprocess_data(data)
        
        # 保存每次交叉验证的得分
        scores_classifier = []
        scores_regressor1 = []
        scores_regressor2 = []
        
        # 获取数据集大小和每折的样本数
        n_samples = len(labels)
        fold_size = n_samples // n_splits
        
        for i in range(n_splits):
            # 划分训练集和测试集
            test_start = i * fold_size
            test_end = (i + 1) * fold_size if i < n_splits - 1 else n_samples
            
            test_indices = slice(test_start, test_end)
            train_indices = list(range(0, test_start)) + list(range(test_end, n_samples))
            
            # 训练分类器
            self.train_classifier(
                features_a[train_indices], 
                features_all[train_indices], 
                labels[train_indices],
                data.iloc[train_indices]
            )
            
            # 预测测试集
            will_win, medal_pred, gold_pred = self.predict(
                features_a[test_indices],
                features_b[test_indices],
                features_c[test_indices],
                features_all[test_indices],
                data.iloc[test_indices]
            )
            
            # 计算各项指标的得分
            classifier_score = np.mean(will_win == labels[test_indices])
            regressor1_score = 1 - np.sum((medal_pred - medal_expectation[test_indices])**2) / np.sum((medal_expectation[test_indices] - np.mean(medal_expectation[test_indices]))**2)
            regressor2_score = 1 - np.sum((gold_pred - gold_expectation[test_indices])**2) / np.sum((gold_expectation[test_indices] - np.mean(gold_expectation[test_indices]))**2)
            
            scores_classifier.append(classifier_score)
            scores_regressor1.append(regressor1_score)
            scores_regressor2.append(regressor2_score)
        
        return {
            'classifier_scores': scores_classifier,
            'regressor1_scores': scores_regressor1,
            'regressor2_scores': scores_regressor2,
            'mean_classifier_score': np.mean(scores_classifier),
            'mean_regressor1_score': np.mean(scores_regressor1),
            'mean_regressor2_score': np.mean(scores_regressor2)
        }

    

# 示例使用
if __name__ == "__main__":
    # 加载数据
    data = pd.read_csv("feature_Noc.csv")

    # 初始化模型
    model = OlympicPredictionModel(lambda_weight=0.3, mu_weight=0.85)

    # 数据预处理
    features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = model.preprocess_data(data)

    # 训练模型
    model.train_classifier(features_a, features_all, labels, data)
    
    # 准备分类器输出
    years = data['Year'].values
    nocs = data['NOC_x'].values
    is_small_countries = np.array([model.Country_classification(year, noc, data) 
                                 for year, noc in zip(years, nocs)])
    
    final_features = np.zeros((len(labels), features_all.shape[1]))
    final_features[is_small_countries] = (1 - model.lambda_weight) * features_all[is_small_countries]
    final_features[is_small_countries, :features_a.shape[1]] += model.lambda_weight * features_a[is_small_countries]
    final_features[~is_small_countries] = features_all[~is_small_countries]
    
    classifier_output = model.classifier.predict_proba(final_features)[:, 1]
    
    # 训练回归器
    model.train_regressor1(classifier_output, features_b, medal_expectation)
    model.train_regressor2(medal_expectation, features_c, gold_expectation)

    # 进行预测
    will_win, medal_prediction, gold_prediction = model.predict(features_a, features_b, features_c, features_all, data)
    
    # 创建预测结果DataFrame
    latest_year_mask = (data['Year'] == data['Year'].max())
    results = pd.DataFrame({
        'NOC': data.loc[latest_year_mask, 'NOC_x'],
        'Will_Win_Medal': will_win,
        'Medal_Prediction': medal_prediction,
        'Gold_Prediction': gold_prediction
    })
    
    print("\n预测结果:")
    print(results)
    #保存文件
    results.to_csv('results.csv', index=False)


预测结果:
      NOC  Will_Win_Medal  Medal_Prediction  Gold_Prediction
1781  AIN            True          5.427917            1.715
1782  ALG            True          2.680000            1.490
1783  ARG            True         18.537500            3.840
1784  ARM            True          3.792500            0.325
1785  ARU           False          0.000000            0.000
...   ...             ...               ...              ...
1888  USA            True        296.430000          126.995
1889  UZB            True         12.412500            4.645
1890  VAN           False          0.000000            0.000
1891  YEM           False          0.000000            0.000
1892  ZAM            True          1.129286            0.000

[112 rows x 4 columns]


### V3.0

加入了5-fold交叉验证

结构改为：
1) classifier:随机森林：200
2) regressor:MLP

In [None]:
# version2.999999
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dot, Activation, Concatenate
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau

class OlympicPredictionModel:
    def __init__(self, lambda_weight=0.3, mu_weight=0.85, learning_rate=0.001):
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.learning_rate = learning_rate  # Added for Adam optimizer tuning
        self.classifier = RandomForestClassifier(n_estimators=200)
        self.regressor1 = RandomForestRegressor(n_estimators=200)
        self.regressor2 = RandomForestRegressor(n_estimators=200)
        self.lstm_model = None

    @staticmethod
    def Country_classification(year, Noc_x, data):
        historical_data = data[data['Year'] <= year]
        country_total = historical_data[historical_data['NOC_x'] == Noc_x]['Name_Count'].sum()
        all_countries = historical_data.groupby('NOC_x')['Name_Count'].sum()
        threshold = all_countries.quantile(0.2)
        return country_total <= threshold

    def calculate_w4_vector(self, country_change, athlete_change, event_changes):
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3 != 0)
        return w4

    def preprocess_data(self, data):
        year_mapping = pd.read_csv("year_transmition_reference.csv")
        year_dict = dict(zip(year_mapping['evaluating_year'], year_mapping['use_data_from']))

        features_list = []
        for idx, row in data.iterrows():
            current_year = row['Year']
            reference_year = year_dict.get(current_year, current_year)
            reference_data = data[data['Year'] == reference_year]

            if len(reference_data) > 0:
                ref_row = reference_data[reference_data['NOC_x'] == row['NOC_x']]
                ref_row = ref_row.iloc[0] if len(ref_row) > 0 else row
            else:
                ref_row = row

            score_rate_list = np.array(eval(ref_row['Score_Rate_List']))
            score_list = np.array(eval(ref_row['Score_List']), dtype=np.float64)
            participants_list = np.array(eval(ref_row['Participants_List']), dtype=np.float64)

            v4_ = np.sum(score_rate_list)
            v5_ = np.sum(participants_list)
            v8_ = np.sum(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0))

            country_change = np.array(eval(ref_row['CountryChange']))
            athlete_change = np.array(eval(ref_row['AthleteChange']))
            event_changes = np.array(eval(ref_row['Event_Changes']))
            w4_ = self.calculate_w4_vector(country_change.reshape(1, -1), athlete_change.reshape(1, -1), event_changes.reshape(1, -1))[0]

            v8_w4 = np.sum(np.multiply(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0), w4_))

            v1 = ref_row['if_host']
            v6 = ref_row['Gold_prev'] if pd.notna(ref_row['Gold_prev']) else 0
            v7 = ref_row['distance']
            v3_ = ref_row['Score']
            v3_w4 = np.sum(np.multiply(score_list, w4_))

            w3_ = np.sum(event_changes)

            features = {
                'features_a': [v5_, v8_, v8_w4],
                'features_b': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_c': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_all': [v1, v3_, v4_, v5_, v6, v7, v8_, w3_, w4_]
            }
            features_list.append(features)

        features_a = np.array([f['features_a'] for f in features_list])
        features_b = np.array([f['features_b'] for f in features_list])
        features_c = np.array([f['features_c'] for f in features_list])
        features_all = np.array([f['features_all'] for f in features_list])

        labels = (data['Total_Medals'] > 0).astype(int).values
        medal_expectation = data['Total_Medals'].values
        gold_expectation = data['Gold'].values

        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)
        features_all_scaled = scaler.fit_transform(features_all)

        return features_a_scaled, features_b_scaled, features_c_scaled, features_all_scaled, labels, medal_expectation, gold_expectation

    def train_classifier(self, features_a, features_all, labels, data):
        years = data['Year'].values
        nocs = data['NOC_x'].values

        is_small_countries = np.array([self.Country_classification(year, noc, data) 
                                     for year, noc in zip(years, nocs)])

        final_features = np.zeros((len(labels), features_all.shape[1]))
        small_countries_mask = is_small_countries

        final_features[small_countries_mask] = (1 - self.lambda_weight) * features_all[small_countries_mask]
        final_features[small_countries_mask, :features_a.shape[1]] += self.lambda_weight * features_a[small_countries_mask]

        final_features[~small_countries_mask] = features_all[~small_countries_mask]

        self.classifier.fit(final_features, labels)
        self.classifier_output = self.classifier.predict_proba(final_features)

    def train_regressor1(self, features_b, medal_expectation):
        classifier_probabilities = self.classifier_output[:, 1]
        input_features = np.hstack([classifier_probabilities.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, features_c, gold_expectation):
        # Get the classifier probabilities
        classifier_probabilities = self.classifier_output[:, 1]
        # Create input features for regressor1 prediction
        input_features_reg1 = np.hstack([classifier_probabilities.reshape(-1, 1), features_c])
        # Predict medal expectation
        medal_expectation = self.regressor1.predict(input_features_reg1)
        # Create input features for regressor2
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def create_lstm_attention_model(self, input_shape):
        inputs = Input(shape=(input_shape[1], input_shape[2]))
        lstm_out, state_h, state_c = LSTM(64, return_sequences=True, return_state=True)(inputs)
        attention = Dot(axes=[2, 2])([lstm_out, lstm_out])
        attention = Activation('softmax')(attention)
        context = Dot(axes=[2, 1])([attention, lstm_out])
        combined = Concatenate()([context, lstm_out])
        outputs = Dense(1, activation='linear')(combined)

        # Use Adam optimizer with tunable learning rate
        optimizer = Adam(learning_rate=self.learning_rate)
        model = Model(inputs, outputs)
        model.compile(optimizer=optimizer, loss='mean_squared_error')

        return model

    def evaluate_model(self, features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation, data, cv=5):
        """评估模型性能"""
        from sklearn.model_selection import KFold 
        from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, mean_squared_error
        
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)
        
        classifier_scores = {'train_acc': [], 'test_acc': [], 'train_auc': [], 'test_auc': []}
        regressor1_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        regressor2_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        
        for train_idx, test_idx in kf.split(features_all):
            # 准备训练集和测试集
            X_train_a = features_a[train_idx]
            X_test_a = features_a[test_idx] 
            X_train_b = features_b[train_idx]
            X_test_b = features_b[test_idx]
            X_train_c = features_c[train_idx]
            X_test_c = features_c[test_idx]
            X_train_all = features_all[train_idx]
            X_test_all = features_all[test_idx]
            y_train = labels[train_idx]
            y_test = labels[test_idx]
            medal_train = medal_expectation[train_idx]
            medal_test = medal_expectation[test_idx]
            gold_train = gold_expectation[train_idx]
            gold_test = gold_expectation[test_idx]
            
            # 训练和评估分类器
            train_data = data.iloc[train_idx].reset_index(drop=True)
            self.train_classifier(X_train_a, X_train_all, y_train, train_data)
            train_prob = self.classifier.predict_proba(X_train_all)[:, 1]
            test_prob = self.classifier.predict_proba(X_test_all)[:, 1]
            
            classifier_scores['train_acc'].append(accuracy_score(y_train, train_prob > 0.5))
            classifier_scores['test_acc'].append(accuracy_score(y_test, test_prob > 0.5))
            classifier_scores['train_auc'].append(roc_auc_score(y_train, train_prob))
            classifier_scores['test_auc'].append(roc_auc_score(y_test, test_prob))
            
            # 训练和评估回归器1
            self.train_regressor1(X_train_b, medal_train)
            train_medal_pred = self.regressor1.predict(np.hstack([train_prob.reshape(-1, 1), X_train_b]))
            test_medal_pred = self.regressor1.predict(np.hstack([test_prob.reshape(-1, 1), X_test_b]))
            
            regressor1_scores['train_r2'].append(r2_score(medal_train, train_medal_pred))
            regressor1_scores['test_r2'].append(r2_score(medal_test, test_medal_pred))
            regressor1_scores['train_mse'].append(mean_squared_error(medal_train, train_medal_pred))
            regressor1_scores['test_mse'].append(mean_squared_error(medal_test, test_medal_pred))
            
            # 训练和评估回归器2
            self.train_regressor2(X_train_c, gold_train)
            train_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * train_medal_pred.reshape(-1, 1),
                self.mu_weight * X_train_c
            ]))
            test_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * test_medal_pred.reshape(-1, 1),
                self.mu_weight * X_test_c
            ]))
            
            regressor2_scores['train_r2'].append(r2_score(gold_train, train_gold_pred))
            regressor2_scores['test_r2'].append(r2_score(gold_test, test_gold_pred))
            regressor2_scores['train_mse'].append(mean_squared_error(gold_train, train_gold_pred))
            regressor2_scores['test_mse'].append(mean_squared_error(gold_test, test_gold_pred))

        # 打印评估结果
        print("\n模型评估结果 (5-fold交叉验证):")
        print("\n分类器 (预测是否获得奖牌):")
        print(f"训练集 Accuracy: {np.mean(classifier_scores['train_acc']):.4f} (±{np.std(classifier_scores['train_acc']):.4f})")
        print(f"测试集 Accuracy: {np.mean(classifier_scores['test_acc']):.4f} (±{np.std(classifier_scores['test_acc']):.4f})")
        print(f"训练集 AUC: {np.mean(classifier_scores['train_auc']):.4f} (±{np.std(classifier_scores['train_auc']):.4f})")
        print(f"测试集 AUC: {np.mean(classifier_scores['test_auc']):.4f} (±{np.std(classifier_scores['test_auc']):.4f})")
        
        print("\n回归器1 (预测奖牌总数):")
        print(f"训练集 R2: {np.mean(regressor1_scores['train_r2']):.4f} (±{np.std(regressor1_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor1_scores['test_r2']):.4f} (±{np.std(regressor1_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor1_scores['train_mse']):.4f} (±{np.std(regressor1_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor1_scores['test_mse']):.4f} (±{np.std(regressor1_scores['test_mse']):.4f})")
        
        print("\n回归器2 (预测金牌数):")
        print(f"训练集 R2: {np.mean(regressor2_scores['train_r2']):.4f} (±{np.std(regressor2_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor2_scores['test_r2']):.4f} (±{np.std(regressor2_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor2_scores['train_mse']):.4f} (±{np.std(regressor2_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor2_scores['test_mse']):.4f} (±{np.std(regressor2_scores['test_mse']):.4f})")
    def train_and_predict_period(self, data, start_year, end_year):
        olympic_years = [1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020]
        period_data = data[(data['Year'] >= start_year) & (data['Year'] <= end_year)].reset_index(drop=True)

        features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = self.preprocess_data(period_data)

        self.train_classifier(features_a, features_all, labels, period_data)
        self.train_regressor1(features_b, medal_expectation)
        self.train_regressor2(features_c, gold_expectation)

        olympic_mask = period_data['Year'].isin(olympic_years)
        olympic_data = period_data[olympic_mask].reset_index(drop=True)
        olympic_features_all = features_all[olympic_mask]
        time_series_data = olympic_features_all.reshape((olympic_features_all.shape[0], 1, olympic_features_all.shape[1]))

        if self.lstm_model is None:
            self.lstm_model = self.create_lstm_attention_model(time_series_data.shape)

        # Add ReduceLROnPlateau callback for Adam optimizer tuning
        lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
        self.lstm_model.fit(time_series_data, gold_expectation[olympic_mask], epochs=10, batch_size=32, verbose=1, callbacks=[lr_scheduler])

        last_year_data = period_data[period_data['Year'] == end_year].reset_index(drop=True)
        end_year_olympic_mask = olympic_data['Year'] == end_year
        time_series_last_year = time_series_data[end_year_olympic_mask]

        if len(time_series_last_year) == 0:
            most_recent_year = olympic_data['Year'].max()
            time_series_last_year = time_series_data[olympic_data['Year'] == most_recent_year]

        if len(time_series_last_year.shape) == 2:
            time_series_last_year = np.expand_dims(time_series_last_year, axis=0)

        lstm_gold_prediction = self.lstm_model.predict(time_series_last_year, verbose=0)
        available_years = period_data['Year'].unique()
        prediction_year = end_year if end_year in available_years else available_years.max()

        classifier_probabilities = self.classifier_output[:, 1][period_data['Year'] == prediction_year]
        medal_prediction = self.regressor1.predict(np.hstack([
            classifier_probabilities.reshape(-1, 1), 
            features_b[period_data['Year'] == prediction_year]
        ]))

        gold_prediction_rf = self.regressor2.predict(np.hstack([
            (1 - self.mu_weight) * medal_prediction.reshape(-1, 1),
            self.mu_weight * features_c[period_data['Year'] == prediction_year]
        ]))
        prediction_data = period_data[period_data['Year'] == prediction_year].reset_index(drop=True)
        results = pd.DataFrame({
            'NOC': prediction_data['NOC_x'],
            'Year': prediction_year,
            'Will_Win_Medal': classifier_probabilities > 0.5,
            'Medal_Prediction': medal_prediction,
            'Gold_Prediction': gold_prediction_rf
        })
        # 在预测之后评估模型
        print("\n执行模型评估...")
        self.evaluate_model(features_a, features_b, features_c, features_all, 
                        labels, medal_expectation, gold_expectation, period_data)
        return results
    
# Example usage
data = pd.read_csv("E:/2025_MCM_C\Model1/feature_Noc.csv")  # Replace with your actual dataset
model = OlympicPredictionModel()
start_year = 1964
end_year = 2028
results = model.train_and_predict_period(data, start_year, end_year)
print(results)
# Save the results to a CSV file
results.to_csv("E:/2025_MCM_C\Model1/results.csv", index=False)  # Replace with your desired file path

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

执行模型评估...

模型评估结果 (5-fold交叉验证):

分类器 (预测是否获得奖牌):
训练集 Accuracy: 0.9659 (±0.0018)
测试集 Accuracy: 0.8655 (±0.0146)
训练集 AUC: 0.9955 (±0.0002)
测试集 AUC: 0.9321 (±0.0110)

回归器1 (预测奖牌总数):
训练集 R2: 0.9743 (±0.0026)
测试集 R2: 0.8389 (±0.0238)
训练集 MSE: 45.8949 (±4.4684)
测试集 MSE: 288.2902 (±75.1224)

回归器2 (预测金牌数):
训练集 R2: 0.9722 (±0.0060)
测试集 R2: 0.7583 (±0.0315)
训练集 MSE: 8.8745 (±2.0037)
测试集 MSE: 76.3350 (±17.3660)
     NOC  Year  Will_Win_Medal  Medal_Prediction  Gold_Prediction
0    AIN  2024            True          5.471333         1.188667
1    ALG  2024            True          3.020000         1.307500
2    ARG  2024            True         18.214583         2.685000
3    ARM  2024            True          3.553333         0.365000
4    ARU  2024           False          0.000000         0.000000
..   ...   ...             ...               ...              ...
107  USA  2024        

In [64]:
# version 3.0
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dot, Activation, Concatenate
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau

class OlympicPredictionModel:
    from sklearn.neural_network import MLPRegressor

    def __init__(self, lambda_weight=0.3, mu_weight=0.85, learning_rate=0.001):
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.learning_rate = learning_rate  # Added for Adam optimizer tuning
        self.classifier = RandomForestClassifier(n_estimators=200)
        # Change RandomForestRegressor to MLPRegressor
        self.regressor1 = self.MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
        self.regressor2 = self.MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
        self.lstm_model = None

    @staticmethod
    def Country_classification(year, Noc_x, data):
        historical_data = data[data['Year'] <= year]
        country_total = historical_data[historical_data['NOC_x'] == Noc_x]['Name_Count'].sum()
        all_countries = historical_data.groupby('NOC_x')['Name_Count'].sum()
        threshold = all_countries.quantile(0.2)
        return country_total <= threshold

    def calculate_w4_vector(self, country_change, athlete_change, event_changes):
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3 != 0)
        return w4

    def preprocess_data(self, data):
        year_mapping = pd.read_csv("year_transmition_reference.csv")
        year_dict = dict(zip(year_mapping['evaluating_year'], year_mapping['use_data_from']))

        features_list = []
        for idx, row in data.iterrows():
            current_year = row['Year']
            reference_year = year_dict.get(current_year, current_year)
            reference_data = data[data['Year'] == reference_year]

            if len(reference_data) > 0:
                ref_row = reference_data[reference_data['NOC_x'] == row['NOC_x']]
                ref_row = ref_row.iloc[0] if len(ref_row) > 0 else row
            else:
                ref_row = row

            score_rate_list = np.array(eval(ref_row['Score_Rate_List']))
            score_list = np.array(eval(ref_row['Score_List']), dtype=np.float64)
            participants_list = np.array(eval(ref_row['Participants_List']), dtype=np.float64)

            v4_ = np.sum(score_rate_list)
            v5_ = np.sum(participants_list)
            v8_ = np.sum(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0))

            country_change = np.array(eval(ref_row['CountryChange']))
            athlete_change = np.array(eval(ref_row['AthleteChange']))
            event_changes = np.array(eval(ref_row['Event_Changes']))
            w4_ = self.calculate_w4_vector(country_change.reshape(1, -1), athlete_change.reshape(1, -1), event_changes.reshape(1, -1))[0]

            v8_w4 = np.sum(np.multiply(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0), w4_))

            v1 = ref_row['if_host']
            v6 = ref_row['Gold_prev'] if pd.notna(ref_row['Gold_prev']) else 0
            v7 = ref_row['distance']
            v3_ = ref_row['Score']
            v3_w4 = np.sum(np.multiply(score_list, w4_))

            w3_ = np.sum(event_changes)

            features = {
                'features_a': [v5_, v8_, v8_w4],
                'features_b': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_c': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_all': [v1, v3_, v4_, v5_, v6, v7, v8_, w3_, w4_]
            }
            features_list.append(features)

        features_a = np.array([f['features_a'] for f in features_list])
        features_b = np.array([f['features_b'] for f in features_list])
        features_c = np.array([f['features_c'] for f in features_list])
        features_all = np.array([f['features_all'] for f in features_list])

        labels = (data['Total_Medals'] > 0).astype(int).values
        medal_expectation = data['Total_Medals'].values
        gold_expectation = data['Gold'].values

        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)
        features_all_scaled = scaler.fit_transform(features_all)

        return features_a_scaled, features_b_scaled, features_c_scaled, features_all_scaled, labels, medal_expectation, gold_expectation

    def train_classifier(self, features_a, features_all, labels, data):
        years = data['Year'].values
        nocs = data['NOC_x'].values

        is_small_countries = np.array([self.Country_classification(year, noc, data)
                                     for year, noc in zip(years, nocs)])

        final_features = np.zeros((len(labels), features_all.shape[1]))
        small_countries_mask = is_small_countries

        final_features[small_countries_mask] = (1 - self.lambda_weight) * features_all[small_countries_mask]
        final_features[small_countries_mask, :features_a.shape[1]] += self.lambda_weight * features_a[small_countries_mask]

        final_features[~small_countries_mask] = features_all[~small_countries_mask]

        self.classifier.fit(final_features, labels)
        self.classifier_output = self.classifier.predict_proba(final_features)

    def train_regressor1(self, features_b, medal_expectation):
        classifier_probabilities = self.classifier_output[:, 1]
        input_features = np.hstack([classifier_probabilities.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, features_c, gold_expectation):
        classifier_probabilities = self.classifier_output[:, 1]
        input_features_reg1 = np.hstack([classifier_probabilities.reshape(-1, 1), features_c])
        medal_expectation = self.regressor1.predict(input_features_reg1)
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def create_lstm_attention_model(self, input_shape):
        inputs = Input(shape=(input_shape[1], input_shape[2]))
        lstm_out, state_h, state_c = LSTM(64, return_sequences=True, return_state=True)(inputs)
        attention = Dot(axes=[2, 2])([lstm_out, lstm_out])
        attention = Activation('softmax')(attention)
        context = Dot(axes=[2, 1])([attention, lstm_out])
        combined = Concatenate()([context, lstm_out])
        outputs = Dense(1, activation='linear')(combined)
        optimizer = Adam(learning_rate=self.learning_rate)
        model = Model(inputs, outputs)
        model.compile(optimizer=optimizer, loss='mean_squared_error')
        return model

    def evaluate_model(self, features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation, data, cv=5):
        """评估模型性能"""
        from sklearn.model_selection import KFold 
        from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, mean_squared_error
        
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)
        
        classifier_scores = {'train_acc': [], 'test_acc': [], 'train_auc': [], 'test_auc': []}
        regressor1_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        regressor2_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        
        for train_idx, test_idx in kf.split(features_all):
            X_train_a = features_a[train_idx]
            X_test_a = features_a[test_idx] 
            X_train_b = features_b[train_idx]
            X_test_b = features_b[test_idx]
            X_train_c = features_c[train_idx]
            X_test_c = features_c[test_idx]
            X_train_all = features_all[train_idx]
            X_test_all = features_all[test_idx]
            y_train = labels[train_idx]
            y_test = labels[test_idx]
            medal_train = medal_expectation[train_idx]
            medal_test = medal_expectation[test_idx]
            gold_train = gold_expectation[train_idx]
            gold_test = gold_expectation[test_idx]
            
            train_data = data.iloc[train_idx].reset_index(drop=True)
            self.train_classifier(X_train_a, X_train_all, y_train, train_data)
            train_prob = self.classifier.predict_proba(X_train_all)[:, 1]
            test_prob = self.classifier.predict_proba(X_test_all)[:, 1]
            
            classifier_scores['train_acc'].append(accuracy_score(y_train, train_prob > 0.5))
            classifier_scores['test_acc'].append(accuracy_score(y_test, test_prob > 0.5))
            classifier_scores['train_auc'].append(roc_auc_score(y_train, train_prob))
            classifier_scores['test_auc'].append(roc_auc_score(y_test, test_prob))
            
            self.train_regressor1(X_train_b, medal_train)
            train_medal_pred = self.regressor1.predict(np.hstack([train_prob.reshape(-1, 1), X_train_b]))
            test_medal_pred = self.regressor1.predict(np.hstack([test_prob.reshape(-1, 1), X_test_b]))
            regressor1_scores['train_r2'].append(r2_score(medal_train, train_medal_pred))
            regressor1_scores['test_r2'].append(r2_score(medal_test, test_medal_pred))
            regressor1_scores['train_mse'].append(mean_squared_error(medal_train, train_medal_pred))
            regressor1_scores['test_mse'].append(mean_squared_error(medal_test, test_medal_pred))
            
            self.train_regressor2(X_train_c, gold_train)
            train_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * train_medal_pred.reshape(-1, 1),
                self.mu_weight * X_train_c
            ]))
            test_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * test_medal_pred.reshape(-1, 1),
                self.mu_weight * X_test_c
            ]))
            
            regressor2_scores['train_r2'].append(r2_score(gold_train, train_gold_pred))
            regressor2_scores['test_r2'].append(r2_score(gold_test, test_gold_pred))
            regressor2_scores['train_mse'].append(mean_squared_error(gold_train, train_gold_pred))
            regressor2_scores['test_mse'].append(mean_squared_error(gold_test, test_gold_pred))

        print("\n模型评估结果 (5-fold交叉验证):")
        print("\n分类器 (预测是否获得奖牌):")
        print(f"训练集 Accuracy: {np.mean(classifier_scores['train_acc']):.4f} (±{np.std(classifier_scores['train_acc']):.4f})")
        print(f"测试集 Accuracy: {np.mean(classifier_scores['test_acc']):.4f} (±{np.std(classifier_scores['test_acc']):.4f})")
        print(f"训练集 AUC: {np.mean(classifier_scores['train_auc']):.4f} (±{np.std(classifier_scores['train_auc']):.4f})")
        print(f"测试集 AUC: {np.mean(classifier_scores['test_auc']):.4f} (±{np.std(classifier_scores['test_auc']):.4f})")
        
        print("\n回归器1 (预测奖牌总数):")
        print(f"训练集 R2: {np.mean(regressor1_scores['train_r2']):.4f} (±{np.std(regressor1_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor1_scores['test_r2']):.4f} (±{np.std(regressor1_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor1_scores['train_mse']):.4f} (±{np.std(regressor1_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor1_scores['test_mse']):.4f} (±{np.std(regressor1_scores['test_mse']):.4f})")
        
        print("\n回归器2 (预测金牌数):")
        print(f"训练集 R2: {np.mean(regressor2_scores['train_r2']):.4f} (±{np.std(regressor2_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor2_scores['test_r2']):.4f} (±{np.std(regressor2_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor2_scores['train_mse']):.4f} (±{np.std(regressor2_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor2_scores['test_mse']):.4f} (±{np.std(regressor2_scores['test_mse']):.4f})")

    def train_and_predict_period(self, data, start_year, end_year):
        olympic_years = [1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020]
        period_data = data[(data['Year'] >= start_year) & (data['Year'] <= end_year)].reset_index(drop=True)

        features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = self.preprocess_data(period_data)

        self.train_classifier(features_a, features_all, labels, period_data)
        self.train_regressor1(features_b, medal_expectation)
        self.train_regressor2(features_c, gold_expectation)

        olympic_mask = period_data['Year'].isin(olympic_years)
        olympic_data = period_data[olympic_mask].reset_index(drop=True)
        olympic_features_all = features_all[olympic_mask]
        time_series_data = olympic_features_all.reshape((olympic_features_all.shape[0], 1, olympic_features_all.shape[1]))

        if self.lstm_model is None:
            self.lstm_model = self.create_lstm_attention_model(time_series_data.shape)

        lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
        self.lstm_model.fit(time_series_data, gold_expectation[olympic_mask], epochs=10, batch_size=32, verbose=1, callbacks=[lr_scheduler])

        last_year_data = period_data[period_data['Year'] == end_year].reset_index(drop=True)
        end_year_olympic_mask = olympic_data['Year'] == end_year
        time_series_last_year = time_series_data[end_year_olympic_mask]

        if len(time_series_last_year) == 0:
            most_recent_year = olympic_data['Year'].max()
            time_series_last_year = time_series_data[olympic_data['Year'] == most_recent_year]

        if len(time_series_last_year.shape) == 2:
            time_series_last_year = np.expand_dims(time_series_last_year, axis=0)

        lstm_gold_prediction = self.lstm_model.predict(time_series_last_year, verbose=0)
        available_years = period_data['Year'].unique()
        prediction_year = end_year if end_year in available_years else available_years.max()

        classifier_probabilities = self.classifier_output[:, 1][period_data['Year'] == prediction_year]
        medal_prediction = self.regressor1.predict(np.hstack([
            classifier_probabilities.reshape(-1, 1), 
            features_b[period_data['Year'] == prediction_year]
        ]))

        gold_prediction_rf = self.regressor2.predict(np.hstack([
            (1 - self.mu_weight) * medal_prediction.reshape(-1, 1),
            self.mu_weight * features_c[period_data['Year'] == prediction_year]
        ]))

        prediction_data = period_data[period_data['Year'] == prediction_year].reset_index(drop=True)
        results = pd.DataFrame({
            'NOC': prediction_data['NOC_x'],
            'Year': prediction_year,
            'Will_Win_Medal': classifier_probabilities > 0.5,
            'Medal_Prediction': medal_prediction,
            'Gold_Prediction': gold_prediction_rf
        })
        print("\n执行模型评估...")
        self.evaluate_model(features_a, features_b, features_c, features_all, 
                            labels, medal_expectation, gold_expectation, period_data)
        return results
    
# Example usage
data = pd.read_csv("E:/2025_MCM_C\Model1/feature_Noc.csv")  # Replace with your actual dataset
model = OlympicPredictionModel()
start_year = 1964
end_year = 2028
results = model.train_and_predict_period(data, start_year, end_year)
print(results)
# Save the results to a CSV file
results.to_csv("E:/2025_MCM_C\Model1/results.csv", index=False)  # Replace with your desired file path

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

执行模型评估...





模型评估结果 (5-fold交叉验证):

分类器 (预测是否获得奖牌):
训练集 Accuracy: 0.9650 (±0.0025)
测试集 Accuracy: 0.8662 (±0.0091)
训练集 AUC: 0.9955 (±0.0004)
测试集 AUC: 0.9317 (±0.0127)

回归器1 (预测奖牌总数):
训练集 R2: 0.9175 (±0.0080)
测试集 R2: 0.8576 (±0.0271)
训练集 MSE: 147.5519 (±13.7178)
测试集 MSE: 249.2202 (±49.0508)

回归器2 (预测金牌数):
训练集 R2: 0.8886 (±0.0185)
测试集 R2: 0.7573 (±0.0620)
训练集 MSE: 35.5964 (±6.2137)
测试集 MSE: 77.5772 (±26.9304)
     NOC  Year  Will_Win_Medal  Medal_Prediction  Gold_Prediction
0    AIN  2024            True          6.410787         1.061121
1    ALG  2024            True          3.317256         0.357918
2    ARG  2024            True         13.091574         3.527046
3    ARM  2024            True          3.598994         0.062954
4    ARU  2024           False          0.093999        -0.357004
..   ...   ...             ...               ...              ...
107  USA  2024            True        279.754774       123.787257
108  UZB  2024            True         11.124393         3.698060
109  VAN 

In [None]:
# version 3.0
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dot, Activation, Concatenate
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau

class OlympicPredictionModel:
    from sklearn.neural_network import MLPRegressor

    def __init__(self, lambda_weight=0.3, mu_weight=0.85, learning_rate=0.001):
        self.lambda_weight = lambda_weight
        self.mu_weight = mu_weight
        self.learning_rate = learning_rate  # Added for Adam optimizer tuning
        self.classifier = RandomForestClassifier(n_estimators=200)
        # Change RandomForestRegressor to MLPRegressor
        self.regressor1 = self.MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
        self.regressor2 = self.MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
        self.lstm_model = None

    @staticmethod
    def Country_classification(year, Noc_x, data):
        historical_data = data[data['Year'] <= year]
        country_total = historical_data[historical_data['NOC_x'] == Noc_x]['Name_Count'].sum()
        all_countries = historical_data.groupby('NOC_x')['Name_Count'].sum()
        threshold = all_countries.quantile(0.2)
        return country_total <= threshold

    def calculate_w4_vector(self, country_change, athlete_change, event_changes):
        w1 = np.sum(country_change, axis=1).astype(np.float64)
        w2 = np.sum(athlete_change, axis=1).astype(np.float64)
        w3 = np.sum(event_changes, axis=1).astype(np.float64)
        w4 = np.divide(0.2 * w1 + 0.8 * w2, w3, out=np.zeros_like(w3, dtype=np.float64), where=w3 != 0)
        return w4

    def preprocess_data(self, data):
        year_mapping = pd.read_csv("year_transmition_reference.csv")
        year_dict = dict(zip(year_mapping['evaluating_year'], year_mapping['use_data_from']))

        features_list = []
        for idx, row in data.iterrows():
            current_year = row['Year']
            reference_year = year_dict.get(current_year, current_year)
            reference_data = data[data['Year'] == reference_year]

            if len(reference_data) > 0:
                ref_row = reference_data[reference_data['NOC_x'] == row['NOC_x']]
                ref_row = ref_row.iloc[0] if len(ref_row) > 0 else row
            else:
                ref_row = row

            score_rate_list = np.array(eval(ref_row['Score_Rate_List']))
            score_list = np.array(eval(ref_row['Score_List']), dtype=np.float64)
            participants_list = np.array(eval(ref_row['Participants_List']), dtype=np.float64)

            v4_ = np.sum(score_rate_list)
            v5_ = np.sum(participants_list)
            v8_ = np.sum(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0))

            country_change = np.array(eval(ref_row['CountryChange']))
            athlete_change = np.array(eval(ref_row['AthleteChange']))
            event_changes = np.array(eval(ref_row['Event_Changes']))
            w4_ = self.calculate_w4_vector(country_change.reshape(1, -1), athlete_change.reshape(1, -1), event_changes.reshape(1, -1))[0]

            v8_w4 = np.sum(np.multiply(np.divide(score_list, participants_list, out=np.zeros_like(score_list, dtype=np.float64), where=participants_list > 0), w4_))

            v1 = ref_row['if_host']
            v6 = ref_row['Gold_prev'] if pd.notna(ref_row['Gold_prev']) else 0
            v7 = ref_row['distance']
            v3_ = ref_row['Score']
            v3_w4 = np.sum(np.multiply(score_list, w4_))

            w3_ = np.sum(event_changes)

            features = {
                'features_a': [v5_, v8_, v8_w4],
                'features_b': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_c': [v1, v5_, v6, v7, v3_, v3_w4, v8_],
                'features_all': [v1, v3_, v4_, v5_, v6, v7, v8_, w3_, w4_]
            }
            features_list.append(features)

        features_a = np.array([f['features_a'] for f in features_list])
        features_b = np.array([f['features_b'] for f in features_list])
        features_c = np.array([f['features_c'] for f in features_list])
        features_all = np.array([f['features_all'] for f in features_list])

        labels = (data['Total_Medals'] > 0).astype(int).values
        medal_expectation = data['Total_Medals'].values
        gold_expectation = data['Gold'].values

        scaler = StandardScaler()
        features_a_scaled = scaler.fit_transform(features_a)
        features_b_scaled = scaler.fit_transform(features_b)
        features_c_scaled = scaler.fit_transform(features_c)
        features_all_scaled = scaler.fit_transform(features_all)

        return features_a_scaled, features_b_scaled, features_c_scaled, features_all_scaled, labels, medal_expectation, gold_expectation

    def train_classifier(self, features_a, features_all, labels, data):
        years = data['Year'].values
        nocs = data['NOC_x'].values

        is_small_countries = np.array([self.Country_classification(year, noc, data)
                                     for year, noc in zip(years, nocs)])

        final_features = np.zeros((len(labels), features_all.shape[1]))
        small_countries_mask = is_small_countries

        final_features[small_countries_mask] = (1 - self.lambda_weight) * features_all[small_countries_mask]
        final_features[small_countries_mask, :features_a.shape[1]] += self.lambda_weight * features_a[small_countries_mask]

        final_features[~small_countries_mask] = features_all[~small_countries_mask]

        self.classifier.fit(final_features, labels)
        self.classifier_output = self.classifier.predict_proba(final_features)

    def train_regressor1(self, features_b, medal_expectation):
        classifier_probabilities = self.classifier_output[:, 1]
        input_features = np.hstack([classifier_probabilities.reshape(-1, 1), features_b])
        self.regressor1.fit(input_features, medal_expectation)

    def train_regressor2(self, features_c, gold_expectation):
        classifier_probabilities = self.classifier_output[:, 1]
        input_features_reg1 = np.hstack([classifier_probabilities.reshape(-1, 1), features_c])
        medal_expectation = self.regressor1.predict(input_features_reg1)
        input_features = np.hstack([
            (1 - self.mu_weight) * medal_expectation.reshape(-1, 1),
            self.mu_weight * features_c
        ])
        self.regressor2.fit(input_features, gold_expectation)

    def create_lstm_attention_model(self, input_shape):
        inputs = Input(shape=(input_shape[1], input_shape[2]))
        lstm_out, state_h, state_c = LSTM(64, return_sequences=True, return_state=True)(inputs)
        attention = Dot(axes=[2, 2])([lstm_out, lstm_out])
        attention = Activation('softmax')(attention)
        context = Dot(axes=[2, 1])([attention, lstm_out])
        combined = Concatenate()([context, lstm_out])
        outputs = Dense(1, activation='linear')(combined)
        optimizer = Adam(learning_rate=self.learning_rate)
        model = Model(inputs, outputs)
        model.compile(optimizer=optimizer, loss='mean_squared_error')
        return model

    def evaluate_model(self, features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation, data, cv=5):
        """评估模型性能"""
        from sklearn.model_selection import KFold 
        from sklearn.metrics import accuracy_score, roc_auc_score, r2_score, mean_squared_error
        
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)
        
        classifier_scores = {'train_acc': [], 'test_acc': [], 'train_auc': [], 'test_auc': []}
        regressor1_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        regressor2_scores = {'train_r2': [], 'test_r2': [], 'train_mse': [], 'test_mse': []}
        
        for train_idx, test_idx in kf.split(features_all):
            X_train_a = features_a[train_idx]
            X_test_a = features_a[test_idx] 
            X_train_b = features_b[train_idx]
            X_test_b = features_b[test_idx]
            X_train_c = features_c[train_idx]
            X_test_c = features_c[test_idx]
            X_train_all = features_all[train_idx]
            X_test_all = features_all[test_idx]
            y_train = labels[train_idx]
            y_test = labels[test_idx]
            medal_train = medal_expectation[train_idx]
            medal_test = medal_expectation[test_idx]
            gold_train = gold_expectation[train_idx]
            gold_test = gold_expectation[test_idx]
            
            train_data = data.iloc[train_idx].reset_index(drop=True)
            self.train_classifier(X_train_a, X_train_all, y_train, train_data)
            train_prob = self.classifier.predict_proba(X_train_all)[:, 1]
            test_prob = self.classifier.predict_proba(X_test_all)[:, 1]
            
            classifier_scores['train_acc'].append(accuracy_score(y_train, train_prob > 0.5))
            classifier_scores['test_acc'].append(accuracy_score(y_test, test_prob > 0.5))
            classifier_scores['train_auc'].append(roc_auc_score(y_train, train_prob))
            classifier_scores['test_auc'].append(roc_auc_score(y_test, test_prob))
            
            self.train_regressor1(X_train_b, medal_train)
            train_medal_pred = self.regressor1.predict(np.hstack([train_prob.reshape(-1, 1), X_train_b]))
            test_medal_pred = self.regressor1.predict(np.hstack([test_prob.reshape(-1, 1), X_test_b]))
            regressor1_scores['train_r2'].append(r2_score(medal_train, train_medal_pred))
            regressor1_scores['test_r2'].append(r2_score(medal_test, test_medal_pred))
            regressor1_scores['train_mse'].append(mean_squared_error(medal_train, train_medal_pred))
            regressor1_scores['test_mse'].append(mean_squared_error(medal_test, test_medal_pred))
            
            self.train_regressor2(X_train_c, gold_train)
            train_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * train_medal_pred.reshape(-1, 1),
                self.mu_weight * X_train_c
            ]))
            test_gold_pred = self.regressor2.predict(np.hstack([
                (1 - self.mu_weight) * test_medal_pred.reshape(-1, 1),
                self.mu_weight * X_test_c
            ]))
            
            regressor2_scores['train_r2'].append(r2_score(gold_train, train_gold_pred))
            regressor2_scores['test_r2'].append(r2_score(gold_test, test_gold_pred))
            regressor2_scores['train_mse'].append(mean_squared_error(gold_train, train_gold_pred))
            regressor2_scores['test_mse'].append(mean_squared_error(gold_test, test_gold_pred))

        print("\n模型评估结果 (5-fold交叉验证):")
        print("\n分类器 (预测是否获得奖牌):")
        print(f"训练集 Accuracy: {np.mean(classifier_scores['train_acc']):.4f} (±{np.std(classifier_scores['train_acc']):.4f})")
        print(f"测试集 Accuracy: {np.mean(classifier_scores['test_acc']):.4f} (±{np.std(classifier_scores['test_acc']):.4f})")
        print(f"训练集 AUC: {np.mean(classifier_scores['train_auc']):.4f} (±{np.std(classifier_scores['train_auc']):.4f})")
        print(f"测试集 AUC: {np.mean(classifier_scores['test_auc']):.4f} (±{np.std(classifier_scores['test_auc']):.4f})")
        
        print("\n回归器1 (预测奖牌总数):")
        print(f"训练集 R2: {np.mean(regressor1_scores['train_r2']):.4f} (±{np.std(regressor1_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor1_scores['test_r2']):.4f} (±{np.std(regressor1_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor1_scores['train_mse']):.4f} (±{np.std(regressor1_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor1_scores['test_mse']):.4f} (±{np.std(regressor1_scores['test_mse']):.4f})")
        
        print("\n回归器2 (预测金牌数):")
        print(f"训练集 R2: {np.mean(regressor2_scores['train_r2']):.4f} (±{np.std(regressor2_scores['train_r2']):.4f})")
        print(f"测试集 R2: {np.mean(regressor2_scores['test_r2']):.4f} (±{np.std(regressor2_scores['test_r2']):.4f})")
        print(f"训练集 MSE: {np.mean(regressor2_scores['train_mse']):.4f} (±{np.std(regressor2_scores['train_mse']):.4f})")
        print(f"测试集 MSE: {np.mean(regressor2_scores['test_mse']):.4f} (±{np.std(regressor2_scores['test_mse']):.4f})")

    def train_and_predict_period(self, data, start_year, end_year):
        olympic_years = [1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020]
        period_data = data[(data['Year'] >= start_year) & (data['Year'] <= end_year)].reset_index(drop=True)

        features_a, features_b, features_c, features_all, labels, medal_expectation, gold_expectation = self.preprocess_data(period_data)

        self.train_classifier(features_a, features_all, labels, period_data)
        self.train_regressor1(features_b, medal_expectation)
        self.train_regressor2(features_c, gold_expectation)

        olympic_mask = period_data['Year'].isin(olympic_years)
        olympic_data = period_data[olympic_mask].reset_index(drop=True)
        olympic_features_all = features_all[olympic_mask]
        time_series_data = olympic_features_all.reshape((olympic_features_all.shape[0], 1, olympic_features_all.shape[1]))

        if self.lstm_model is None:
            self.lstm_model = self.create_lstm_attention_model(time_series_data.shape)

        lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)
        self.lstm_model.fit(time_series_data, gold_expectation[olympic_mask], epochs=10, batch_size=32, verbose=1, callbacks=[lr_scheduler])

        last_year_data = period_data[period_data['Year'] == end_year].reset_index(drop=True)
        end_year_olympic_mask = olympic_data['Year'] == end_year
        time_series_last_year = time_series_data[end_year_olympic_mask]

        if len(time_series_last_year) == 0:
            most_recent_year = olympic_data['Year'].max()
            time_series_last_year = time_series_data[olympic_data['Year'] == most_recent_year]

        if len(time_series_last_year.shape) == 2:
            time_series_last_year = np.expand_dims(time_series_last_year, axis=0)

        lstm_gold_prediction = self.lstm_model.predict(time_series_last_year, verbose=0)
        available_years = period_data['Year'].unique()
        prediction_year = end_year if end_year in available_years else available_years.max()

        classifier_probabilities = self.classifier_output[:, 1][period_data['Year'] == prediction_year]
        medal_prediction = self.regressor1.predict(np.hstack([
            classifier_probabilities.reshape(-1, 1), 
            features_b[period_data['Year'] == prediction_year]
        ]))

        gold_prediction_rf = self.regressor2.predict(np.hstack([
            (1 - self.mu_weight) * medal_prediction.reshape(-1, 1),
            self.mu_weight * features_c[period_data['Year'] == prediction_year]
        ]))

        prediction_data = period_data[period_data['Year'] == prediction_year].reset_index(drop=True)
        results = pd.DataFrame({
            'NOC': prediction_data['NOC_x'],
            'Year': prediction_year,
            'Will_Win_Medal': classifier_probabilities > 0.5,
            'Medal_Prediction': medal_prediction,
            'Gold_Prediction': gold_prediction_rf
        })
        print("\n执行模型评估...")
        self.evaluate_model(features_a, features_b, features_c, features_all, 
                            labels, medal_expectation, gold_expectation, period_data)
        return results
    
# Example usage
data = pd.read_csv("E:/2025_MCM_C\Model1/feature_Noc.csv")  # Replace with your actual dataset
model = OlympicPredictionModel()
start_year = 1964
end_year = 2028
results = model.train_and_predict_period(data, start_year, end_year)
print(results)
# Save the results to a CSV file
results.to_csv("E:/2025_MCM_C\Model1/results.csv", index=False)  # Replace with your desired file path