### 导入数据，进行初步的EDA。

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



df = pd.read_csv('../data/Label/Updated_Medal_Counts_with_Athlete_Numbers.csv')

### 进行特征筛选与特征降维
- 首先分析变量与目标变量之间的相关性，然后分析变量与目标变量的互信息，综合考虑两者的结果，进行特征选择。
- 观察看有没有必要进行特征降维，如果有，可以考虑使用PCA等方法进行特征降维。

In [30]:
df.head()
target = ['Gold', 'Silver', 'Bronze','Total']
features = ['Rank', 'Host','Number_of_people']

# # 相关性分析
# corr = df[features+target].corr()
# plt.figure(figsize=(10, 10))
# sns.heatmap(corr, annot=True, cmap='coolwarm')
# plt.title('Correlation Matrix')
# plt.show()

    
# # 统计之前从来没有获得过奖牌的国家
medal_0_country = []
# for country in df['NOC'].unique():
#     country_df = df[df['NOC'] == country]
#     if country_df['Goden'].sum() == 0:
#         medal_0_country.append(country)
        
# print('The number of countries that have never won a medal:', len(medal_0_country))


df1 = pd.read_csv('../data/summerOly_athletes.csv')
#把Medal 这列的数据进行lable编码
df1['Medal'] = df1['Medal'].map({'Gold': 3, 'Silver': 2, 'Bronze': 1, 'No medal': 0})
for country in df1['NOC'].unique():
    country_df = df1[df1['NOC'] == country]
    #  观察Medal列，如果这个国家一直都是"no medal"，那么这个国家就是从来没有获得过奖牌的国家
    if country_df['Medal'].sum() == 0:
        medal_0_country.append(country)

        
        
print('The number of countries that have never won a medal:', len(medal_0_country))
print(medal_0_country)

    
    

The number of countries that have never won a medal: 77
['CHA', 'NCA', 'LBA', 'PLE', 'COM', 'BRU', 'MDV', 'YAR', 'CGO', 'BEN', 'SOM', 'MLI', 'ANG', 'BAN', 'ESA', 'HON', 'SEY', 'MTN', 'SKN', 'VIN', 'LBR', 'NEP', 'PLW', 'ASA', 'SAM', 'RWA', 'MLT', 'GUI', 'BIZ', 'YMD', 'SLE', 'PNG', 'YEM', 'OMA', 'VAN', 'IVB', 'CAF', 'MAD', 'MAL', 'BIH', 'GUM', 'CAY', 'GBS', 'TLS', 'COD', 'LAO', 'ROT', 'CAM', 'SOL', 'CRT', 'GEQ', 'BOL', 'SAA', 'ANT', 'AND', 'FSM', 'MYA', 'MAW', 'RHO', 'STP', 'LIE', 'GAM', 'COK', 'SWZ', 'NBO', 'ARU', 'NRU', 'VNM', 'BHU', 'MHL', 'KIR', 'UNK', 'TUV', 'NFL', 'SSD', 'LES', 'LBN']


### 接下来完成第一步也就是零膨胀模型的构建
- 首先判断历史数据的平稳性，使用Augmented Dickey-Fuller检验验证奖牌数时间序列的平稳性。
- 然后为第一部分（零膨胀逻辑回归）选择合适的特征，使用逻辑回归模型预测某国是否属于“永远无法获奖”的群体（结构性零值）。
- 对于第二部分（计数部分），如果数据不平稳就选择负二项分布，如果数据平稳就选择泊松分布。
- 最后，将两部分的结果相乘，得到最终的预测结果。


In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP


def check_stationarity(df):
    for col in target:
        # ADF test
        result = adfuller(df[col])
        print(f'ADF Statistic for {col}: {result[0]}')
        print(f'p-value: {result[1]}')
        if result[1] > 0.05:
            return False
    return True

## 开始训练零膨胀负二项模型


# 首先检查时间序列的平稳性
if check_stationarity(df[target]):
    print('Time Series is Stationary')

# 划分训练集和测试集
from sklearn.model_selection import train_test_split
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
formular = f'{target[0]} ~ {" + ".join(features)}'

# 训练模型
model = ZeroInflatedNegativeBinomialP.from_formula(formular, data=X_train,exog_infl=~1)
zinb_results = model.fit(maxiter=1000)

## 预测
y_pred = zinb_results.predict(X_test,which='prob')

#计算ROC曲线
from sklearn.metrics import roc_curve, roc_auc_score
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)

#寻找最佳阈值
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
plt.scatter(fpr[optimal_idx], tpr[optimal_idx], color='red', label='Best Threshold')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')



#预测下一届奥运会哪些国家第一次获得奖牌

next_year = 2028
prediction = zinb_results.predict(df[features.keys()])
print(f'Predicted Medal Counts for {next_year} Olympics:')
print(prediction)







        

ADF Statistic for Gold: -17.049991695407932
p-value: 8.082859650011109e-30
ADF Statistic for Silver: -10.192795347880692
p-value: 6.258055104118044e-18
ADF Statistic for Bronze: -12.867253968284931
p-value: 4.964890754665885e-24
ADF Statistic for Total: -14.598533929571866
p-value: 4.202245859199421e-27
Time Series is Stationary


PatsyError: Error evaluating factor: NameError: name 'Gold' is not defined
    Gold ~ Rank + Host + Number_of_people
    ^^^^

### 第二阶段使用混合预测模型
- 首先使用ARIMAX进行时间序预测，结合历史数据和外部因素，预测下一届奥运会的奖牌数。
- 然后结合Xgboost进行残差预测，得到最终的预测结果。

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from sklearn.preprocessing import StandardScaler

## 平稳性处理模块
def make_stationary(series):
    d = 0 
    while adfuller(series)[1] > 0.05:
        series = series.diff().dropna()
        d += 1
    return series, d

#从外部变量预处理模块
# def prepare