In [1]:
import pandas as pd
import glob
import random
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler

In [2]:
"""
数据准备
"""
folder_path = './data_with_momentum'
file_list = glob.glob(f'{folder_path}/2023-wimbledon-*.csv')
dataframes = [pd.read_csv(file) for file in file_list]
# combined_data = pd.concat(dataframes, ignore_index=True)

In [3]:
"""
特征工程：创建新的特征列
"""
# 数据拆分
def dataset_split(dataframes, ratio=0.2):
    # 确定训练集和测试集的数量
    total_dataframes = len(dataframes)
    num_test = int(total_dataframes * ratio)
    num_train = total_dataframes - num_test

    # 随机选择DataFrame分配到训练集和测试集
    random.shuffle(dataframes)
    train_dfs = dataframes[:num_train]
    test_dfs = dataframes[num_train:]

    # 合并DataFrame
    combined_train_df = pd.concat(train_dfs, ignore_index=True)
    combined_test_df = pd.concat(test_dfs, ignore_index=True)

    return combined_train_df, combined_test_df


combined_train_df, combined_test_df = dataset_split(dataframes, ratio=0.2)

# 从合并后的DataFrame中提取特征和目标变量
feature_columns = ['p1_sets', 'p2_sets', 'p1_games', 'p2_games', 'server', 'p1_ace', 'p2_ace', 'p1_double_fault', 'p2_double_fault', 'p1_unf_err', 'p2_unf_err', 'p1_break_pt_won', 'p2_break_pt_won', 'p1_break_pt_missed', 'p2_break_pt_missed', 'p1_distance_run', 'p2_distance_run', 'rally_count', 'speed_mph']
target_column = 'score_diff'

# 创建新的特征列
combined_train_df['score_diff'] = combined_train_df['comprehensive_momentum_1'] - combined_train_df['comprehensive_momentum_2']
combined_test_df['score_diff'] = combined_test_df['comprehensive_momentum_1'] - combined_test_df['comprehensive_momentum_2']

X_train = combined_train_df[feature_columns]
y_train = combined_train_df[target_column]
X_test = combined_test_df[feature_columns]
y_test = combined_test_df[target_column]

# 特征标准化
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
"""
选择模型
"""
model = RandomForestClassifier()  # 或 GradientBoostingClassifier()

# 训练模型
model.fit(X_train, y_train)

# 交叉验证评估
scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-validated scores:", scores)

# 测试集预测
y_pred = model.predict(X_test)

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
"""
性能评估
"""
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)