In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def load_data(filepath):
    """
    Read a CSV file and separate the 'Rings' target from the feature columns.

    No train/test split is performed here; callers split the data themselves.

    Parameters:
    filepath (str): Path to the CSV data file (anything pandas.read_csv accepts)

    Returns:
    X, y: DataFrame of every column except 'Rings', and the 'Rings' column
    """
    frame = pd.read_csv(filepath)

    # Everything except the label column is treated as a feature.
    features = frame.drop(columns=['Rings'])
    target = frame['Rings']
    return features, target


def decision_tree_find_best_params_avg(X, y, selected_features=None, n_splits=5):
    """Grid-search decision-tree hyperparameters using repeated random hold-out splits.

    Every combination of ``max_depth``, ``min_samples_split`` and ``criterion``
    is trained and scored on ``n_splits`` independent random 80/20 train/test
    splits. Combinations are ranked by mean weighted F1 score, with mean
    accuracy as the tie-breaker. Per-combination and overall results are
    printed.

    Parameters:
    X: Feature table (pandas DataFrame or 2-D array-like).
    y: Target labels (pandas Series or 1-D array-like).
    selected_features: Optional sequence of column labels (DataFrame) or
        column indices (array) restricting which features are used.
        ``None`` (default) uses every column of ``X``.
    n_splits (int): Number of random train/test splits per combination.

    Returns:
    best_model, best_params: The classifier fitted on the last random split of
    the winning combination, and the winning parameter dict. Note that the
    model is NOT refitted on the full data set.

    Raises:
    ValueError: If ``n_splits`` is smaller than 1.
    """
    from itertools import product

    if n_splits < 1:
        # Averaging zero scores would yield NaN and no winner could be chosen.
        raise ValueError("n_splits must be at least 1")

    # Honour the feature subset (previously this argument was ignored).
    if selected_features is not None:
        columns = list(selected_features)
        X = X[columns] if hasattr(X, 'columns') else np.asarray(X)[:, columns]

    # Hyperparameter search space
    param_grid = {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10, 15],
        'criterion': ['gini', 'entropy']
    }

    best_params = None
    # Start below any reachable score so that a winner is always selected,
    # even if every combination averages an F1 of exactly 0.
    best_avg_accuracy = float('-inf')
    best_avg_f1 = float('-inf')
    best_model = None

    # Walk the full Cartesian product of the grid (same order as nested loops).
    for max_depth, min_samples_split, criterion in product(
        param_grid['max_depth'],
        param_grid['min_samples_split'],
        param_grid['criterion'],
    ):
        params = {
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'criterion': criterion
        }

        accuracies = []
        f1_scores = []

        # Train and evaluate on n_splits fresh random splits
        for _ in range(n_splits):
            # random_state=None: every split is different and not reproducible
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

            dt_clf = DecisionTreeClassifier(**params)
            # np.ravel accepts both pandas objects and plain arrays
            dt_clf.fit(X_train, np.ravel(y_train))

            y_pred = dt_clf.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        # Mean performance of this combination across the splits
        avg_accuracy = np.mean(accuracies)
        avg_f1 = np.mean(f1_scores)

        print(f"Parameters: {params} - Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1:.4f}")

        # Rank by F1 first, accuracy second
        if avg_f1 > best_avg_f1 or (avg_f1 == best_avg_f1 and avg_accuracy > best_avg_accuracy):
            best_avg_f1 = avg_f1
            best_avg_accuracy = avg_accuracy
            best_params = params
            best_model = dt_clf

    print(f"\nBest Parameters: {best_params} - Best Average Accuracy: {best_avg_accuracy:.4f}, Best Average F1 Score: {best_avg_f1:.4f}")
    return best_model, best_params


if __name__ == '__main__':
    # Load the abalone data set and split it into features and the 'Rings' target.
    # NOTE: the path is a hard-coded absolute Windows path; adjust it for your machine.
    X,y = load_data('C:/Users/Admin/Desktop/NN_assignment3/data/abalone.csv')

    # Run the hyperparameter grid search with 5 random splits per combination.
    # The returned (best_model, best_params) tuple is discarded; results are only printed.
    decision_tree_find_best_params_avg(X, y, selected_features=None, n_splits=5)


In [None]:
# Experiment PART
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

def load_data(filepath):
    """
    Read the CSV at `filepath` and return (X, y), where y is the 'Rings'
    column and X holds all remaining columns.
    """
    table = pd.read_csv(filepath)
    label_column = 'Rings'
    return table.drop(columns=[label_column]), table[label_column]

def decision_tree_post_pruning(X, y, selected_features=None, n_splits=5):
    """Find the best post-pruning parameter (ccp_alpha) for a Decision Tree using average performance.

    Candidate ``ccp_alpha`` values are taken from the cost-complexity pruning
    path of an unpruned tree, computed on the training part of one fixed 60/40
    split (``random_state=0``). Each candidate is then trained and scored on
    ``n_splits`` independent random 80/20 splits. Candidates are ranked by mean
    weighted F1 score, with mean accuracy as the tie-breaker. Per-candidate and
    overall results are printed.

    Parameters:
    X: Feature table (pandas DataFrame or 2-D array-like).
    y: Target labels (pandas Series or 1-D array-like).
    selected_features: Optional sequence of column labels (DataFrame) or
        column indices (array) restricting which features are used.
        ``None`` (default) uses every column of ``X``.
    n_splits (int): Number of random train/test splits per candidate alpha.

    Returns:
    best_model, best_ccp_alpha: The classifier fitted on the last random split
    of the winning alpha, and that alpha. The model is NOT refitted on the
    full data set.

    Raises:
    ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        # Averaging zero scores would yield NaN and no winner could be chosen.
        raise ValueError("n_splits must be at least 1")

    # Honour the feature subset (previously this argument was ignored).
    if selected_features is not None:
        columns = list(selected_features)
        X = X[columns] if hasattr(X, 'columns') else np.asarray(X)[:, columns]

    # Start below any reachable score so that a winner is always selected,
    # even if every candidate averages an F1 of exactly 0.
    best_ccp_alpha = None
    best_avg_accuracy = float('-inf')
    best_avg_f1 = float('-inf')
    best_model = None

    # Split the data once to determine the range of ccp_alpha values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    dt = DecisionTreeClassifier(random_state=0)
    path = dt.cost_complexity_pruning_path(X_train, y_train)
    # DecisionTreeClassifier rejects negative ccp_alpha; clip to guard against
    # tiny negative values caused by floating-point round-off in the path.
    ccp_alphas = np.clip(path.ccp_alphas, 0.0, None)

    # Iterate over each value of ccp_alpha
    for ccp_alpha in ccp_alphas:
        accuracies = []
        f1_scores = []

        # Perform n_splits train-test splits and evaluate the model
        for _ in range(n_splits):
            # random_state=None: every split is different and not reproducible
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None)

            # Tree construction itself is seeded; only the data split varies
            dt_pruned = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
            # np.ravel accepts both pandas objects and plain arrays
            dt_pruned.fit(X_train, np.ravel(y_train))

            # Predict and evaluate
            y_pred = dt_pruned.predict(X_test)
            accuracies.append(accuracy_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred, average='weighted'))

        # Calculate average accuracy and F1 score for the current ccp_alpha
        avg_accuracy = np.mean(accuracies)
        avg_f1 = np.mean(f1_scores)

        print(f"ccp_alpha: {ccp_alpha:.4f} - Average Accuracy: {avg_accuracy:.4f}, Average F1 Score: {avg_f1:.4f}")

        # Rank by F1 first, accuracy second
        if avg_f1 > best_avg_f1 or (avg_f1 == best_avg_f1 and avg_accuracy > best_avg_accuracy):
            best_avg_f1 = avg_f1
            best_avg_accuracy = avg_accuracy
            best_ccp_alpha = ccp_alpha
            best_model = dt_pruned

    print(f"\nBest ccp_alpha: {best_ccp_alpha} - Best Average Accuracy: {best_avg_accuracy:.4f}, Best Average F1 Score: {best_avg_f1:.4f}")
    return best_model, best_ccp_alpha

if __name__ == '__main__':
    # Load the abalone data set and split it into features and the 'Rings' target.
    # NOTE: the path is a hard-coded absolute Windows path; adjust it for your machine.
    X, y = load_data('C:/Users/Admin/Desktop/NN_assignment3/data/abalone.csv')
    # Search the pruning path for the best ccp_alpha using 5 random splits per value.
    # The returned (best_model, best_ccp_alpha) tuple is discarded; results are only printed.
    decision_tree_post_pruning(X, y, selected_features=None, n_splits=5)

ccp_alpha: 0.0000 - Average Accuracy: 0.5452, Average F1 Score: 0.5457
ccp_alpha: 0.0002 - Average Accuracy: 0.5498, Average F1 Score: 0.5508
ccp_alpha: 0.0002 - Average Accuracy: 0.5359, Average F1 Score: 0.5362
ccp_alpha: 0.0002 - Average Accuracy: 0.5359, Average F1 Score: 0.5364
ccp_alpha: 0.0002 - Average Accuracy: 0.5359, Average F1 Score: 0.5360
ccp_alpha: 0.0002 - Average Accuracy: 0.5416, Average F1 Score: 0.5417
ccp_alpha: 0.0002 - Average Accuracy: 0.5443, Average F1 Score: 0.5442
ccp_alpha: 0.0002 - Average Accuracy: 0.5411, Average F1 Score: 0.5404
ccp_alpha: 0.0002 - Average Accuracy: 0.5364, Average F1 Score: 0.5365
ccp_alpha: 0.0002 - Average Accuracy: 0.5416, Average F1 Score: 0.5422
ccp_alpha: 0.0002 - Average Accuracy: 0.5478, Average F1 Score: 0.5479
ccp_alpha: 0.0002 - Average Accuracy: 0.5419, Average F1 Score: 0.5432
ccp_alpha: 0.0002 - Average Accuracy: 0.5505, Average F1 Score: 0.5511
ccp_alpha: 0.0003 - Average Accuracy: 0.5435, Average F1 Score: 0.5437
ccp_al