In [3]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from random_forest import DecisionTree
from matplotlib import pyplot as plt
from random_forest import RandomForest
from pre import preprocess
from tqdm import tqdm
import seaborn as sns

In [4]:
# Load the inputs and the regression target; the experiment cells below pass
# them to train_test_split as (X, y).
df, scores = preprocess()


In [None]:
#1-1 n_trees
# Sweep the number of trees (max_depth=5, max_features='sqrt') and average the
# test-set MSE and R^2 over several independent random train/test splits, then
# plot each averaged metric against the tree count.
tree_num = np.arange(4, 71, 3)
n_repeats = 10  # number of random train/test splits each setting is averaged over

# Running sums per tree count; divided by n_repeats when plotting.
mse_list = np.zeros(len(tree_num))
r2_list = np.zeros(len(tree_num))
for j in range(n_repeats):
    print(f"Iteration {j + 1}")
    X_train, X_test, y_train, y_test = train_test_split(df, scores, test_size=0.2, shuffle=True)
    # Address the accumulators by position in tree_num. Deriving the slot from
    # the tree count itself (i // 3 - 1) only works for this exact grid and
    # silently mis-bins results as soon as the start or step is changed.
    for slot, i in enumerate(tree_num):
        print(f"Training Random Forest with {i} trees")
        # Initialize the RandomForest model
        model = RandomForest(
            n_trees=i,
            max_depth=5,
            max_features='sqrt'
        )
        model.fit(X_train, y_train)
        # predict
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mse_list[slot] += mse
        r2_list[slot] += r2

# evaluate
plt.plot(tree_num, mse_list / n_repeats, label='MSE', marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('MSE')
plt.title('Number of Trees vs MSE')
plt.legend()
plt.savefig('mse_vs_trees.png')
plt.show()

plt.plot(tree_num, r2_list / n_repeats, label='R^2', marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('R^2')
plt.title('Number of Trees vs R^2')
plt.legend()
plt.savefig('r2_vs_trees.png')
plt.show()

In [None]:
#1-2 max_features
# Sweep max_features over every integer from 0 to the number of input columns
# (n_trees=10, max_depth=5) and average test-set MSE / R^2 over 10 random
# train/test splits. After the cell runs, mse_list[k] / r2_list[k] hold the
# averaged metric for max_features == k.
# NOTE(review): the sweep starts at max_features=0 - confirm that RandomForest
# handles zero candidate features in a meaningful way.
mse_list = []
r2_list = []
for repeat in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df, scores, test_size=0.2, shuffle=True)
    n_columns = X_train.shape[1]
    for features_nums in range(n_columns + 1):
        print("features_num= ", features_nums)
        model = RandomForest(n_trees=10, max_depth=5, max_features=features_nums)
        model.fit(X_train, y_train)

        # predict, then evaluate on the held-out split
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        if repeat > 0:
            # Later passes add onto the slot created during the first pass.
            mse_list[features_nums] += mse
            r2_list[features_nums] += r2
            continue
        # First pass: create one slot per max_features value.
        mse_list.append(mse)
        r2_list.append(r2)

# Turn the per-setting sums into means over the 10 splits.
mse_list = [total / 10 for total in mse_list]
r2_list = [total / 10 for total in r2_list]



In [None]:
# 1-3 max depth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from random_forest import RandomForest
from pre import preprocess
import warnings
# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

# Font fallbacks for figure text, and an ASCII minus sign on the axes.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei', 'SimHei', 'Arial Unicode MS', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Reload the data and draw a single random 80/20 split that is shared by every
# depth setting below.
df, scores = preprocess()
X_train, X_test, y_train, y_test = train_test_split(df, scores, test_size=0.2, shuffle=True)

# Depth limits to evaluate; None is passed straight through to RandomForest.
depth_values = [3, 5, 8, 10, 12, 15, 18, 20, 25, 30, None]
print(f"test depth: {[str(d) for d in depth_values]}")

# One record per depth setting, turned into a DataFrame after the loop.
results = []
n_settings = len(depth_values)

for position, depth in enumerate(depth_values, start=1):
    depth_str = str(depth)  # str(None) == "None", used as the tick label later
    print(f"[{position}/{n_settings}] test depth: {depth_str}")

    model = RandomForest(n_trees=46, max_depth=depth, max_features='sqrt')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Held-out metrics for this depth.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results.append(dict(depth=depth, depth_str=depth_str, r2=r2, mse=mse, rmse=rmse))

    print(f"    R²: {r2:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}")

results_df = pd.DataFrame(results)

# Row labels of the best settings (highest R^2, lowest MSE).
best_r2_idx = results_df['r2'].idxmax()
best_mse_idx = results_df['mse'].idxmin()



# Plots for the max-depth sweep: one combined 1x3 figure plus one annotated
# standalone figure per metric. The best setting of each metric is marked in red.
x_labels = results_df['depth_str'].tolist()
x_pos = range(len(x_labels))
r2_values = results_df['r2'].values
mse_values = results_df['mse'].values
rmse_values = results_df['rmse'].values
best_rmse_idx = results_df['rmse'].idxmin()

# Subtitle shared by every figure. It has to describe the forest that was
# actually trained in the sweep, which uses n_trees=46; the titles previously
# claimed n_trees=10.
sweep_subtitle = '(n_trees=46, max_features=sqrt)'

# (results_df column, values, row label of best value, line format,
#  axis/legend label, standalone output file)
depth_metrics = [
    ('r2', r2_values, best_r2_idx, 'bo-', 'R² Score', 'r2_vs_depth.png'),
    ('mse', mse_values, best_mse_idx, 'go-', 'MSE', 'mse_vs_depth.png'),
    ('rmse', rmse_values, best_rmse_idx, 'mo-', 'RMSE', 'rmse_vs_depth.png'),
]

# Combined figure: one panel per metric. It is saved but not closed (as
# before), so it remains an open figure after this cell.
fig = plt.figure(figsize=(18, 6))
for panel, (column, values, best_idx, line_fmt, metric_label, _) in enumerate(depth_metrics, start=1):
    best_value = results_df.loc[best_idx, column]
    plt.subplot(1, 3, panel)
    plt.plot(x_pos, values, line_fmt, linewidth=2, markersize=8, label=metric_label)
    plt.scatter(best_idx, best_value,
                color='red', s=100, zorder=5, label=f'Best: {best_value:.4f}')
    plt.xlabel('Max Depth')
    plt.ylabel(metric_label)
    plt.title(f'Depth vs {metric_label}\n{sweep_subtitle}')
    plt.xticks(x_pos, x_labels, rotation=45)
    plt.grid(True, alpha=0.3)
    plt.legend()
    # Small fixed margin so the extreme points are not drawn on the frame.
    plt.ylim(min(values) - 0.01, max(values) + 0.01)

plt.tight_layout()
plt.savefig('depth_analysis_combined.png', dpi=300, bbox_inches='tight')

# Standalone figures: same curves, larger markers, every point annotated with
# its value. Each figure is written to disk and closed immediately.
standalone_figs = []
for column, values, best_idx, line_fmt, metric_label, filename in depth_metrics:
    metric_fig = plt.figure(figsize=(10, 6))
    plt.plot(x_pos, values, line_fmt, linewidth=3, markersize=10)
    plt.scatter(best_idx, results_df.loc[best_idx, column],
                color='red', s=150, zorder=5)
    plt.xlabel('Max Depth', fontsize=12)
    plt.ylabel(metric_label, fontsize=12)
    plt.title(f'Random Forest: Depth vs {metric_label}\n{sweep_subtitle}', fontsize=14)
    plt.xticks(x_pos, x_labels, rotation=45)
    plt.grid(True, alpha=0.3)

    for x, y in zip(x_pos, values):
        plt.annotate(f'{y:.3f}', (x, y), textcoords="offset points",
                     xytext=(0, 10), ha='center', fontsize=9)

    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.close(metric_fig)
    standalone_figs.append(metric_fig)

# Keep the figure handles under their previous names.
fig1, fig2, fig3 = standalone_figs

print("finished!")



In [None]:
# 2-1 fix n_trees=35
# Grid over max_depth x max_features with the forest size held at 35 trees.
# Every grid point is trained and scored on the same random 80/20 split, and
# the resulting MSE / R^2 tables are drawn as annotated heatmaps.
X_train, X_test, y_train, y_test = train_test_split(df, scores, test_size=0.2, shuffle=True)
depth_num = np.arange(10, 101, 30)
feature_num = np.arange(3, 34, 10)

# Rows follow depth_num, columns follow feature_num.
mse_list = np.zeros((len(depth_num), len(feature_num)))
r2_list = np.zeros((len(depth_num), len(feature_num)))


for i, depth in enumerate(depth_num):
    print(f"Training Random Forest with max depth {depth}")
    for j, n_feature in enumerate(feature_num):
        print(f"Training Random Forest with {n_feature} features")
        # Initialize the RandomForest model
        model = RandomForest(
            n_trees=35,
            max_depth=depth,
            max_features=n_feature
        )
        model.fit(X_train, y_train)
        # predict
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mse_list[i, j] = mse
        r2_list[i, j] = r2


# Label rows/columns with the actual hyper-parameter values for the heatmaps.
mse_list = pd.DataFrame(mse_list, index=depth_num, columns=feature_num)
r2_list = pd.DataFrame(r2_list, index=depth_num, columns=feature_num)

# evaluate
# The output files carry a cell-specific suffix: the depth-vs-trees experiment
# (2-2) also writes mse_heatmap.png / r2_heatmap.png, so with shared names a
# full run of the notebook overwrote the figures produced here.
plt.figure()  # start from a fresh figure instead of whatever is current
sns.heatmap(mse_list, annot=True, fmt=".4f", cmap='viridis')
plt.xlabel('max_features')
plt.ylabel('max_depth')
plt.title('MSE heatmap')
plt.savefig('mse_heatmap_depth_vs_features.png')
plt.show()

plt.figure()
sns.heatmap(r2_list, annot=True, fmt=".4f", cmap='viridis')
plt.xlabel('max_features')
plt.ylabel('max_depth')
plt.title('R^2 heatmap')
plt.savefig('r2_heatmap_depth_vs_features.png')
plt.show()

In [None]:
#2-2 fix max_features= 13
# Grid over max_depth x n_trees with max_features held at 13. This cell does
# not create its own split: it trains and scores on the X_train / X_test /
# y_train / y_test that are already defined in the session.
tree_num = np.arange(25, 55, 3)
depth_num = np.arange(20, 55, 3)

# Rows follow depth_num, columns follow tree_num.
grid_shape = (len(depth_num), len(tree_num))
mse_list = np.zeros(grid_shape)
r2_list = np.zeros(grid_shape)

for row, depth in enumerate(depth_num):
    print(f"Training Random Forest with max depth {depth}")
    for col, n_tree in enumerate(tree_num):
        print(f"Training Random Forest with {n_tree} trees")
        model = RandomForest(n_trees=n_tree, max_depth=depth, max_features=13)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Score this grid point on the held-out split.
        mse_list[row, col] = mean_squared_error(y_test, y_pred)
        r2_list[row, col] = r2_score(y_test, y_pred)

mse_list = pd.DataFrame(mse_list, index=depth_num, columns=tree_num)
r2_list = pd.DataFrame(r2_list, index=depth_num, columns=tree_num)

# evaluate: one annotated heatmap per metric, each saved and then shown.
for heat_table, heat_title, heat_file in (
    (mse_list, 'MSE heatmap', 'mse_heatmap.png'),
    (r2_list, 'R^2 heatmap', 'r2_heatmap.png'),
):
    plt.figure(figsize=(12, 8))
    sns.heatmap(
        heat_table,
        annot=True,
        fmt=".4f",
        cmap='viridis',
        xticklabels=tree_num,
        yticklabels=depth_num,
        annot_kws={"size": 6},
    )
    plt.xlabel('Number of Trees')
    plt.ylabel('Depth of Trees')
    plt.title(heat_title)
    plt.savefig(heat_file)
    plt.show()

In [None]:
#2-3 fixed max depth= 15
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from random_forest import RandomForest
from pre import preprocess
import warnings
# Suppress every Python warning from this point on.
warnings.filterwarnings('ignore')

import time

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei', 'SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
sns.set_style("whitegrid")


# Reload the data and draw one random 80/20 split shared by the whole grid.
X, y = preprocess()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)


# Grid: max_features strategy (rows) x number of trees (columns), max_depth=15.
n_trees_values = [5, 10, 20, 30, 50, 80, 100]
max_features_values = ['sqrt', 'log2', 5, 7, 9, 11, 13]

grid_shape = (len(max_features_values), len(n_trees_values))
r2_matrix = np.zeros(grid_shape)
mse_matrix = np.zeros(grid_shape)
time_matrix = np.zeros(grid_shape)  # seconds spent in model.fit per grid point

total = len(max_features_values) * len(n_trees_values)

for i, max_features in enumerate(max_features_values):
    for j, n_trees in enumerate(n_trees_values):

        # 1-based running counter for the progress line.
        current = i * len(n_trees_values) + j + 1

        feature_str = str(max_features) if max_features is not None else "All"
        print(f"[{current:2d}/{total}] 測試: n_trees={n_trees:3d}, max_features={feature_str}")

        model = RandomForest(
            n_trees=n_trees,
            max_depth=15,
            max_features=max_features
        )

        # Time only the fit call. The value is reported and plotted as
        # "training time", so prediction and metric computation must not be
        # included. perf_counter() is a monotonic high-resolution clock intended
        # for measuring elapsed intervals; time.time() can jump when the system
        # clock is adjusted.
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        training_time = time.perf_counter() - start_time

        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        r2_matrix[i, j] = r2
        mse_matrix[i, j] = mse
        time_matrix[i, j] = training_time

        print(f"     R² = {r2:.4f}, MSE = {mse:.4f}, 時間 = {training_time:.2f}s")

# Human-readable row labels for the heatmaps, one per max_features setting.
# String strategies and fractions are shown with the column count they resolve
# to under int(sqrt(n)), int(log2(n)) or int(fraction * n).
# NOTE(review): this assumes RandomForest rounds 'sqrt' / 'log2' the same way -
# confirm against its implementation.
n_total_features = X_train.shape[1]

feature_labels = []
for f in max_features_values:
    if f == 'sqrt':
        feature_labels.append(f'sqrt({int(np.sqrt(n_total_features))})')
    elif f == 'log2':
        feature_labels.append(f'log2({int(np.log2(n_total_features))})')
    elif f is None:
        feature_labels.append(f'All({n_total_features})')
    elif isinstance(f, (int, np.integer)):
        # Integers are absolute feature counts (that is how the other
        # experiments in this notebook sweep max_features). They were being
        # multiplied by the column count as if they were fractions, so 5 was
        # labelled "5(5 * n_features)".
        feature_labels.append(str(f))
    else:
        # Anything else is treated as a fraction of the available columns.
        feature_labels.append(f'{f}({int(f * n_total_features)})')

# Column labels: the tree counts as strings.
tree_labels = [str(n) for n in n_trees_values]

# 2x2 dashboard for the max_features x n_trees grid: R^2, MSE, the measured
# time, and R^2 per second. The whole figure is saved as one PNG, then shown.
fig, axes = plt.subplots(2, 2, figsize=(20, 16))

# R^2 obtained per second of measured time, element-wise.
efficiency_matrix = r2_matrix / time_matrix

# One entry per panel:
# (target axes, data, annotation format, colormap, extra heatmap kwargs,
#  colorbar label, panel title)
# The two quality panels use diverging colormaps centred on their own mean.
heatmap_specs = [
    (axes[0, 0], r2_matrix, '.4f', 'RdYlGn', {'center': np.mean(r2_matrix)},
     'R² Score', 'R² Score Heatmap\n(max_depth=15)'),
    (axes[0, 1], mse_matrix, '.4f', 'RdYlGn_r', {'center': np.mean(mse_matrix)},
     'MSE', 'MSE Heatmap\n(max_depth=15)'),
    (axes[1, 0], time_matrix, '.2f', 'YlOrRd', {},
     'Training Time (seconds)', 'Training Time Heatmap\n(max_depth=15)'),
    (axes[1, 1], efficiency_matrix, '.4f', 'viridis', {},
     'Efficiency (R² / Time)', 'Training Efficiency Heatmap\n(R² Score / Training Time)'),
]

for panel_ax, panel_data, annot_fmt, color_map, extra_kwargs, cbar_label, panel_title in heatmap_specs:
    sns.heatmap(
        panel_data,
        xticklabels=tree_labels,
        yticklabels=feature_labels,
        annot=True,
        fmt=annot_fmt,
        cmap=color_map,
        ax=panel_ax,
        cbar_kws={'label': cbar_label},
        **extra_kwargs,
    )
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Number of Trees', fontsize=12)
    panel_ax.set_ylabel('Max Features Strategy', fontsize=12)

plt.tight_layout()
plt.savefig('random_forest_heatmap_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

