In [None]:
# XG: XGBoost regression of N2O on the SST, SSS, DIN and aCDOM columns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib
from sklearn.metrics import mean_absolute_error

# Render all figure text produced by this script in Times New Roman.
matplotlib.rcParams['font.family'] = 'Times New Roman'

# Load the training table from a hard-coded local path and discard every row
# that has a missing value in any column.
data = pd.read_csv('C:\\Users\\Administrator\\Desktop\\train.csv')
data.dropna(inplace=True)

# Replace the CHLA column by its base-10 logarithm.
data['CHLA'] = np.log10(data['CHLA'])

# Second NaN pass: log10 of a negative value yields NaN, so those rows are
# removed here.
# NOTE(review): log10(0) yields -inf, which dropna() does not remove; confirm
# whether zero CHLA values can occur and whether they should be filtered.
data_cleaned = data.dropna()

# Reproducibly down-sample to at most 3000 rows. DataFrame.sample raises
# ValueError when n exceeds the number of rows (sampling without
# replacement), so n is capped at what is actually available; with 3000 or
# more rows the result is identical to sampling exactly 3000.
data_cleaned = data_cleaned.sample(n=min(3000, len(data_cleaned)), random_state=42)

# Predictor columns fed to the model; the regression target is the N2O column.
feature_columns = ['SST', 'SSS', 'DIN', 'aCDOM']
X = data_cleaned.loc[:, feature_columns]
y = data_cleaned.loc[:, 'N2O']

# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features. The scaler statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Up-weight the upper tail of the target: training samples whose target value
# lies above the 70th percentile of the training target get weight 1.3, every
# other sample gets weight 1.
threshold = np.quantile(y_train, q=0.7)
weights = np.where(y_train > threshold, 1.3, 1.0)

# Gradient-boosted regressor with 50 trees and a fixed seed, fitted on the
# standardized training features using the sample weights above.
reg = XGBRegressor(
    n_estimators=50,
    random_state=42,
)
reg.fit(X_train_scaled, y_train, sample_weight=weights)

# Predictions for the training split (used for plotting) and the test split
# (used for the evaluation metrics).
y_train_pred, y_pred = (
    reg.predict(split) for split in (X_train_scaled, X_test_scaled)
)

# Test-set error metrics, computed from y_test (observed) and y_pred (modeled).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Mean bias: positive when the model over-predicts on average.
mb = np.mean(y_pred - y_test)

# Mean relative error as a fraction. The 1e-8 offset keeps the division
# finite when an observation is exactly zero.
# NOTE(review): the offset only helps for non-negative observations --
# confirm the target cannot be negative.
mre = np.mean(np.abs((y_pred - y_test) / (y_test + 1e-8)))

# MAPE is the same quantity expressed in percent. It used to be computed with
# an unguarded division by y_test, which turns into inf/NaN as soon as one
# observation is zero; deriving it from mre keeps both metrics consistent.
mape = mre * 100

print(f'R2: {r2:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'Mean Bias: {mb:.4f}')
print(f'Mean Relative Error: {mre:.4f}')

# Square, high-resolution canvas for the observed-vs-modeled comparison.
plt.figure(figsize=(10, 10), dpi=300)
ax = plt.gca()

# Training points are added first, test points second.
plt.scatter(
    y_train, y_train_pred,
    marker='^', color='#EA365B', edgecolors='k', linewidths=0.5,
    label='Training set', s=200,
)
plt.scatter(
    y_test, y_pred,
    marker='o', color='#4286DE', edgecolors='k', linewidths=0.5,
    label='Test set', s=200,
)

# Inward-pointing, thick major ticks with large black labels.
ax.tick_params(axis='both', which='major', labelsize=22, colors='black',
               width=2, direction='in')

# Heavy black frame on all four sides of the axes.
for _side in ('top', 'bottom', 'left', 'right'):
    ax.spines[_side].set_color('black')
    ax.spines[_side].set_linewidth(2)

ax.set_xlabel('Observed dissolved N₂O concentration (nmol/L)',
              fontsize=24, fontfamily='Times New Roman')
ax.set_ylabel('Modeled dissolved N₂O concentration (nmol/L)',
              fontsize=24, fontfamily='Times New Roman')

# Dashed 1:1 reference line spanning the full range of the target, drawn
# beneath the markers (zorder=0).
_target_range = [y.min(), y.max()]
ax.plot(_target_range, _target_range, 'k--', linewidth=2, zorder=0)

# Framed legend with half-size markers in the lower-right corner.
lgd = ax.legend(
    loc='lower right',
    fontsize=15,
    frameon=True,
    edgecolor='black',
    scatterpoints=1,
    markerscale=0.5,
    handletextpad=0.5,
    borderpad=0.5,
)
lgd.get_frame().set_linewidth(2.0)

# Summary of the test-set metrics, placed in the upper-left corner using
# axes-fraction coordinates.
textstr = '\n'.join([
    f'R²: {r2:.2f}',
    f'RMSE: {rmse:.2f}',
    f'MAE: {mae:.2f}',
    f'MRE: {mre:.2f}',
    f'Bias: {mb:.2f}',
])
ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=25,
        verticalalignment='top', bbox={'boxstyle': "round", 'alpha': 0.5})

# Fixed, identical limits on both axes.
ax.set_xlim(5, 80)
ax.set_ylim(5, 80)

ax.set_title("XGBoosting", fontsize=26, fontweight='bold',
             fontname='Times New Roman')
plt.show()