In [37]:
# Basic data handling
import pandas as pd
import numpy as np

# Model & evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load dataset
df = pd.read_csv("scouting_reports.csv")  # use your reduced dataset

# Quick check
print(df.shape)
df.head()


(3224, 122)


Unnamed: 0,player_id,player_name,age,league,value,Goals,Assists,Goals + Assists,Non-Penalty Goals,Penalty Kicks Made,...,Fouls Committed,Fouls Drawn,Offsides,Penalty Kicks Won,Penalty Kicks Conceded,Own Goals,Ball Recoveries,Aerials Won,Aerials Lost,% of Aerials Won
0,dcb6f03c,Bence Dárdai,19.5,Bundesliga,12000000.0,0.07,0.07,0.14,0.07,0.0,...,0.62,1.1,0.0,0.07,0.0,0.0,3.59,0.69,1.04,40.0%
1,9d00bcd3,Jochem Ritmeester van de Kamp,21.8,Keuken Kampioen Divisie,800000.0,0.0,0.05,0.05,0.0,0.0,...,1.08,1.17,0.09,0.0,0.0,0.05,4.45,1.08,1.62,40.0%
2,3cb6f224,Michael Nicolás Santos Rosadilla,32.4,Torneo Clausura,800000.0,0.34,0.0,0.34,0.34,0.0,...,0.67,1.35,1.46,0.11,0.0,0.11,2.02,2.81,4.38,39.1%
3,dc62b55d,Matheus Cunha,26.2,Premier League,60000000.0,0.52,0.21,0.73,0.52,0.0,...,1.46,2.46,0.28,0.0,0.03,0.0,4.33,0.45,1.01,31.0%
4,f86cd5df,Aster Vranckx,22.8,Bundesliga,7000000.0,0.0,0.0,0.0,0.0,0.0,...,1.94,1.78,0.0,0.0,0.0,0.0,3.56,1.13,2.27,33.3%


In [39]:
# Drop rows with missing target
df = df.dropna(subset=["value"])

# One-hot encode categorical features
df = pd.get_dummies(df, columns=['league'], drop_first=True)

# Clean percentage strings and convert to numeric
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.replace('%', '', regex=False)
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Fill any remaining NaNs with 0 (can also try median imputation)
df = df.fillna(0)

# Features and target
X = df.drop(columns=["value", "player_id", "player_name"])

# Log-transform the target to stabilize variance
y = np.log1p(df["value"])  # comment this out if you don't want transformation

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (2579, 181), Test set: (645, 181)


In [40]:
# Initialize model
rf = RandomForestRegressor(random_state=42)
param_distributions = {
    'n_estimators': [200, 500, 1000, 1500],
    'max_depth': [None, 20, 40, 60, 80],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [0.3, 0.5, 0.7, 'sqrt']
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=50,  # much higher
    cv=5,
    scoring='r2',  # explicitly optimize for R²
    verbose=2,
    random_state=42,
    n_jobs=-1
)


In [46]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# 1. Fit RandomizedSearchCV to find the best parameters
random_search.fit(X_train, y_train)

# 2. Get the best model
rf_best = random_search.best_estimator_

# 3. Predictions on test set
y_pred = rf_best.predict(X_test)

# 4. If you applied log-transform earlier, reverse it (uncomment if needed)
# y_pred = np.expm1(y_pred)
# y_test = np.expm1(y_test)

# 5. Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END max_depth=40, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   8.9s
[CV] END max_depth=40, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   9.0s
[CV] END max_depth=40, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   9.0s
[CV] END max_depth=40, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   9.1s
[CV] END max_depth=40, max_features=0.5, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=   9.3s
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  30.0s
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  30.1s
[CV] END max_depth=40, max_features=0.3, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; to



[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=1500; total time= 1.3min
[CV] END max_depth=None, max_features=0.7, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  29.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   6.6s
[CV] END max_depth=None, max_features=0.5, min_samples_leaf=1, min_samples_split=2, n_estimators=1500; total time= 1.3min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   7.1s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   7.0s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   6.6s
[CV] END max_depth=None, max_features=0.7, min_samples_leaf=4, min_samples_split=5, n_estimators=500; total time=  30.0s
[CV] END max_depth=40, max

In [47]:
# Get feature importances
importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Plot top 20
plt.figure(figsize=(10,6))
sns.barplot(x="Importance", y="Feature", data=importances.head(20), palette="viridis")
plt.title("Top 20 Feature Importances - Random Forest")
plt.tight_layout()
plt.show()


NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.