In [12]:
# Basic data handling
import pandas as pd
import numpy as np

# Model & evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Read the scouting data (point DATA_PATH at your reduced dataset)
DATA_PATH = "scouting_reports.csv"
df = pd.read_csv(DATA_PATH)

# Sanity check: (rows, columns) first, then a preview of the first records
print(df.shape)
df.head()


(3224, 122)


Unnamed: 0,player_id,player_name,age,league,value,Goals,Assists,Goals + Assists,Non-Penalty Goals,Penalty Kicks Made,...,Fouls Committed,Fouls Drawn,Offsides,Penalty Kicks Won,Penalty Kicks Conceded,Own Goals,Ball Recoveries,Aerials Won,Aerials Lost,% of Aerials Won
0,dcb6f03c,Bence Dárdai,19.5,Bundesliga,12000000.0,0.07,0.07,0.14,0.07,0.0,...,0.62,1.1,0.0,0.07,0.0,0.0,3.59,0.69,1.04,40.0%
1,9d00bcd3,Jochem Ritmeester van de Kamp,21.8,Keuken Kampioen Divisie,800000.0,0.0,0.05,0.05,0.0,0.0,...,1.08,1.17,0.09,0.0,0.0,0.05,4.45,1.08,1.62,40.0%
2,3cb6f224,Michael Nicolás Santos Rosadilla,32.4,Torneo Clausura,800000.0,0.34,0.0,0.34,0.34,0.0,...,0.67,1.35,1.46,0.11,0.0,0.11,2.02,2.81,4.38,39.1%
3,dc62b55d,Matheus Cunha,26.2,Premier League,60000000.0,0.52,0.21,0.73,0.52,0.0,...,1.46,2.46,0.28,0.0,0.03,0.0,4.33,0.45,1.01,31.0%
4,f86cd5df,Aster Vranckx,22.8,Bundesliga,7000000.0,0.0,0.0,0.0,0.0,0.0,...,1.94,1.78,0.0,0.0,0.0,0.0,3.56,1.13,2.27,33.3%


In [14]:
# Features (X) and target (y)
# The encoded frame gets its own name so `df` keeps the data exactly as loaded
# and this cell can be re-run: reassigning `df` made a second run fail because
# the 'league' column no longer existed after the first one.
model_df = pd.get_dummies(df, columns=['league'], drop_first=True)

# Target, identifiers, and "Unnamed: 0" are not predictors. "Unnamed: 0" looks
# like a row index written out by an earlier to_csv call (0, 1, 2, ... in the
# preview); errors="ignore" keeps the cell working if the CSV does not have it.
X = (
    model_df
    .drop(columns=["value", "player_id", "player_name"])
    .drop(columns=["Unnamed: 0"], errors="ignore")
)
y = model_df["value"]

# Percentage columns arrive as text such as "40.0%". Strip the sign and convert
# to real floats rather than leaving numeric-looking strings in X.
# pd.to_numeric raises on non-numeric text, so an unexpected string column is
# reported here instead of deep inside the model fit.
for col in X.select_dtypes(include='object').columns:
    X[col] = pd.to_numeric(X[col].str.replace('%', '', regex=False))

# Train-test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (2579, 181), Test set: (645, 181)


In [20]:
# Initialize model
rf = RandomForestRegressor(random_state=42)

# Candidate values sampled by the randomized search.
param_distributions = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    # 1.0 means "use all features". It replaces 'auto', which had the same
    # meaning for RandomForestRegressor but was deprecated in scikit-learn 1.1
    # and removed in 1.3, where any candidate drawing it fails to fit.
    # It stays in the first position so the seeded sampler draws the same slots.
    'max_features': [1.0, 'sqrt', 'log2', 0.2, 0.5, 0.8]
}

# NOTE(review): n_iter=3 samples only 3 of the 2,304 combinations above;
# raise it if the runtime budget allows a broader search.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=3,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Train model
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search under the name the later cells use
rf = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)


Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [21]:
# Score the tuned forest on the held-out split
y_pred = rf.predict(X_test)

# Root-mean-squared error (in the units of the target) and R²
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}", f"R² Score: {r2:.4f}", sep="\n")


RMSE: 10375528.69
R² Score: 0.5391


In [None]:
# Get feature importances
# Prefer the names recorded by the fitted model so labels always line up with
# `feature_importances_`, even if `X` was rebuilt after training; fall back to
# X.columns for estimators that did not record feature names.
feature_names = getattr(rf, "feature_names_in_", X.columns)
importances = pd.DataFrame({
    "Feature": feature_names,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Plot top 20
top_features = importances.head(20)
fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to the
# same column as y keeps one colour per bar; dodge=False keeps bars full width.
sns.barplot(
    x="Importance",
    y="Feature",
    hue="Feature",
    dodge=False,
    data=top_features,
    palette="viridis",
    ax=ax,
)
# Drop the legend when seaborn draws one: it would only repeat the y-axis labels.
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title("Top 20 Feature Importances - Random Forest")
fig.tight_layout()
plt.show()
