In [2]:
# Basic data handling
import pandas as pd
import numpy as np

# Model & evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Read the reduced players table (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="players_reduced.csv")  # swap in your own reduced dataset

# Sanity check: print the dimensions, then preview the leading rows
print(df.shape)
df.head()


In [None]:
# Target is the "value" column; every other column is used as a predictor
y = df["value"]
X = df.drop(columns=["value"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


In [None]:
# Random forest regressor: 500 trees, fixed seed, n_jobs=-1 to use all available cores
rf = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=500)

# Fit on the training split; as the cell's last expression this also
# displays the fitted estimator in the notebook output
rf.fit(X_train, y_train)


In [None]:
# Predict the target for the held-out test rows
y_pred = rf.predict(X_test)

# Metrics: RMSE is the square root of the mean squared error, so it is in the
# same units as the target; r2 is the coefficient of determination from r2_score.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
# \u00b2 is SUPERSCRIPT TWO. It is written as an escape so the label cannot be
# corrupted by a wrong file encoding (it previously rendered as mojibake).
print(f"R\u00b2 Score: {r2:.4f}")


In [None]:
# Pair each feature name with its importance from the fitted forest, largest first
importances = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Plot the top 20 (head() simply returns fewer rows if there are fewer features)
top_features = importances.head(20)
fig, ax = plt.subplots(figsize=(10, 6))

# Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so the y
# variable is also assigned to `hue`. dodge=False keeps one full-width bar per
# feature on seaborn versions where hue would otherwise dodge the bars.
sns.barplot(
    x="Importance",
    y="Feature",
    hue="Feature",
    data=top_features,
    palette="viridis",
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the y-axis labels; drop it if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("Top 20 Feature Importances - Random Forest")
fig.tight_layout()
plt.show()
