In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the raw Hitters data set.
df = pd.read_csv("hitters.csv")

# Report the number of missing values per column.
print(df.isnull().sum())

# Feature engineering, expressed as a single chain:
#   1. drop the rows whose Salary is missing,
#   2. add the career ratio features and the Years * CHits interaction,
#   3. replace Salary (the dependent variable) with log1p(Salary).
df = (
    df.dropna(subset=["Salary"])
    .assign(
        Carrier=lambda frame: frame["CRuns"] / frame["CAtBat"],
        Carrier2=lambda frame: frame["CHits"] / frame["CAtBat"],
        experience=lambda frame: frame["Years"] * frame["CHits"],
        Salary=lambda frame: np.log1p(frame["Salary"]),
    )
)
display(df)

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,League,Division,PutOuts,Assists,Errors,Salary,NewLeague,Carrier,Carrier2,experience
1,315,81,7,24,38,39,14,3449,835,69,...,N,W,632,43,10,6.165418,N,0.093070,0.242099,11690
2,479,130,18,66,72,76,3,1624,457,63,...,A,W,880,82,14,6.175867,A,0.137931,0.281404,1371
3,496,141,20,65,78,37,11,5628,1575,225,...,N,E,200,11,3,6.216606,N,0.147122,0.279851,17325
4,321,87,10,39,42,30,2,396,101,12,...,N,E,805,40,4,4.527209,N,0.121212,0.255051,202
5,594,169,4,74,51,35,11,4408,1133,19,...,A,W,282,421,25,6.621406,A,0.113657,0.257033,12463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,...,N,E,325,9,3,6.552508,N,0.140215,0.298187,4030
318,492,136,5,76,50,94,12,5511,1511,39,...,A,E,313,381,20,6.775366,A,0.162765,0.274179,18132
319,475,126,3,61,43,52,6,1700,433,7,...,A,W,37,113,7,5.955837,A,0.127647,0.254706,2598
320,573,144,9,85,60,78,8,3198,857,97,...,A,E,1314,131,12,6.867974,A,0.146967,0.267980,6856


In [None]:
# Collect column names by dtype: object columns are treated as categorical,
# numeric columns as numerical (with the Salary target taken out of the list).
categorical_cols = list(df.select_dtypes(include='object').columns)
numerical_cols = list(df.select_dtypes(include=np.number).columns)
numerical_cols.remove('Salary')

print(categorical_cols)

# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column and dtype=int requests integer indicator columns.
df = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)
display(df)
print(df.dtypes)


['League', 'Division', 'NewLeague']


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,...,PutOuts,Assists,Errors,Salary,Carrier,Carrier2,experience,League_N,Division_W,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,...,632,43,10,6.165418,0.093070,0.242099,11690,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,...,880,82,14,6.175867,0.137931,0.281404,1371,0,1,0
3,496,141,20,65,78,37,11,5628,1575,225,...,200,11,3,6.216606,0.147122,0.279851,17325,1,0,1
4,321,87,10,39,42,30,2,396,101,12,...,805,40,4,4.527209,0.121212,0.255051,202,1,0,1
5,594,169,4,74,51,35,11,4408,1133,19,...,282,421,25,6.621406,0.113657,0.257033,12463,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,497,127,7,65,48,37,5,2703,806,32,...,325,9,3,6.552508,0.140215,0.298187,4030,1,0,1
318,492,136,5,76,50,94,12,5511,1511,39,...,313,381,20,6.775366,0.162765,0.274179,18132,0,0,0
319,475,126,3,61,43,52,6,1700,433,7,...,37,113,7,5.955837,0.127647,0.254706,2598,0,1,0
320,573,144,9,85,60,78,8,3198,857,97,...,1314,131,12,6.867974,0.146967,0.267980,6856,0,0,0


AtBat            int64
Hits             int64
HmRun            int64
Runs             int64
RBI              int64
Walks            int64
Years            int64
CAtBat           int64
CHits            int64
CHmRun           int64
CRuns            int64
CRBI             int64
CWalks           int64
PutOuts          int64
Assists          int64
Errors           int64
Salary         float64
Carrier        float64
Carrier2       float64
experience       int64
League_N         int32
Division_W       int32
NewLeague_N      int32
dtype: object


In [None]:
# Separate the predictors from the Salary target.
X = df.drop(columns=['Salary'])
y = df['Salary']

# Train/test split: 20% of the rows are held out, random_state=42 is passed
# so the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training split only and apply the same fitted scaler
# to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

def train_and_evaluate_model(model, model_name):
    """Fit ``model`` on the scaled training split and print its test-set scores.

    The data is not passed in: the function reads the module-level names
    ``X_train_scaled``, ``y_train``, ``X_test_scaled`` and ``y_test``, so the
    cell that creates them must have been executed first.

    Args:
        model: Estimator object providing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label printed in the header line for this model.

    Returns:
        None. The test MSE and R2 are printed, not returned.
    """
    print(f"\n--- {model_name} ---")

    # Train on the training split, then predict the held-out split.
    model.fit(X_train_scaled, y_train)
    test_predictions = model.predict(X_test_scaled)

    # Evaluate both metrics against the same predictions.
    test_mse, test_r2 = (
        metric(y_test, test_predictions)
        for metric in (mean_squared_error, r2_score)
    )

    print(f"Test Seti MSE: {test_mse:.4f}, R2: {test_r2:.4f}")


In [None]:
# Baseline estimators (before any hyperparameter search).
linear_model = LinearRegression()
ridge_model = Ridge(random_state=42)
lasso_model = Lasso(random_state=42)
rf_model_initial = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# (estimator, printed label) pairs, evaluated in this order.
baseline_models = [
    (linear_model, "LinearRegression"),
    (ridge_model, "Ridge"),
    (lasso_model, "Lasso"),
    (rf_model_initial, "RandomForestRegressor"),
    (gb_model, "GradientBoostingRegressor"),
    (xgb_model, "XGBoostRegressor"),
]

for estimator, label in baseline_models:
    train_and_evaluate_model(estimator, label)


--- LinearRegression ---
Test Seti MSE: 0.3116, R2: 0.5127

--- Ridge ---
Test Seti MSE: 0.3389, R2: 0.4700

--- Lasso ---
Test Seti MSE: 0.6397, R2: -0.0004

--- RandomForestRegressor ---
Test Seti MSE: 0.1952, R2: 0.6948

--- GradientBoostingRegressor ---
Test Seti MSE: 0.1463, R2: 0.7712

--- XGBoostRegressor ---
Test Seti MSE: 0.1946, R2: 0.6956


In [12]:
# Ridge search space: 1000 log-spaced alpha values from 10**-1 to 10**1.
param_grid_ridge = {"alpha": np.logspace(-1, 1, 1000)}

# Grid search over alpha, scored by R2 with 2-fold cross-validation on the
# scaled training split.
grid_search_ridge = GridSearchCV(Ridge(random_state=33), param_grid_ridge, cv=2, scoring="r2", n_jobs=-1, verbose=1)
grid_search_ridge.fit(X_train_scaled, y_train)

best_ridge_model = grid_search_ridge.best_estimator_

# Score the tuned Ridge model on the held-out test split. The results are
# stored under Ridge-specific names (they were previously called
# mse_xgb / r2_xgb, which mislabelled them as XGBoost metrics).
y_pred_ridge_grid = best_ridge_model.predict(X_test_scaled)
mse_ridge_grid = mean_squared_error(y_test, y_pred_ridge_grid)
r2_ridge_grid = r2_score(y_test, y_pred_ridge_grid)
print(f"Ridge - MSE: {mse_ridge_grid:.4f}, R²: {r2_ridge_grid:.4f}")

Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
Ridge - MSE: 0.3425, R²: 0.4644


In [13]:
# Lasso search space: 1000 log-spaced alpha values from 10**-3 to 10**1.
param_grid_lasso = dict(alpha=np.logspace(-3, 1, 1000))

# Grid search over alpha, scored by R2 with 2-fold cross-validation.
grid_search_lasso = GridSearchCV(
    estimator=Lasso(random_state=42, max_iter=10000),
    param_grid=param_grid_lasso,
    scoring='r2',
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search_lasso.fit(X_train_scaled, y_train)
best_lasso_model = grid_search_lasso.best_estimator_

# Predict the test split with the best estimator found and report its scores.
y_pred_lasso_grid = best_lasso_model.predict(X_test_scaled)
mse_lasso_grid, r2_lasso_grid = (
    mean_squared_error(y_test, y_pred_lasso_grid),
    r2_score(y_test, y_pred_lasso_grid),
)
print(f"Lasso - MSE: {mse_lasso_grid:.4f}, R²: {r2_lasso_grid:.4f}")

Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
Lasso - MSE: 0.3259, R²: 0.4904


In [None]:
# Random Forest search space (2 * 3 * 2 * 2 * 3 = 72 combinations).
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt', 'log2'],
}

# Grid search scored by R2 with 4-fold cross-validation.
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    scoring='r2',
    cv=4,
    n_jobs=-1,
    verbose=1,
)
grid_search_rf.fit(X_train_scaled, y_train)
best_rf_model = grid_search_rf.best_estimator_

# Predict the test split with the best estimator found and report its scores.
y_pred_rf_grid = best_rf_model.predict(X_test_scaled)
mse_rf_grid, r2_rf_grid = (
    mean_squared_error(y_test, y_pred_rf_grid),
    r2_score(y_test, y_pred_rf_grid),
)
print(f"Random Forest - MSE: {mse_rf_grid:.4f}, R²: {r2_rf_grid:.4f}")

Fitting 4 folds for each of 72 candidates, totalling 288 fits
Random Forest - MSE: 0.1877, R²: 0.7064


In [14]:
# Gradient Boosting search space (2 * 3 * 3 * 2 * 2 * 2 = 144 combinations).
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
}

# Grid search scored by R2 with 2-fold cross-validation.
grid_search_gb = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid=param_grid_gb,
    scoring='r2',
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search_gb.fit(X_train_scaled, y_train)
best_gb_model = grid_search_gb.best_estimator_

# Predict the test split with the best estimator found and report its scores.
y_pred_gb_grid = best_gb_model.predict(X_test_scaled)
mse_gb_grid, r2_gb_grid = (
    mean_squared_error(y_test, y_pred_gb_grid),
    r2_score(y_test, y_pred_gb_grid),
)
print(f"Gradient Boosting - MSE: {mse_gb_grid:.4f}, R²: {r2_gb_grid:.4f}")

Fitting 2 folds for each of 144 candidates, totalling 288 fits
Gradient Boosting - MSE: 0.1475, R²: 0.7694


  _data = np.array(data, dtype=dtype, copy=copy,


In [15]:
# XGBoost search space (2 * 3**7 = 4374 combinations).
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0.1, 1, 10],
}

# Grid search scored by R2 with 2-fold cross-validation.
grid_search_xgb = GridSearchCV(
    estimator=XGBRegressor(random_state=42, objective='reg:squarederror'),
    param_grid=param_grid_xgb,
    scoring='r2',
    cv=2,
    n_jobs=-1,
    verbose=1,
)
grid_search_xgb.fit(X_train_scaled, y_train)
best_xgb_model = grid_search_xgb.best_estimator_

# Predict the test split with the best estimator found and report its scores.
y_pred_xgb_grid = best_xgb_model.predict(X_test_scaled)
mse_xgb_grid, r2_xgb_grid = (
    mean_squared_error(y_test, y_pred_xgb_grid),
    r2_score(y_test, y_pred_xgb_grid),
)
print(f"XGBoost - MSE: {mse_xgb_grid:.4f}, R²: {r2_xgb_grid:.4f}")

Fitting 2 folds for each of 4374 candidates, totalling 8748 fits
XGBoost - MSE: 0.1832, R²: 0.7136


Hiperparametreler GridSearchCV ile ayarlanmadan önce de sonra da en iyi modelimiz Gradient Boosting olarak görünmektedir (test R² ≈ 0.77). Lasso modelimizde hiperparametreler ayarlanmadan önce R² değeri yaklaşık 0 çıkmış, hiperparametre ayarından sonra ise 0.49'a yükselmiştir.

Eksik değerlerimiz sadece bağımlı değişkende olduğu için eksik değerleri doldurmaya ihtiyacımız yoktu. Bağımlı değişkeni eksik olan satırları sildik ve modellerimizi kurmadan önce bağımlı değişkene logaritmik dönüşüm (log1p) uyguladık.