In [3]:
# Step 1: Import Required Libraries and Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


In [4]:
# Load the dataset
# The CSV is read directly from its GitHub URL into a DataFrame named `data`.
url = (
    "https://github.com/dsrscientist/Data-Science-ML-Capstone-Projects/raw/master/baseball.csv"
)
data = pd.read_csv(filepath_or_buffer=url)


In [5]:
# Exploratory Data Analysis (EDA)
# Preview the first five rows of the loaded frame.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()
# Perform more exploratory data analysis as necessary


    W    R    AB     H   2B  3B   HR   BB    SO   SB   RA   ER   ERA  CG  SHO  \
0  95  724  5575  1497  300  42  139  383   973  104  641  601  3.73   2    8   
1  83  696  5467  1349  277  44  156  439  1264   70  700  653  4.07   2   12   
2  81  669  5439  1395  303  29  141  533  1157   86  640  584  3.67  11   10   
3  76  622  5533  1381  260  27  136  404  1231   68  701  643  3.98   7    9   
4  74  689  5605  1515  289  49  151  455  1259   83  803  746  4.64   7   12   

   SV    E  
0  56   88  
1  45   86  
2  38   79  
3  37  101  
4  35   86  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W       30 non-null     int64  
 1   R       30 non-null     int64  
 2   AB      30 non-null     int64  
 3   H       30 non-null     int64  
 4   2B      30 non-null     int64  
 5   3B      30 non-null     int64  
 6   HR      30 non-null     int64  

In [6]:

# Step 3: Data Preprocessing and Feature Engineering
# Drop any rows with missing values.  The result is bound to a new name rather
# than mutating `data` in place, so the frame shown by the EDA cell stays as
# loaded and this cell gives the same result no matter how often it is re-run.
data_clean = data.dropna()
# Feature Selection: 'W' is the regression target, every other column is a feature.
X = data_clean.drop(columns=['W'])
y = data_clean['W']
# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Feature Scaling: the scaler's statistics are learned from the training split
# only and then re-used on the test split, so no test information leaks into them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:

# Step 4: Build/Test Multiple Models
# Both estimators are trained on the standardized training features.

# Linear Regression Model (fit() hands back the fitted estimator, so the
# construction and training can be chained).
linear_reg = LinearRegression().fit(X_train_scaled, y_train)

# Random Forest Regression Model, seeded via random_state for repeatable runs.
random_forest_reg = RandomForestRegressor(random_state=42)
random_forest_reg.fit(X=X_train_scaled, y=y_train)


In [8]:
# Step 5: Check for Underfitting/Overfitting
# Score the linear model on the same data it was fitted on; these numbers are
# the training-side half of a train-vs-test comparison.
y_pred_train_linear = linear_reg.predict(X_train_scaled)
r2_train_linear = r2_score(y_true=y_train, y_pred=y_pred_train_linear)
mse_train_linear = mean_squared_error(y_true=y_train, y_pred=y_pred_train_linear)
print("Linear Regression - Training MSE:", mse_train_linear)
print("Linear Regression - Training R^2 Score:", r2_train_linear)



Linear Regression - Training MSE: 2.521469259954142
Linear Regression - Training R^2 Score: 0.974042638444853


In [9]:
# Step 4: Build/Test Multiple Models (Random Forest Regression)
# Random Forest Regression Model
# NOTE: this repeats the Random Forest construction and fit from the earlier
# model-building cell with the same seed and the same training data.
random_forest_reg = RandomForestRegressor(random_state=42)
random_forest_reg.fit(X=X_train_scaled, y=y_train)


In [10]:
 Check for Underfitting/Overfitting
# Evaluate on training set
y_pred_train_rf = random_forest_reg.predict(X_train_scaled)
mse_train_rf = mean_squared_error(y_train, y_pred_train_rf)
r2_train_rf = r2_score(y_train, y_pred_train_rf)
print("Random Forest Regression - Training MSE:", mse_train_rf)
print("Random Forest Regression - Training R^2 Score:", r2_train_rf)


Random Forest Regression - Training MSE: 8.204691666666669
Random Forest Regression - Training R^2 Score: 0.9155364884186445


In [11]:
# Evaluate on testing set
# Held-out performance of the random forest, using the test features scaled
# with the scaler fitted on the training split.
y_pred_test_rf = random_forest_reg.predict(X_test_scaled)
r2_test_rf = r2_score(y_true=y_test, y_pred=y_pred_test_rf)
mse_test_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_test_rf)
print("Random Forest Regression - Testing MSE:", mse_test_rf)
print("Random Forest Regression - Testing R^2 Score:", r2_test_rf)


Random Forest Regression - Testing MSE: 52.0209166666667
Random Forest Regression - Testing R^2 Score: 0.6046542115262822


In [14]:
# Cross-Validation and Performance Metrics
# 5-fold cross-validation of the random forest on the scaled training split.
# The scorer is *negative* MSE (scikit-learn scorers are "higher is better"),
# so the mean is negated again before printing to report a plain MSE.
cv_scores_rf = cross_val_score(
    estimator=random_forest_reg,
    X=X_train_scaled,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
print("Random Forest Regression - Cross-Validation MSE:", -cv_scores_rf.mean())



Random Forest Regression - Cross-Validation MSE: 48.648014


In [19]:
# Select the Best/Final Performing Model
# Refit both candidate models and score each on the held-out test split so the
# two can be compared on identical data.


def _fit_and_score_on_test(model):
    """Fit `model` on the scaled training split and score it on the scaled test split.

    Returns a tuple ``(fitted_model, test_predictions, test_mse, test_r2)``.
    """
    model.fit(X_train_scaled, y_train)
    test_predictions = model.predict(X_test_scaled)
    test_mse = mean_squared_error(y_test, test_predictions)
    test_r2 = r2_score(y_test, test_predictions)
    return model, test_predictions, test_mse, test_r2


# Linear Regression Evaluation
linear_reg, y_pred_linear, mse_linear, r2_linear = _fit_and_score_on_test(
    LinearRegression()
)

# Random Forest Regression Evaluation
random_forest_reg, y_pred_rf, mse_rf, r2_rf = _fit_and_score_on_test(
    RandomForestRegressor(random_state=42)
)

# Print the evaluation metrics
print("Linear Regression - Testing MSE:", mse_linear)
print("Linear Regression - Testing R^2 Score:", r2_linear)
print("Random Forest Regression - Testing MSE:", mse_rf)
print("Random Forest Regression - Testing R^2 Score:", r2_rf)



Linear Regression - Testing MSE: 27.94303250666778
Linear Regression - Testing R^2 Score: 0.7876400316149377
Random Forest Regression - Testing MSE: 52.0209166666667
Random Forest Regression - Testing R^2 Score: 0.6046542115262822


In [23]:
# Compare the metrics and select the best model
# The random forest is chosen only when it wins on BOTH test metrics (lower MSE
# and higher R^2); in every other case, ties included, linear regression is kept.
rf_wins = mse_rf < mse_linear and r2_rf > r2_linear
final_model, verdict = (
    (random_forest_reg, "Random Forest Regression performs better.")
    if rf_wins
    else (linear_reg, "Linear Regression performs better.")
)
print(verdict)



Linear Regression performs better.
