<a href="https://colab.research.google.com/github/codenameglen/2017_NFL_DK/blob/master/MLBHomePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pybaseball
!pip install requests


Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected

In [3]:
import pandas as pd
import numpy as np
import requests
from pybaseball import batting_stats, pitching_stats
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [4]:
# Load DraftKings game data
game_data = pd.read_csv("DKSalaries5.csv")

# Display sample data
game_data.head()


Unnamed: 0,Position,Name + ID,batter,ID,Roster Position,Game Info,TeamAbbrev,AvgPointsPerGame,pitcher
0,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander
1,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander
2,IF,Blaze Alexander (38848532),Blaze Alexander,38848532,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander
3,OF,Corbin Carroll (38848225),Corbin Carroll,38848225,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.28,Justin Verlander
4,IF,Eugenio Suarez (38848274),Eugenio Suarez,38848274,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.3,Justin Verlander


In [6]:
# Load Baseball Savant park factors CSV
park_factors = pd.read_csv("park_factors.csv")

# Check available columns
print(park_factors.columns)

# Merge park factors with game data based on stadium names
def get_park_factors(stadium, park_factors_df):
    """Retrieve HR Park Factor for each stadium"""
    stadium_data = park_factors_df[park_factors_df['Venue'] == stadium]
    return stadium_data['HR_factor'].values[0] if not stadium_data.empty else 1.0  # Default neutral factor

# Apply park factor to game data
game_data["HR_ParkFactor"] = game_data["Game Info"].apply(lambda x: get_park_factors(x.split(" ")[0], park_factors))

# Display updated dataset with park factors
game_data.head()


Index(['Rk.', 'Team', 'Venue', 'Year', 'Park Factor', 'wOBACon', 'xwOBACon',
       'BACON', 'xBACON', 'HardHit', 'R', 'OBP', 'H', '1B', '2B', '3B', 'HR',
       'BB', 'SO', 'PA'],
      dtype='object')


Unnamed: 0,Position,Name + ID,batter,ID,Roster Position,Game Info,TeamAbbrev,AvgPointsPerGame,pitcher,HR_ParkFactor
0,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0
1,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0
2,IF,Blaze Alexander (38848532),Blaze Alexander,38848532,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0
3,OF,Corbin Carroll (38848225),Corbin Carroll,38848225,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.28,Justin Verlander,1.0
4,IF,Eugenio Suarez (38848274),Eugenio Suarez,38848274,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.3,Justin Verlander,1.0


In [7]:
def get_pitcher_stats():
    """Retrieve advanced pitcher stats for HR prediction from 2023-2025"""
    try:
        stats_2023 = pitching_stats(2023)
        stats_2024 = pitching_stats(2024)
        stats_2025 = pitching_stats(2025)
        pitcher_data = pd.concat([stats_2023, stats_2024, stats_2025])

        # Select relevant columns
        pitcher_data = pitcher_data[['Name', 'HR/9', 'xFIP', 'SwStr%', 'GB%', 'Hard%']]
        return pitcher_data
    except Exception as e:
        print(f"Error fetching pitcher stats: {e}")
        return None

# Fetch pitcher stats from 2023-2025
pitcher_stats = get_pitcher_stats()

# Display sample pitcher stats
pitcher_stats.head()


Unnamed: 0,Name,HR/9,xFIP,SwStr%,GB%,Hard%
19,Zack Wheeler,0.94,3.54,0.133,0.412,0.313
26,Spencer Strider,1.06,2.92,0.189,0.341,0.354
1,Gerrit Cole,0.86,3.6,0.117,0.396,0.311
2,Sonny Gray,0.39,3.64,0.114,0.473,0.3
15,Zac Gallen,0.94,3.49,0.112,0.418,0.401


In [8]:
def get_batter_stats():
    """Retrieve advanced batter stats for HR prediction from 2023-2025"""
    try:
        stats_2023 = batting_stats(2023)
        stats_2024 = batting_stats(2024)
        stats_2025 = batting_stats(2025)
        batter_data = pd.concat([stats_2023, stats_2024, stats_2025])

        # Select relevant columns
        batter_data = batter_data[['Name', 'HR', 'SLG', 'ISO', 'FB%', 'HR/FB', 'Barrel%']]
        return batter_data
    except Exception as e:
        print(f"Error fetching batter stats: {e}")
        return None

# Fetch batter stats from 2023-2025
batter_stats = get_batter_stats()

# Display sample batter stats
batter_stats.head()


Unnamed: 0,Name,HR,SLG,ISO,FB%,HR/FB,Barrel%
1,Ronald Acuna Jr.,41,0.596,0.258,0.304,0.24,0.153
5,Freddie Freeman,29,0.567,0.235,0.37,0.15,0.111
3,Mookie Betts,39,0.579,0.272,0.485,0.167,0.124
4,Matt Olson,54,0.604,0.321,0.436,0.278,0.164
0,Shohei Ohtani,44,0.654,0.35,0.395,0.312,0.193


In [9]:
merged_data = game_data.merge(batter_stats, left_on="batter", right_on="Name", how="left")
merged_data = merged_data.merge(pitcher_stats, left_on="pitcher", right_on="Name", how="left")

merged_data.head()

Unnamed: 0,Position,Name + ID,batter,ID,Roster Position,Game Info,TeamAbbrev,AvgPointsPerGame,pitcher,HR_ParkFactor,...,ISO,FB%,HR/FB,Barrel%,Name_y,HR/9,xFIP,SwStr%,GB%,Hard%
0,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,,,,,Justin Verlander,1.0,4.56,0.099,0.355,0.307
1,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,,,,,Justin Verlander,1.29,4.45,0.124,0.378,0.323
2,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,,,,,Justin Verlander,1.0,4.56,0.099,0.355,0.307
3,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,,,,,Justin Verlander,1.29,4.45,0.124,0.378,0.323
4,IF,Blaze Alexander (38848532),Blaze Alexander,38848532,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,,,,,Justin Verlander,1.0,4.56,0.099,0.355,0.307


In [10]:
WEATHER_API_KEY = "d199541c5ccea23293014fc9ae0bc843"

def get_weather(stadium):
    """Fetch real-time weather data for the stadium"""
    try:
        url = f"http://api.openweathermap.org/data/2.5/weather?q={stadium}&appid={WEATHER_API_KEY}&units=imperial"
        response = requests.get(url)
        data = response.json()

        return {
            "temp": data["main"]["temp"],
            "humidity": data["main"]["humidity"],
            "wind_speed": data["wind"]["speed"],
            "wind_direction": data["wind"]["deg"]
        }
    except:
        return {"temp": 70, "humidity": 50, "wind_speed": 5, "wind_direction": 180}  # Default values

weather_adjustments = []
for _, row in merged_data.iterrows():
    stadium = row["Game Info"].split(" ")[0]
    weather = get_weather(stadium)
    weather_adjustments.append(weather)

weather_df = pd.DataFrame(weather_adjustments)
merged_data = pd.concat([merged_data, weather_df], axis=1)

merged_data.head()


Unnamed: 0,Position,Name + ID,batter,ID,Roster Position,Game Info,TeamAbbrev,AvgPointsPerGame,pitcher,HR_ParkFactor,...,Name_y,HR/9,xFIP,SwStr%,GB%,Hard%,temp,humidity,wind_speed,wind_direction
0,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,Justin Verlander,1.0,4.56,0.099,0.355,0.307,70,50,5,180
1,IF,Adrian Del Castillo (38848526),Adrian Del Castillo,38848526,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,Justin Verlander,1.29,4.45,0.124,0.378,0.323,70,50,5,180
2,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,Justin Verlander,1.0,4.56,0.099,0.355,0.307,70,50,5,180
3,OF,Alek Thomas (38848351),Alek Thomas,38848351,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,Justin Verlander,1.29,4.45,0.124,0.378,0.323,70,50,5,180
4,IF,Blaze Alexander (38848532),Blaze Alexander,38848532,UTIL,ARI@SF 05/12/2025 09:45PM ET,ARI,0.0,Justin Verlander,1.0,...,Justin Verlander,1.0,4.56,0.099,0.355,0.307,70,50,5,180


In [11]:
# Select predictive features
features = [
    "HR_ParkFactor", "HR/9", "xFIP", "SwStr%", "GB%", "Hard%",
    "SLG", "ISO", "FB%", "HR/FB", "Barrel%",
    "temp", "humidity", "wind_speed"
]
target = "HR"

# Drop missing values from the original merged_data DataFrame
merged_data.dropna(subset=features + [target], inplace=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    merged_data[features], merged_data[target], test_size=0.2, random_state=42
)

# Train Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluate Model
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Model MSE: {mse:.4f}")

# Apply Model to Today's Games
# Now merged_data is cleaned, so prediction will work
merged_data["Predicted_HR_Probability"] = lr_model.predict(merged_data[features])

# Display Top HR Candidates
top_hr_candidates = merged_data.sort_values("Predicted_HR_Probability", ascending=False).head(10)
top_hr_candidates[["Name + ID", "TeamAbbrev", "Predicted_HR_Probability", "pitcher", "Game Info"]]

Model MSE: 63.4734


Unnamed: 0,Name + ID,TeamAbbrev,Predicted_HR_Probability,pitcher,Game Info
347,Aaron Judge (38848218),NYY,43.225607,Paul Skenes,NYY@SEA 05/12/2025 09:40PM ET
348,Aaron Judge (38848218),NYY,41.906583,Paul Skenes,NYY@SEA 05/12/2025 09:40PM ET
89,Matt Olson (38848248),ATL,41.425457,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
88,Matt Olson (38848248),ATL,41.374768,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
83,Marcell Ozuna (38848263),ATL,34.590551,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
82,Marcell Ozuna (38848263),ATL,34.539861,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
10,Corbin Carroll (38848225),ARI,33.940615,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET
341,Pete Alonso (38848226),NYM,33.695055,Paul Skenes,PIT@NYM 05/12/2025 07:10PM ET
343,Pete Alonso (38848226),NYM,32.475943,Paul Skenes,PIT@NYM 05/12/2025 07:10PM ET
16,Eugenio Suarez (38848274),ARI,32.461686,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET


In [12]:
# Calculate rolling averages over the past seasons
merged_data["HR/9_rolling"] = merged_data.groupby("pitcher")["HR/9"].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
merged_data["Barrel%_rolling"] = merged_data.groupby("batter")["Barrel%"].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
merged_data["FB%_rolling"] = merged_data.groupby("batter")["FB%"].transform(lambda x: x.rolling(window=3, min_periods=1).mean())
merged_data["ParkFactor_rolling"] = merged_data.groupby("Game Info")["HR_ParkFactor"].transform(lambda x: x.rolling(window=3, min_periods=1).mean())


In [13]:
# Assign adjusted weights based on historical importance
feature_weights = {
    "HR/9_rolling": 0.35,
    "Barrel%_rolling": 0.25,
    "FB%_rolling": 0.15,
    "ParkFactor_rolling": 0.15,
    "ISO": 0.1,
    "SLG": 0.1,
    "temp": 0.05,
    "humidity": 0.05,
    "wind_speed": 0.05,
}

# Scale features by their respective weights
for feature, weight in feature_weights.items():
    merged_data[feature] = merged_data[feature] * weight


In [15]:
# Select weighted features for training
adjusted_features = list(feature_weights.keys())
X_train, X_test, y_train, y_test = train_test_split(
    merged_data[adjusted_features], merged_data["HR"], test_size=0.2, random_state=42
)

# Train new linear regression model
lr_model_adjusted = LinearRegression()
lr_model_adjusted.fit(X_train, y_train)

# Evaluate model
y_pred_adjusted = lr_model_adjusted.predict(X_test)
adjusted_mse = mean_squared_error(y_test, y_pred_adjusted)

print(f"Adjusted Model MSE: {adjusted_mse:.4f}")


Adjusted Model MSE: 65.9561


In [16]:
merged_data["Adjusted_HR_Probability"] = lr_model_adjusted.predict(merged_data[adjusted_features])

# Display top HR candidates using adjusted model
top_hr_candidates_adjusted = merged_data.sort_values("Adjusted_HR_Probability", ascending=False).head(10)
top_hr_candidates_adjusted[["Name + ID", "TeamAbbrev", "Adjusted_HR_Probability", "pitcher", "Game Info"]]


Unnamed: 0,Name + ID,TeamAbbrev,Adjusted_HR_Probability,pitcher,Game Info
347,Aaron Judge (38848218),NYY,43.655107,Paul Skenes,NYY@SEA 05/12/2025 09:40PM ET
348,Aaron Judge (38848218),NYY,42.397779,Paul Skenes,NYY@SEA 05/12/2025 09:40PM ET
89,Matt Olson (38848248),ATL,38.972181,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
88,Matt Olson (38848248),ATL,38.665105,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
10,Corbin Carroll (38848225),ARI,35.224992,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET
11,Corbin Carroll (38848225),ARI,34.87892,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET
16,Eugenio Suarez (38848274),ARI,33.718167,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET
17,Eugenio Suarez (38848274),ARI,33.455452,Justin Verlander,ARI@SF 05/12/2025 09:45PM ET
83,Marcell Ozuna (38848263),ATL,33.369237,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
82,Marcell Ozuna (38848263),ATL,33.062161,Jake Irvin,WSH@ATL 05/12/2025 07:15PM ET
