In [119]:
import numpy as np
import pandas as pd

In [120]:
game_logs = pd.read_csv("../data/game_logs_2024.csv")
pitching = pd.read_csv("../data/pitching_stats_2024.csv")
batting = pd.read_csv("../data/team_batting_stats_2024.csv", index_col=False)

In [121]:
# RUN ONLY ONCE
pitching = pitching.rename(columns={'last_name, first_name' : 'Name', 'strikeout' : 'SO','batting_avg' : 'AVG', 'k_percent' : 'K%', 'p_earned_run' : 'ER', 'p_run' : 'R', 'p_era' : 'ERA'})
pitching = pitching.drop('year', axis=1)
pitching['Name'] = pitching['Name'].str.replace(',', '')

batting = batting.rename(columns = {'SO' : 'SO_batting'})

In [122]:
def swap_name(name):
    parts = name.split()
    return parts[1] + " " + parts[0]

pitching['Name'] = pitching['Name'].apply(swap_name)
pitching.head()

Unnamed: 0,Name,player_id,SO,K%,AVG,ER,R,ERA
0,Ronel Blanco,669854,166,24.6,0.19,52,56,2.8
1,Lucas Giolito,608337,204,25.7,0.24,100,110,4.88
2,Kyle Freeland,607536,94,13.9,0.3,87,96,5.03
3,Marcus Stroman,573186,113,16.7,0.277,74,81,4.31
4,Dylan Cease,656302,214,27.3,0.25,90,98,4.58


In [123]:
# Final dataframe that contains game log features, pitching features of the starting pitcher, and the team's batting features
## Feature Selection ##
# Game Logs : [‘home_name’, ‘away_name’, ‘home_probable_pitcher’, ‘away_probable_pitcher’, ‘away_score’, ‘home_score’]
# Pitching Stats : ['ERA', 'ER', 'R', 'SO', 'K%', 'AVG']
# Batting Stats :[‘R/G’, ‘OBP’, ‘SLG’, ‘HR’, ‘BB’, ‘SO’]

game_logs = game_logs[['home_name', 'away_name', 'home_probable_pitcher','away_probable_pitcher', 'away_score', 'home_score']]
pitching_features = pitching[['Name', 'ERA', 'ER', 'R', 'SO', 'K%', 'AVG']]
batting_features = batting[['Tm', 'OBP', 'SLG', 'HR', 'R/G', 'BB', 'SO_batting', 'IBB']]

#Merge Home Pitcher
pitching_home = pitching_features.copy()
pitching_home.columns = ['home_' + col if col != 'Name' else col for col in pitching_home.columns]
game_logs = game_logs.merge(
    pitching_home,
    left_on = 'home_probable_pitcher',
    right_on = 'Name',
    how = 'left',
).drop(columns=['Name'])

#Merge Away Pitcher
pitching_away = pitching_features.copy()
pitching_away.columns = ['away_' + col if col != 'Name' else col for col in pitching_away.columns]
game_logs = game_logs.merge(
    pitching_away,
    left_on = 'away_probable_pitcher',
    right_on = 'Name',
    how = 'left',
).drop(columns=['Name'])

#Merge Home Batters
batting_home = batting_features.copy()
batting_home.columns = ['home_' + col if col != 'Tm' else col for col in batting_home.columns]
game_logs = game_logs.merge(
    batting_home,
    left_on = 'home_name',
    right_on = 'Tm',
    how = 'left',
).drop(columns=['Tm'])

#Merge Away Batters
batting_away = batting_features.copy()
batting_away.columns = ['away_' + col if col != 'Tm' else col for col in batting_home.columns]
game_logs = game_logs.merge(
    batting_away,
    left_on = 'away_name',
    right_on = 'Tm',
    how = 'left',
).drop(columns=['Tm'])


In [124]:
game_logs.head().transpose()

Unnamed: 0,0,1,2,3,4
home_name,San Diego Padres,Boston Red Sox,Texas Rangers,Chicago Cubs,Los Angeles Dodgers
away_name,Los Angeles Dodgers,Northeastern Huskies,Kansas City Royals,Chicago White Sox,San Diego Padres
home_probable_pitcher,Joe Musgrove,Helcris Olivárez,Dane Dunning,Jordan Wicks,Landon Knack
away_probable_pitcher,Gavin Stone,Aiven Cabral,Daniel Lynch IV,Jesse Chavez,Jhony Brito
away_score,14,2,4,1,1
home_score,1,7,5,8,4
home_ERA,,,3.7,,
home_ER,,,71.0,,
home_R,,,73.0,,
home_SO,,,140.0,,


In [125]:
len(game_logs)

5252

In [126]:
game_logs.isnull().sum()

home_name                   0
away_name                   0
home_probable_pitcher      14
away_probable_pitcher       3
away_score                  0
home_score                  0
home_ERA                 1281
home_ER                  1281
home_R                   1281
home_SO                  1281
home_K%                  1281
home_AVG                 1281
away_ERA                 1350
away_ER                  1350
away_R                   1350
away_SO                  1350
away_K%                  1350
away_AVG                 1350
home_OBP                    9
home_SLG                    9
home_HR                     9
home_R/G                    9
home_BB                     9
home_SO_batting             9
home_IBB                    9
away_home_OBP              11
away_home_SLG              11
away_home_HR               11
away_home_R/G              11
away_home_BB               11
away_home_SO_batting       11
away_home_IBB              11
dtype: int64

In [127]:
game_logs.to_csv("../data/final_game_logs.csv")