#### Cleaning Data and Separating into the Different Frames

In [68]:
# pandas
import pandas as pd

In [76]:
# Load both source tables from CSV files in the working directory.
# `fg` is keyed by a 'Season' column and `stc` by a 'year' column (see the
# missing-value report that follows).
fg, stc = (pd.read_csv(csv_path) for csv_path in ('fg_00_24.csv', 'stc_15_24.csv'))

Find the data cutoffs by year (the latest year in which each column is still missing) to help decide how to separate the data.

In [77]:
def find_latest_missing_year(df, year_col_name):
    """Summarise where each column of ``df`` has missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; every column is examined, including the year column.
    year_col_name : str
        Name of the column in ``df`` that holds the year of each row.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Both Series are indexed by column name, in ``df.columns`` order.
        The first holds the largest ``year_col_name`` value among the rows
        where that column is null (``None`` when the column has no nulls).
        The second holds the number of null entries in that column.
    """
    latest_by_column = {}
    nulls_by_column = {}
    for column in df.columns:
        # Compute the null mask once and reuse it for both summaries.
        null_mask = df[column].isnull()
        nulls_by_column[column] = null_mask.sum()
        if null_mask.any():
            latest_by_column[column] = df.loc[null_mask, year_col_name].max()
        else:
            # No missing values in this column.
            latest_by_column[column] = None
    return pd.Series(latest_by_column), pd.Series(nulls_by_column)

# Build the missing-value summaries for both frames; each frame names its
# year column differently ('Season' in fg, 'year' in stc).
fg_missing_years, fg_missing_counts = find_latest_missing_year(fg, 'Season')
stc_missing_years, stc_missing_counts = find_latest_missing_year(stc, 'year')

# Print each summary under its own heading.
for heading, summary in (
    ("FG Latest Missing Years:\n", fg_missing_years),
    ("\nFG Missing Value Counts:\n", fg_missing_counts),
    ("\nSTC Latest Missing Years:\n", stc_missing_years),
    ("\nSTC Missing Value Counts:\n", stc_missing_counts),
):
    print(heading, summary)

FG Latest Missing Years:
 Season           NaN
Name             NaN
Team             NaN
PA               NaN
Age              NaN
BB%              NaN
K%               NaN
BABIP            NaN
wOBA             NaN
xwOBA         2024.0
wRC+             NaN
BsR              NaN
Off              NaN
Def              NaN
WAR              NaN
Barrel%       2014.0
maxEV         2014.0
HardHit%      2014.0
xSLG          2024.0
BsR.1            NaN
O-Swing%      2001.0
Z-Swing%      2001.0
O-Contact%    2007.0
Z-Contact%    2001.0
Contact%      2001.0
SwStr%        2001.0
CSW%          2001.0
NameASCII        NaN
PlayerId         NaN
MLBAMID          NaN
dtype: float64

FG Missing Value Counts:
 Season            0
Name              0
Team              0
PA                0
Age               0
BB%               0
K%                0
BABIP             0
wOBA              0
xwOBA         14863
wRC+              0
BsR               0
Off               0
Def               0
WAR               0
Ba

basic cleaning and merge

In [71]:
# Remove fg columns that are not carried forward into the merged frames.
# Reassign rather than mutate in place, and tolerate columns that are already
# gone, so re-running this cell on an already-cleaned `fg` does not raise
# KeyError (the original in-place drop failed on a second run).
fg = fg.drop(
    columns=['xwOBA', 'wOBA', 'NameASCII', 'PlayerId', 'MLBAMID', 'xSLG', 'BsR.1'],
    errors='ignore',
)

In [72]:
# fix name cols so able to merge
fg['Name'] = fg['Name'].str.lower()
stc = stc.rename(columns={'last_name, first_name': 'Name', 'year': 'Season'}) # rename cols so easier
stc['Name'] = stc['Name'].str.lower()
stc['Name'] = stc['Name'].str.split(', ').str[::-1].str.join(' ')
stc.head()

Unnamed: 0,Name,player_id,Season,pa,k_percent,bb_percent,woba,xwoba,sweet_spot_percent,barrel_batted_rate,hard_hit_percent,avg_best_speed,avg_hyper_speed,whiff_percent,swing_percent
0,bartolo colon,112526,2015,64,37.5,0.0,0.136,0.153,23.1,0.0,15.4,93.759829,91.247927,30.6,58.1
1,torii hunter,116338,2015,567,18.5,6.2,0.304,0.29,28.5,5.0,34.9,98.563404,93.39348,23.1,53.4
2,david ortiz,120074,2015,614,15.5,12.5,0.379,0.42,34.8,13.1,49.1,102.851133,96.053058,23.2,44.7
3,alex rodriguez,121347,2015,620,23.4,13.5,0.361,0.368,31.4,10.9,43.9,101.381141,95.01438,32.0,43.9
4,aramis ramirez,133380,2015,516,13.2,6.0,0.309,0.304,33.5,5.6,34.5,97.851256,92.944763,17.9,52.9


In [73]:
import unicodedata
import re

def clean_name(name):
    """Reduce a name to lowercase ASCII letters and whitespace.

    Accented characters are replaced by their unaccented base letter, the
    result is lowercased, and every character other than ``a``-``z`` or
    whitespace (digits, punctuation, hyphens, ...) is removed.

    Parameters
    ----------
    name : str
        The name to clean.

    Returns
    -------
    str
        The cleaned name. Whitespace is left as-is (not collapsed or stripped).
    """
    # NFKD splits an accented character into its base letter plus a combining
    # mark; encoding to ASCII with errors ignored then discards the marks.
    ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
    # Lowercase before filtering: the pattern below keeps only a-z, so an
    # uppercase letter would otherwise be silently deleted from the name.
    ascii_name = ascii_name.lower()
    # Drop everything that is not a lowercase letter or whitespace.
    return re.sub(r'[^a-z\s]', '', ascii_name)

# Run the same name cleaner over both frames so their 'Name' keys are
# normalised identically before merging.
for frame in (fg, stc):
    frame['Name'] = frame['Name'].apply(clean_name)

Create the data sets: restrict `fg` to seasons from 2015 onward and merge it with `stc` on `Name` and `Season`.

In [74]:
# Restrict fg to seasons from 2015 onward, then outer-join it with stc so that
# rows present in only one of the two frames are kept as well.
# NOTE(review): (Name, Season) is not guaranteed to be unique per player;
# two players sharing a cleaned name in one season would produce duplicated
# rows here -- confirm against the data.
merge_keys = ['Name', 'Season']
fg15 = fg.loc[fg['Season'] >= 2015]
full_15 = fg15.merge(stc, on=merge_keys, how='outer')

Clean to account for pitchers: drop the merged rows that have no `Team` value, i.e. players that appear only in the `stc` data.

In [78]:
# 'Team' comes from the fg side of the merge, so a missing Team marks a row
# that had no matching fg record; keep only the rows that do have one.
has_team = full_15['Team'].notna()
full_15 = full_15[has_team]